diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..9b3a9b9
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,35 @@
+# CHANGELOG
+
+All notable changes to the `dae-cpp` project will be documented in this file.
+
+## [`develop` branch -- 2.3.0]
+
+### Added
+
+- Option `linear_system_scaling` (can be `true` or `false`) to rescale linear system matrix for better convergence (`false` by default).
+- Linear system matrix (row) scaling. Matrix scaling can improve stability of the linear solver in some cases but comes with a slight performance penalty.
+- Linear system matrix pruning when scaling is enabled.
+
+### Fixed
+
+- The solver cannot reach relative tolerance `rtol` in some cases and stops with the "solution diverged" error (fixed by updating internal tolerances).
+- Incompatibility of `autodiff` library with the latest version of Eigen (fixed by altering `autodiff`).
+- Typo in `daecpp::solver_command::stop_integration` enum (`stop_intergration` -> `stop_integration`).
+
+### Changed
+
+- Updated Eigen to version 5.0.0.
+- Renamed `TESTING` macro definition to `DAECPP_TESTING` to avoid potential clash.
+- The linear system matrix pattern is now analysed only once, at the first iteration.
+- Pre-allocate vector of `dual` numbers in `JacobianMatrixShape` class to improve performance of the Jacobian computed from the user-defined shape.
+- Updated internal tolerances used in the solver for the convergence check against relative tolerance `rtol`.
+
+### Removed
+
+- Conversion from Eigen to dae-cpp matrix format and back in automatic Jacobian class (an attempt to speed it up).
+
+## [2.2.0]
+
+Current stable version.
+
+[CHANGELOG](https://dae-cpp.github.io/CHANGELOG.html)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b1ecf0d..0d21121 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,18 +7,16 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
 
 ######## Build examples ########
 
-set(EXAMPLE_LIST "quick_start" "simple_dae" "perovskite_model" "flame_propagation" "jacobian_shape" "jacobian_compare")
-
 include_directories(${PROJECT_SOURCE_DIR})
 
-foreach(EXAMPLE_NAME ${EXAMPLE_LIST})
-
-  FILE(GLOB SOURCES ${PROJECT_SOURCE_DIR}/examples/${EXAMPLE_NAME}/*.cpp)
-
-  add_executable(${EXAMPLE_NAME} ${SOURCES})
-
-  install(TARGETS ${EXAMPLE_NAME} DESTINATION bin)
+file(GLOB EXAMPLE_DIRS RELATIVE ${PROJECT_SOURCE_DIR}/examples ${PROJECT_SOURCE_DIR}/examples/*)
 
+foreach(EXAMPLE_NAME ${EXAMPLE_DIRS})
+  file(GLOB SOURCES "${PROJECT_SOURCE_DIR}/examples/${EXAMPLE_NAME}/*.cpp")  # an example is every .cpp file in its own directory
+  if(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/examples/${EXAMPLE_NAME}" AND SOURCES)  # skip plain files and directories without .cpp sources (add_executable needs at least one)
+    add_executable(${EXAMPLE_NAME} ${SOURCES})
+    install(TARGETS ${EXAMPLE_NAME} DESTINATION bin)
+  endif()
 endforeach()
 
 ######## Build tests ########
@@ -28,7 +26,7 @@ set(PROJECT_TEST "dae-cpp-test")
 include(CTest)
 enable_testing()
 
-FILE(GLOB SOURCES_TEST ${PROJECT_SOURCE_DIR}/tests/test_*.cpp)
+file(GLOB SOURCES_TEST ${PROJECT_SOURCE_DIR}/tests/test_*.cpp)
 
 add_executable(${PROJECT_TEST} ${SOURCES_TEST})
 
@@ -44,7 +42,7 @@ include_directories(${PROJECT_SOURCE_DIR}/googletest/googletest/include)
 
 target_link_libraries(${PROJECT_TEST} gtest_main gtest)
 
-target_compile_definitions(${PROJECT_TEST} PRIVATE "TESTING")
+target_compile_definitions(${PROJECT_TEST} PRIVATE "DAECPP_TESTING")
 
 include(GoogleTest)
 gtest_discover_tests(${PROJECT_TEST})
diff --git a/Eigen/COPYING.APACHE b/Eigen/COPYING.APACHE
deleted file mode 100644
index 61e948d..0000000
--- a/Eigen/COPYING.APACHE
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-*/
\ No newline at end of file
diff --git a/Eigen/COPYING.BSD b/Eigen/COPYING.BSD
deleted file mode 100644
index 8964ddf..0000000
--- a/Eigen/COPYING.BSD
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
diff --git a/Eigen/COPYING.MINPACK b/Eigen/COPYING.MINPACK
deleted file mode 100644
index 132cc3f..0000000
--- a/Eigen/COPYING.MINPACK
+++ /dev/null
@@ -1,51 +0,0 @@
-Minpack Copyright Notice (1999) University of Chicago.  All rights reserved
-
-Redistribution and use in source and binary forms, with or
-without modification, are permitted provided that the
-following conditions are met:
-
-1. Redistributions of source code must retain the above
-copyright notice, this list of conditions and the following
-disclaimer.
-
-2. Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials
-provided with the distribution.
-
-3. The end-user documentation included with the
-redistribution, if any, must include the following
-acknowledgment:
-
-   "This product includes software developed by the
-   University of Chicago, as Operator of Argonne National
-   Laboratory.
-
-Alternately, this acknowledgment may appear in the software
-itself, if and wherever such third-party acknowledgments
-normally appear.
-
-4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
-WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
-UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
-THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
-OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
-OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
-USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
-THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
-DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
-UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
-BE CORRECTED.
-
-5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
-HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
-ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
-INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
-ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
-PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
-SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
-(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
-EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
-POSSIBILITY OF SUCH LOSS OR DAMAGES.
diff --git a/Eigen/COPYING.README b/Eigen/COPYING.README
deleted file mode 100644
index 11af93c..0000000
--- a/Eigen/COPYING.README
+++ /dev/null
@@ -1,6 +0,0 @@
-Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links:
-  http://www.mozilla.org/MPL/2.0/
-  http://www.mozilla.org/MPL/2.0/FAQ.html
-
-Some files contain third-party code under BSD or other MPL2-compatible licenses,
-whence the other COPYING.* files here.
\ No newline at end of file
diff --git a/Eigen/Core b/Eigen/Core
index f9d9974..4ce2ed9 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -11,6 +11,9 @@
 #ifndef EIGEN_CORE_MODULE_H
 #define EIGEN_CORE_MODULE_H
 
+// Eigen version information.
+#include "Version"
+
 // first thing Eigen does: stop the compiler from reporting useless warnings.
 #include "src/Core/util/DisableStupidWarnings.h"
 
@@ -92,11 +95,22 @@
 #include <algorithm>
 
 #include <array>
+#include <memory>
 #include <vector>
 
 // for std::is_nothrow_move_assignable
 #include <type_traits>
 
+// for std::this_thread::yield().
+#if !defined(EIGEN_USE_BLAS) && (defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL))
+#include <thread>
+#endif
+
+// for std::bit_cast()
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+#include <bit>
+#endif
+
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
@@ -116,9 +130,8 @@
 #undef isfinite
 #include <CL/sycl.hpp>
 #include <map>
-#include <memory>
-#include <utility>
 #include <thread>
+#include <utility>
 #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
 #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
 #endif
@@ -178,6 +191,7 @@ using std::ptrdiff_t;
 
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
+#include "src/Core/RandomImpl.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
 #include "src/Core/arch/Default/ConjHelper.h"
@@ -187,38 +201,52 @@ using std::ptrdiff_t;
 #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
 
 #if defined EIGEN_VECTORIZE_AVX512
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
+#include "src/Core/arch/AVX/PacketMath.h"
+#include "src/Core/arch/AVX/Reductions.h"
+#include "src/Core/arch/AVX512/PacketMath.h"
+#include "src/Core/arch/AVX512/Reductions.h"
 #if defined EIGEN_VECTORIZE_AVX512FP16
 #include "src/Core/arch/AVX512/PacketMathFP16.h"
 #endif
-#include "src/Core/arch/SSE/PacketMath.h"
 #include "src/Core/arch/SSE/TypeCasting.h"
-#include "src/Core/arch/SSE/Complex.h"
-#include "src/Core/arch/AVX/PacketMath.h"
 #include "src/Core/arch/AVX/TypeCasting.h"
-#include "src/Core/arch/AVX/Complex.h"
-#include "src/Core/arch/AVX512/PacketMath.h"
 #include "src/Core/arch/AVX512/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/TypeCastingFP16.h"
+#endif
+#include "src/Core/arch/SSE/Complex.h"
+#include "src/Core/arch/AVX/Complex.h"
 #include "src/Core/arch/AVX512/Complex.h"
 #include "src/Core/arch/SSE/MathFunctions.h"
 #include "src/Core/arch/AVX/MathFunctions.h"
 #include "src/Core/arch/AVX512/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/MathFunctionsFP16.h"
+#endif
 #include "src/Core/arch/AVX512/TrsmKernel.h"
 #elif defined EIGEN_VECTORIZE_AVX
-   // Use AVX for floats and doubles, SSE for integers
+// Use AVX for floats and doubles, SSE for integers
 #include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
 #include "src/Core/arch/SSE/TypeCasting.h"
 #include "src/Core/arch/SSE/Complex.h"
 #include "src/Core/arch/AVX/PacketMath.h"
+#include "src/Core/arch/AVX/Reductions.h"
 #include "src/Core/arch/AVX/TypeCasting.h"
 #include "src/Core/arch/AVX/Complex.h"
 #include "src/Core/arch/SSE/MathFunctions.h"
 #include "src/Core/arch/AVX/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_SSE
 #include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
 #include "src/Core/arch/SSE/TypeCasting.h"
 #include "src/Core/arch/SSE/MathFunctions.h"
 #include "src/Core/arch/SSE/Complex.h"
-#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+#endif
+
+#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
 #include "src/Core/arch/AltiVec/PacketMath.h"
 #include "src/Core/arch/AltiVec/TypeCasting.h"
 #include "src/Core/arch/AltiVec/MathFunctions.h"
@@ -228,6 +256,11 @@ using std::ptrdiff_t;
 #include "src/Core/arch/NEON/TypeCasting.h"
 #include "src/Core/arch/NEON/MathFunctions.h"
 #include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_LSX
+#include "src/Core/arch/LSX/PacketMath.h"
+#include "src/Core/arch/LSX/TypeCasting.h"
+#include "src/Core/arch/LSX/MathFunctions.h"
+#include "src/Core/arch/LSX/Complex.h"
 #elif defined EIGEN_VECTORIZE_SVE
 #include "src/Core/arch/SVE/PacketMath.h"
 #include "src/Core/arch/SVE/TypeCasting.h"
@@ -294,11 +327,8 @@ using std::ptrdiff_t;
 #include "src/Core/Product.h"
 #include "src/Core/CoreEvaluators.h"
 #include "src/Core/AssignEvaluator.h"
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN  // work around Doxygen bug triggered by Assign.h r814874
-                                 // at least confirmed with Doxygen 1.5.5 and 1.5.6
+#include "src/Core/RealView.h"
 #include "src/Core/Assign.h"
-#endif
 
 #include "src/Core/ArrayBase.h"
 #include "src/Core/util/BlasUtil.h"
@@ -312,12 +342,14 @@ using std::ptrdiff_t;
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
+#include "src/Core/Fill.h"
 #include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
 #include "src/Core/CwiseUnaryView.h"
 #include "src/Core/SelfCwiseBinaryOp.h"
+#include "src/Core/InnerProduct.h"
 #include "src/Core/Dot.h"
 #include "src/Core/StableNorm.h"
 #include "src/Core/Stride.h"
@@ -335,6 +367,7 @@ using std::ptrdiff_t;
 #include "src/Core/SkewSymmetricMatrix3.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
+#include "src/Core/FindCoeff.h"
 #include "src/Core/Fuzzy.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
@@ -347,6 +380,7 @@ using std::ptrdiff_t;
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
+#include "src/Core/DeviceWrapper.h"
 #ifdef EIGEN_GEMM_THREADPOOL
 #include "ThreadPool"
 #endif
@@ -372,6 +406,8 @@ using std::ptrdiff_t;
 #include "src/Core/arch/AltiVec/MatrixProduct.h"
 #elif defined EIGEN_VECTORIZE_NEON
 #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
+#elif defined EIGEN_VECTORIZE_LSX
+#include "src/Core/arch/LSX/GeneralBlockPanelKernel.h"
 #endif
 
 #if defined(EIGEN_VECTORIZE_AVX512)
diff --git a/Eigen/Geometry b/Eigen/Geometry
index 3334874..efe3e1f 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -22,13 +22,11 @@
  *  - fixed-size homogeneous transformations
  *  - translation, scaling, 2D and 3D rotations
  *  - \link Quaternion quaternions \endlink
- *  - cross products (\ref MatrixBase::cross, \ref MatrixBase::cross3)
- *  - orthognal vector generation (\ref MatrixBase::unitOrthogonal)
- *  - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes
- * \endlink
+ *  - cross products (\ref MatrixBase::cross(), \ref MatrixBase::cross3())
+ *  - orthogonal vector generation (MatrixBase::unitOrthogonal)
+ *  - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes \endlink
  *  - \link AlignedBox axis aligned bounding boxes \endlink
- *  - \link umeyama least-square transformation fitting \endlink
- *
+ *  - \link umeyama() least-square transformation fitting \endlink
  * \code
  * #include <Eigen/Geometry>
  * \endcode
diff --git a/Eigen/COPYING.MPL2 b/Eigen/LICENSE
similarity index 99%
rename from Eigen/COPYING.MPL2
rename to Eigen/LICENSE
index ee6256c..a612ad9 100644
--- a/Eigen/COPYING.MPL2
+++ b/Eigen/LICENSE
@@ -35,7 +35,7 @@ Mozilla Public License Version 2.0
     means any form of the work other than Source Code Form.
 
 1.7. "Larger Work"
-    means a work that combines Covered Software with other material, in 
+    means a work that combines Covered Software with other material, in
     a separate file or files, that is not Covered Software.
 
 1.8. "License"
@@ -357,7 +357,7 @@ Exhibit A - Source Code Form License Notice
 
   This Source Code Form is subject to the terms of the Mozilla Public
   License, v. 2.0. If a copy of the MPL was not distributed with this
-  file, You can obtain one at https://mozilla.org/MPL/2.0/.
+  file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 If it is not possible or desirable to put the notice in a particular
 file, then You may include the notice in a location (such as a LICENSE
diff --git a/Eigen/OrderingMethods b/Eigen/OrderingMethods
index 921b8a0..0167419 100644
--- a/Eigen/OrderingMethods
+++ b/Eigen/OrderingMethods
@@ -54,7 +54,7 @@
  * \note Some of these methods (like AMD or METIS), need the sparsity pattern
  * of the input matrix to be symmetric. When the matrix is structurally unsymmetric,
  * Eigen computes internally the pattern of \f$A^T*A\f$ before calling the method.
- * If your matrix is already symmetric (at leat in structure), you can avoid that
+ * If your matrix is already symmetric (at least in structure), you can avoid that
  * by calling the method with a SelfAdjointView type.
  *
  * \code
diff --git a/Eigen/ThreadPool b/Eigen/ThreadPool
index febb187..39e5d1e 100644
--- a/Eigen/ThreadPool
+++ b/Eigen/ThreadPool
@@ -71,6 +71,8 @@
 #include "src/ThreadPool/ThreadEnvironment.h"
 #include "src/ThreadPool/Barrier.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"
+#include "src/ThreadPool/CoreThreadPoolDevice.h"
+#include "src/ThreadPool/ForkJoin.h"
 // IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/Version b/Eigen/Version
new file mode 100644
index 0000000..91936c2
--- /dev/null
+++ b/Eigen/Version
@@ -0,0 +1,14 @@
+#ifndef EIGEN_VERSION_H
+#define EIGEN_VERSION_H
+
+// The "WORLD" version will forever remain "3" for the "Eigen3" library.
+#define EIGEN_WORLD_VERSION 3
+// As of Eigen3 5.0.0, we have moved to Semantic Versioning (semver.org).
+#define EIGEN_MAJOR_VERSION 5
+#define EIGEN_MINOR_VERSION 0
+#define EIGEN_PATCH_VERSION 0
+#define EIGEN_PRERELEASE_VERSION ""
+#define EIGEN_BUILD_VERSION ""
+#define EIGEN_VERSION_STRING "5.0.0"
+
+#endif  // EIGEN_VERSION_H
diff --git a/Eigen/commit-d26e19714fca2faf544619c3604f88d980e5a207 b/Eigen/commit-549bf8c75b6aae071cde2f28aa48f16ee3ae60b0
similarity index 100%
rename from Eigen/commit-d26e19714fca2faf544619c3604f88d980e5a207
rename to Eigen/commit-549bf8c75b6aae071cde2f28aa48f16ee3ae60b0
diff --git a/Eigen/src/AccelerateSupport/AccelerateSupport.h b/Eigen/src/AccelerateSupport/AccelerateSupport.h
index 09967ff..13a26df 100644
--- a/Eigen/src/AccelerateSupport/AccelerateSupport.h
+++ b/Eigen/src/AccelerateSupport/AccelerateSupport.h
@@ -11,7 +11,7 @@ template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool E
 class AccelerateImpl;
 
 /** \ingroup AccelerateSupport_Module
- * \class AccelerateLLT
+ * \typedef AccelerateLLT
  * \brief A direct Cholesky (LLT) factorization and solver based on Accelerate
  *
  * \warning Only single and double precision real scalar types are supported by Accelerate
@@ -25,7 +25,7 @@ template <typename MatrixType, int UpLo = Lower>
 using AccelerateLLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationCholesky, true>;
 
 /** \ingroup AccelerateSupport_Module
- * \class AccelerateLDLT
+ * \typedef AccelerateLDLT
  * \brief The default Cholesky (LDLT) factorization and solver based on Accelerate
  *
  * \warning Only single and double precision real scalar types are supported by Accelerate
@@ -39,7 +39,7 @@ template <typename MatrixType, int UpLo = Lower>
 using AccelerateLDLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLT, true>;
 
 /** \ingroup AccelerateSupport_Module
- * \class AccelerateLDLTUnpivoted
+ * \typedef AccelerateLDLTUnpivoted
  * \brief A direct Cholesky-like LDL^T factorization and solver based on Accelerate with only 1x1 pivots and no pivoting
  *
  * \warning Only single and double precision real scalar types are supported by Accelerate
@@ -53,7 +53,7 @@ template <typename MatrixType, int UpLo = Lower>
 using AccelerateLDLTUnpivoted = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTUnpivoted, true>;
 
 /** \ingroup AccelerateSupport_Module
- * \class AccelerateLDLTSBK
+ * \typedef AccelerateLDLTSBK
  * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with Supernode Bunch-Kaufman and static
  * pivoting
  *
@@ -68,7 +68,7 @@ template <typename MatrixType, int UpLo = Lower>
 using AccelerateLDLTSBK = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTSBK, true>;
 
 /** \ingroup AccelerateSupport_Module
- * \class AccelerateLDLTTPP
+ * \typedef AccelerateLDLTTPP
  * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with full threshold partial pivoting
  *
  * \warning Only single and double precision real scalar types are supported by Accelerate
@@ -82,7 +82,7 @@ template <typename MatrixType, int UpLo = Lower>
 using AccelerateLDLTTPP = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTTPP, true>;
 
 /** \ingroup AccelerateSupport_Module
- * \class AccelerateQR
+ * \typedef AccelerateQR
  * \brief A QR factorization and solver based on Accelerate
  *
  * \warning Only single and double precision real scalar types are supported by Accelerate
@@ -95,7 +95,7 @@ template <typename MatrixType>
 using AccelerateQR = AccelerateImpl<MatrixType, 0, SparseFactorizationQR, false>;
 
 /** \ingroup AccelerateSupport_Module
- * \class AccelerateCholeskyAtA
+ * \typedef AccelerateCholeskyAtA
  * \brief A QR factorization and solver based on Accelerate without storing Q (equivalent to A^TA = R^T R)
  *
  * \warning Only single and double precision real scalar types are supported by Accelerate
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 5d52ab2..b1d801d 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -230,8 +230,8 @@ class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
    */
   const LDLT& adjoint() const { return *this; }
 
-  EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 01b4476..7fa4fa2 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -182,10 +182,10 @@ class LLT : public SolverBase<LLT<MatrixType_, UpLo_> > {
    * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
    * \code x = decomposition.adjoint().solve(b) \endcode
    */
-  const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; }
+  const LLT& adjoint() const noexcept { return *this; }
 
-  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   template <typename VectorType>
   LLT& rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index e5b46c4..7e3c881 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -425,7 +425,7 @@ class CholmodBase : public SparseSolverBase<Derived> {
     RealScalar logDet = 0;
     Scalar* x = static_cast<Scalar*>(m_cholmodFactor->x);
     if (m_cholmodFactor->is_super) {
-      // Supernodal factorization stored as a packed list of dense column-major blocs,
+      // Supernodal factorization stored as a packed list of dense column-major blocks,
       // as described by the following structure:
 
       // super[k] == index of the first column of the j-th super node
diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h
index ae3fac3..ae6373d 100644
--- a/Eigen/src/Core/ArithmeticSequence.h
+++ b/Eigen/src/Core/ArithmeticSequence.h
@@ -61,26 +61,28 @@ seqN(FirstType first, SizeType size, IncrType incr);
 template <typename FirstType, typename SizeType, typename IncrType>
 class ArithmeticSequence {
  public:
-  ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
-  ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {}
+  constexpr ArithmeticSequence() = default;
+  constexpr ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
+  constexpr ArithmeticSequence(FirstType first, SizeType size, IncrType incr)
+      : m_first(first), m_size(size), m_incr(incr) {}
 
   enum {
-    SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
+    // SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
     IncrAtCompileTime = internal::get_fixed_value<IncrType, DynamicIndex>::value
   };
 
   /** \returns the size, i.e., number of elements, of the sequence */
-  Index size() const { return m_size; }
+  constexpr Index size() const { return m_size; }
 
   /** \returns the first element \f$ a_0 \f$ in the sequence */
-  Index first() const { return m_first; }
+  constexpr Index first() const { return m_first; }
 
   /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */
-  Index operator[](Index i) const { return m_first + i * m_incr; }
+  constexpr Index operator[](Index i) const { return m_first + i * m_incr; }
 
-  const FirstType& firstObject() const { return m_first; }
-  const SizeType& sizeObject() const { return m_size; }
-  const IncrType& incrObject() const { return m_incr; }
+  constexpr const FirstType& firstObject() const { return m_first; }
+  constexpr const SizeType& sizeObject() const { return m_size; }
+  constexpr const IncrType& incrObject() const { return m_incr; }
 
  protected:
   FirstType m_first;
@@ -88,7 +90,7 @@ class ArithmeticSequence {
   IncrType m_incr;
 
  public:
-  auto reverse() const -> decltype(Eigen::seqN(m_first + (m_size + fix<-1>()) * m_incr, m_size, -m_incr)) {
+  constexpr auto reverse() const -> decltype(Eigen::seqN(m_first + (m_size + fix<-1>()) * m_incr, m_size, -m_incr)) {
     return seqN(m_first + (m_size + fix<-1>()) * m_incr, m_size, -m_incr);
   }
 };
@@ -201,38 +203,6 @@ auto lastN(SizeType size) -> decltype(seqN(Eigen::placeholders::last + fix<1>()
 
 }  // namespace placeholders
 
-namespace internal {
-
-// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
-template <typename T>
-struct make_size_type {
-  typedef std::conditional_t<symbolic::is_symbolic<T>::value, Index, T> type;
-};
-
-template <typename FirstType, typename SizeType, typename IncrType, int XprSize>
-struct IndexedViewCompatibleType<ArithmeticSequence<FirstType, SizeType, IncrType>, XprSize> {
-  typedef ArithmeticSequence<Index, typename make_size_type<SizeType>::type, IncrType> type;
-};
-
-template <typename FirstType, typename SizeType, typename IncrType>
-ArithmeticSequence<Index, typename make_size_type<SizeType>::type, IncrType> makeIndexedViewCompatible(
-    const ArithmeticSequence<FirstType, SizeType, IncrType>& ids, Index size, SpecializedType) {
-  return ArithmeticSequence<Index, typename make_size_type<SizeType>::type, IncrType>(
-      eval_expr_given_size(ids.firstObject(), size), eval_expr_given_size(ids.sizeObject(), size), ids.incrObject());
-}
-
-template <typename FirstType, typename SizeType, typename IncrType>
-struct get_compile_time_incr<ArithmeticSequence<FirstType, SizeType, IncrType> > {
-  enum { value = get_fixed_value<IncrType, DynamicIndex>::value };
-};
-
-template <typename FirstType, typename SizeType, typename IncrType>
-constexpr Index get_runtime_incr(const ArithmeticSequence<FirstType, SizeType, IncrType>& x) EIGEN_NOEXCEPT {
-  return static_cast<Index>(x.incrObject());
-}
-
-}  // end namespace internal
-
 /** \namespace Eigen::indexing
   * \ingroup Core_Module
   *
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 29c9682..57f3186 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -102,8 +102,13 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
     return Base::_set(other);
   }
 
-  /** This is a special case of the templated operator=. Its purpose is to
-   * prevent a default operator= from hiding the templated operator=.
+  /**
+   * \brief Assigns arrays to each other.
+   *
+   * \note This is a special case of the templated operator=. Its purpose is
+   * to prevent a default operator= from hiding the templated operator=.
+   *
+   * \callgraph
    */
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array& operator=(const Array& other) { return Base::_set(other); }
 
@@ -117,25 +122,27 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
    *
    * \sa resize(Index,Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array() : Base() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-  // FIXME is it still needed ??
-  /** \internal */
-  EIGEN_DEVICE_FUNC Array(internal::constructor_without_unaligned_array_assert)
-      : Base(internal::constructor_without_unaligned_array_assert()){EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED}
+#ifdef EIGEN_INITIALIZE_COEFFS
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array() : Base() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+#else
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array() = default;
 #endif
-
-        EIGEN_DEVICE_FUNC Array(Array && other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
-      : Base(std::move(other)) {
-  }
-  EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+  /** \brief Move constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(Array&&) = default;
+  EIGEN_DEVICE_FUNC Array& operator=(Array&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
     Base::operator=(std::move(other));
     return *this;
   }
 
-  /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const
-   * ArgTypes&... args)
+  /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients.
+   *
+   * \only_for_vectors
+   *
+   * This constructor is for 1D array or vectors with more than 4 coefficients.
+   *
+   * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+   * constructor must match the fixed number of rows (resp. columns) of \c *this.
+   *
    *
    * Example: \include Array_variadic_ctor_cxx11.cpp
    * Output: \verbinclude Array_variadic_ctor_cxx11.out
@@ -232,7 +239,7 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
   }
 
   /** Copy constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Array& other) : Base(other) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(const Array&) = default;
 
  private:
   struct PrivateType {};
@@ -246,8 +253,8 @@ class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxR
           PrivateType())
       : Base(other.derived()) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
 
 #ifdef EIGEN_ARRAY_PLUGIN
 #include EIGEN_ARRAY_PLUGIN
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index 6237df4..8465f54 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -81,9 +81,6 @@ class ArrayBase : public DenseBase<Derived> {
 
   typedef typename Base::CoeffReturnType CoeffReturnType;
 
-#endif  // not EIGEN_PARSED_BY_DOXYGEN
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
   typedef typename Base::PlainObject PlainObject;
 
   /** \internal Represents a matrix with all coefficients equal to one another*/
@@ -118,19 +115,57 @@ class ArrayBase : public DenseBase<Derived> {
     return derived();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const Scalar& scalar);
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const Scalar& scalar);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const Scalar& other) {
+    internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
+                              internal::add_assign_op<Scalar, Scalar>());
+    return derived();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const Scalar& other) {
+    internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
+                              internal::sub_assign_op<Scalar, Scalar>());
+    return derived();
+  }
 
+  /** replaces \c *this by \c *this + \a other.
+   *
+   * \returns a reference to \c *this
+   */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const ArrayBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
+
+  /** replaces \c *this by \c *this - \a other.
+   *
+   * \returns a reference to \c *this
+   */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const ArrayBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
 
+  /** replaces \c *this by \c *this * \a other coefficient wise.
+   *
+   * \returns a reference to \c *this
+   */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const ArrayBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
 
+  /** replaces \c *this by \c *this / \a other coefficient wise.
+   *
+   * \returns a reference to \c *this
+   */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const ArrayBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
 
  public:
   EIGEN_DEVICE_FUNC ArrayBase<Derived>& array() { return *this; }
@@ -173,50 +208,6 @@ class ArrayBase : public DenseBase<Derived> {
   }
 };
 
-/** replaces \c *this by \c *this - \a other.
- *
- * \returns a reference to \c *this
- */
-template <typename Derived>
-template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived>& other) {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
-  return derived();
-}
-
-/** replaces \c *this by \c *this + \a other.
- *
- * \returns a reference to \c *this
- */
-template <typename Derived>
-template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other) {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
-  return derived();
-}
-
-/** replaces \c *this by \c *this * \a other coefficient wise.
- *
- * \returns a reference to \c *this
- */
-template <typename Derived>
-template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other) {
-  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar, typename OtherDerived::Scalar>());
-  return derived();
-}
-
-/** replaces \c *this by \c *this / \a other coefficient wise.
- *
- * \returns a reference to \c *this
- */
-template <typename Derived>
-template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other) {
-  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar, typename OtherDerived::Scalar>());
-  return derived();
-}
-
 }  // end namespace Eigen
 
 #endif  // EIGEN_ARRAYBASE_H
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index b45395d..c9a194e 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -56,17 +56,13 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > {
 
   EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
-    return m_expression.outerStride();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_expression.innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
 
-  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
-  EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_expression.data(); }
+  EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
 
   EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
     return m_expression.coeffRef(rowId, colId);
@@ -135,17 +131,13 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > {
 
   EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
-    return m_expression.outerStride();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_expression.innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
 
-  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
-  EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_expression.data(); }
+  EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
 
   EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
     return m_expression.derived().coeffRef(rowId, colId);
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index f7f0b23..36f0a9d 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -27,125 +27,116 @@ namespace internal {
 
 // copy_using_evaluator_traits is based on assign_traits
 
-template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = Dynamic>
 struct copy_using_evaluator_traits {
-  typedef typename DstEvaluator::XprType Dst;
-  typedef typename Dst::Scalar DstScalar;
+  using Src = typename SrcEvaluator::XprType;
+  using Dst = typename DstEvaluator::XprType;
+  using DstScalar = typename Dst::Scalar;
 
-  enum { DstFlags = DstEvaluator::Flags, SrcFlags = SrcEvaluator::Flags };
+  static constexpr int DstFlags = DstEvaluator::Flags;
+  static constexpr int SrcFlags = SrcEvaluator::Flags;
 
  public:
-  enum {
-    DstAlignment = DstEvaluator::Alignment,
-    SrcAlignment = SrcEvaluator::Alignment,
-    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
-    JointAlignment = plain_enum_min(DstAlignment, SrcAlignment)
-  };
-
- private:
-  enum {
-    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
-                : int(DstFlags) & RowMajorBit   ? int(Dst::ColsAtCompileTime)
-                                                : int(Dst::RowsAtCompileTime),
-    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
-                   : int(DstFlags) & RowMajorBit   ? int(Dst::MaxColsAtCompileTime)
-                                                   : int(Dst::MaxRowsAtCompileTime),
-    RestrictedInnerSize = min_size_prefer_fixed(InnerSize, MaxPacketSize),
-    RestrictedLinearSize = min_size_prefer_fixed(Dst::SizeAtCompileTime, MaxPacketSize),
-    OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
-    MaxSizeAtCompileTime = Dst::SizeAtCompileTime
-  };
+  static constexpr int DstAlignment = DstEvaluator::Alignment;
+  static constexpr int SrcAlignment = SrcEvaluator::Alignment;
+  static constexpr int JointAlignment = plain_enum_min(DstAlignment, SrcAlignment);
+  static constexpr bool DstHasDirectAccess = bool(DstFlags & DirectAccessBit);
+  static constexpr bool SrcIsRowMajor = bool(SrcFlags & RowMajorBit);
+  static constexpr bool DstIsRowMajor = bool(DstFlags & RowMajorBit);
+  static constexpr bool IsVectorAtCompileTime = Dst::IsVectorAtCompileTime;
+  static constexpr int RowsAtCompileTime = size_prefer_fixed(Src::RowsAtCompileTime, Dst::RowsAtCompileTime);
+  static constexpr int ColsAtCompileTime = size_prefer_fixed(Src::ColsAtCompileTime, Dst::ColsAtCompileTime);
+  static constexpr int SizeAtCompileTime = size_at_compile_time(RowsAtCompileTime, ColsAtCompileTime);
+  static constexpr int MaxRowsAtCompileTime =
+      min_size_prefer_fixed(Src::MaxRowsAtCompileTime, Dst::MaxRowsAtCompileTime);
+  static constexpr int MaxColsAtCompileTime =
+      min_size_prefer_fixed(Src::MaxColsAtCompileTime, Dst::MaxColsAtCompileTime);
+  static constexpr int MaxSizeAtCompileTime =
+      min_size_prefer_fixed(Src::MaxSizeAtCompileTime, Dst::MaxSizeAtCompileTime);
+  static constexpr int InnerSizeAtCompileTime = IsVectorAtCompileTime ? SizeAtCompileTime
+                                                : DstIsRowMajor       ? ColsAtCompileTime
+                                                                      : RowsAtCompileTime;
+  static constexpr int MaxInnerSizeAtCompileTime = IsVectorAtCompileTime ? MaxSizeAtCompileTime
+                                                   : DstIsRowMajor       ? MaxColsAtCompileTime
+                                                                         : MaxRowsAtCompileTime;
+  static constexpr int RestrictedInnerSize = min_size_prefer_fixed(MaxInnerSizeAtCompileTime, MaxPacketSize);
+  static constexpr int RestrictedLinearSize = min_size_prefer_fixed(MaxSizeAtCompileTime, MaxPacketSize);
+  static constexpr int OuterStride = outer_stride_at_compile_time<Dst>::ret;
 
   // TODO distinguish between linear traversal and inner-traversals
-  typedef typename find_best_packet<DstScalar, RestrictedLinearSize>::type LinearPacketType;
-  typedef typename find_best_packet<DstScalar, RestrictedInnerSize>::type InnerPacketType;
+  using LinearPacketType = typename find_best_packet<DstScalar, RestrictedLinearSize>::type;
+  using InnerPacketType = typename find_best_packet<DstScalar, RestrictedInnerSize>::type;
 
-  enum {
-    LinearPacketSize = unpacket_traits<LinearPacketType>::size,
-    InnerPacketSize = unpacket_traits<InnerPacketType>::size
-  };
+  static constexpr int LinearPacketSize = unpacket_traits<LinearPacketType>::size;
+  static constexpr int InnerPacketSize = unpacket_traits<InnerPacketType>::size;
 
  public:
-  enum {
-    LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
-    InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
-  };
+  static constexpr int LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment;
+  static constexpr int InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment;
 
  private:
-  enum {
-    DstIsRowMajor = DstFlags & RowMajorBit,
-    SrcIsRowMajor = SrcFlags & RowMajorBit,
-    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
-    MightVectorize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) &&
-                     bool(functor_traits<AssignFunc>::PacketAccess),
-    MayInnerVectorize = MightVectorize && int(InnerSize) != Dynamic && int(InnerSize) % int(InnerPacketSize) == 0 &&
-                        int(OuterStride) != Dynamic && int(OuterStride) % int(InnerPacketSize) == 0 &&
-                        (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment) >= int(InnerRequiredAlignment)),
-    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess) &&
-                         (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment) >= int(LinearRequiredAlignment)) ||
-                          MaxSizeAtCompileTime == Dynamic),
-    /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-       so it's only good for large enough sizes. */
-    MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess) &&
-                        (int(InnerMaxSize) == Dynamic ||
-                         int(InnerMaxSize) >= (EIGEN_UNALIGNED_VECTORIZE ? InnerPacketSize : (3 * InnerPacketSize)))
-    /* slice vectorization can be slow, so we only want it if the slices are big, which is
-       indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-       in a fixed-size matrix
-       However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
-  };
+  static constexpr bool StorageOrdersAgree = DstIsRowMajor == SrcIsRowMajor;
+  static constexpr bool MightVectorize = StorageOrdersAgree && bool(DstFlags & SrcFlags & ActualPacketAccessBit) &&
+                                         bool(functor_traits<AssignFunc>::PacketAccess);
+  static constexpr bool MayInnerVectorize = MightVectorize && (InnerSizeAtCompileTime != Dynamic) &&
+                                            (InnerSizeAtCompileTime % InnerPacketSize == 0) &&
+                                            (OuterStride != Dynamic) && (OuterStride % InnerPacketSize == 0) &&
+                                            (EIGEN_UNALIGNED_VECTORIZE || JointAlignment >= InnerRequiredAlignment);
+  static constexpr bool MayLinearize = StorageOrdersAgree && (DstFlags & SrcFlags & LinearAccessBit);
+  static constexpr bool MayLinearVectorize =
+      MightVectorize && MayLinearize && DstHasDirectAccess &&
+      (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment) || MaxSizeAtCompileTime == Dynamic) &&
+      (MaxSizeAtCompileTime == Dynamic || MaxSizeAtCompileTime >= LinearPacketSize);
+  /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+     so it's only good for large enough sizes. */
+  static constexpr int InnerSizeThreshold = (EIGEN_UNALIGNED_VECTORIZE ? 1 : 3) * InnerPacketSize;
+  static constexpr bool MaySliceVectorize =
+      MightVectorize && DstHasDirectAccess &&
+      (MaxInnerSizeAtCompileTime == Dynamic || MaxInnerSizeAtCompileTime >= InnerSizeThreshold);
+  /* slice vectorization can be slow, so we only want it if the slices are big, which is
+     indicated by MaxInnerSizeAtCompileTime rather than InnerSizeAtCompileTime, think of the
+     case of a dynamic block in a fixed-size matrix
+     However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
 
  public:
-  enum {
-    Traversal = int(Dst::SizeAtCompileTime) == 0
-                    ? int(AllAtOnceTraversal)  // If compile-size is zero, traversing will fail at compile-time.
-                : (int(MayLinearVectorize) && (LinearPacketSize > InnerPacketSize)) ? int(LinearVectorizedTraversal)
-                : int(MayInnerVectorize)                                            ? int(InnerVectorizedTraversal)
-                : int(MayLinearVectorize)                                           ? int(LinearVectorizedTraversal)
-                : int(MaySliceVectorize)                                            ? int(SliceVectorizedTraversal)
-                : int(MayLinearize)                                                 ? int(LinearTraversal)
-                                                                                    : int(DefaultTraversal),
-    Vectorized = int(Traversal) == InnerVectorizedTraversal || int(Traversal) == LinearVectorizedTraversal ||
-                 int(Traversal) == SliceVectorizedTraversal
-  };
-
-  typedef std::conditional_t<int(Traversal) == LinearVectorizedTraversal, LinearPacketType, InnerPacketType> PacketType;
+  static constexpr int Traversal = SizeAtCompileTime == 0 ? AllAtOnceTraversal
+                                   : (MayLinearVectorize && (LinearPacketSize > InnerPacketSize))
+                                       ? LinearVectorizedTraversal
+                                   : MayInnerVectorize  ? InnerVectorizedTraversal
+                                   : MayLinearVectorize ? LinearVectorizedTraversal
+                                   : MaySliceVectorize  ? SliceVectorizedTraversal
+                                   : MayLinearize       ? LinearTraversal
+                                                        : DefaultTraversal;
+  static constexpr bool Vectorized = Traversal == InnerVectorizedTraversal || Traversal == LinearVectorizedTraversal ||
+                                     Traversal == SliceVectorizedTraversal;
+
+  using PacketType = std::conditional_t<Traversal == LinearVectorizedTraversal, LinearPacketType, InnerPacketType>;
 
  private:
-  enum {
-    ActualPacketSize = int(Traversal) == LinearVectorizedTraversal ? LinearPacketSize
-                       : Vectorized                                ? InnerPacketSize
-                                                                   : 1,
-    UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
-    MayUnrollCompletely =
-        int(Dst::SizeAtCompileTime) != Dynamic &&
-        int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost)) <=
-            int(UnrollingLimit),
-    MayUnrollInner =
-        int(InnerSize) != Dynamic &&
-        int(InnerSize) * (int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
-  };
+  static constexpr int ActualPacketSize = Vectorized ? unpacket_traits<PacketType>::size : 1;
+  static constexpr int UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize;
+  static constexpr int CoeffReadCost = int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost);
+  static constexpr bool MayUnrollCompletely =
+      (SizeAtCompileTime != Dynamic) && (SizeAtCompileTime * CoeffReadCost <= UnrollingLimit);
+  static constexpr bool MayUnrollInner =
+      (InnerSizeAtCompileTime != Dynamic) && (InnerSizeAtCompileTime * CoeffReadCost <= UnrollingLimit);
 
  public:
-  enum {
-    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
-                    ? (int(MayUnrollCompletely) ? int(CompleteUnrolling)
-                       : int(MayUnrollInner)    ? int(InnerUnrolling)
-                                                : int(NoUnrolling))
-                : int(Traversal) == int(LinearVectorizedTraversal)
-                    ? (bool(MayUnrollCompletely) &&
-                               (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment) >= int(LinearRequiredAlignment)))
-                           ? int(CompleteUnrolling)
-                           : int(NoUnrolling))
-                : int(Traversal) == int(LinearTraversal)
-                    ? (bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling))
+  static constexpr int Unrolling =
+      (Traversal == InnerVectorizedTraversal || Traversal == DefaultTraversal)
+          ? (MayUnrollCompletely ? CompleteUnrolling
+             : MayUnrollInner    ? InnerUnrolling
+                                 : NoUnrolling)
+      : Traversal == LinearVectorizedTraversal
+          ? (MayUnrollCompletely && (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment))
+                 ? CompleteUnrolling
+                 : NoUnrolling)
+      : Traversal == LinearTraversal ? (MayUnrollCompletely ? CompleteUnrolling : NoUnrolling)
 #if EIGEN_UNALIGNED_VECTORIZE
-                : int(Traversal) == int(SliceVectorizedTraversal)
-                    ? (bool(MayUnrollInner) ? int(InnerUnrolling) : int(NoUnrolling))
+      : Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling)
 #endif
-                    : int(NoUnrolling)
-  };
+                                              : NoUnrolling;
+  static constexpr bool UsePacketSegment = has_packet_segment<PacketType>::value;
 
 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug() {
@@ -162,8 +153,8 @@ struct copy_using_evaluator_traits {
     EIGEN_DEBUG_VAR(LinearRequiredAlignment)
     EIGEN_DEBUG_VAR(InnerRequiredAlignment)
     EIGEN_DEBUG_VAR(JointAlignment)
-    EIGEN_DEBUG_VAR(InnerSize)
-    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(InnerSizeAtCompileTime)
+    EIGEN_DEBUG_VAR(MaxInnerSizeAtCompileTime)
     EIGEN_DEBUG_VAR(LinearPacketSize)
     EIGEN_DEBUG_VAR(InnerPacketSize)
     EIGEN_DEBUG_VAR(ActualPacketSize)
@@ -196,28 +187,25 @@ struct copy_using_evaluator_traits {
 *** Default traversal ***
 ************************/
 
-template <typename Kernel, int Index, int Stop>
+template <typename Kernel, int Index_, int Stop>  // compile-time unrolled scalar copy over flat indices [Index_, Stop)
 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling {
-  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
-  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
-  typedef typename DstEvaluatorType::XprType DstXprType;
-
-  enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime };
+  static constexpr int Outer = Index_ / Kernel::AssignmentTraits::InnerSizeAtCompileTime;  // outer position of the flat index
+  static constexpr int Inner = Index_ % Kernel::AssignmentTraits::InnerSizeAtCompileTime;  // inner position of the flat index
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    kernel.assignCoeffByOuterInner(outer, inner);
-    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index + 1, Stop>::run(kernel);
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    kernel.assignCoeffByOuterInner(Outer, Inner);  // assign one coefficient, then recurse on Index_ + 1
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
   }
 };
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}  // index == Stop: recursion ends, nothing to assign
 };
 
 template <typename Kernel, int Index_, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer) {
     kernel.assignCoeffByOuterInner(outer, Index_);
     copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_ + 1, Stop>::run(kernel, outer);
   }
@@ -225,62 +213,57 @@ struct copy_using_evaluator_DefaultTraversal_InnerUnrolling {
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}  // inner index == Stop: recursion ends
 };
 
 /***********************
 *** Linear traversal ***
 ***********************/
 
-template <typename Kernel, int Index, int Stop>
+template <typename Kernel, int Index_, int Stop>  // compile-time unrolled assignCoeff over linear indices [Index_, Stop)
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    kernel.assignCoeff(Index);
-    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index + 1, Stop>::run(kernel);
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    kernel.assignCoeff(Index_);  // one coefficient per recursion level
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
   }
 };
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}  // index == Stop: recursion ends, nothing to assign
 };
 
 /**************************
 *** Inner vectorization ***
 **************************/
 
-template <typename Kernel, int Index, int Stop>
+template <typename Kernel, int Index_, int Stop>  // compile-time unrolled packet copy over flat indices [Index_, Stop)
 struct copy_using_evaluator_innervec_CompleteUnrolling {
-  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
-  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
-  typedef typename DstEvaluatorType::XprType DstXprType;
-  typedef typename Kernel::PacketType PacketType;
-
-  enum {
-    outer = Index / DstXprType::InnerSizeAtCompileTime,
-    inner = Index % DstXprType::InnerSizeAtCompileTime,
-    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
-    DstAlignment = Kernel::AssignmentTraits::DstAlignment
-  };
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int Outer = Index_ / Kernel::AssignmentTraits::InnerSizeAtCompileTime;  // outer position of the flat index
+  static constexpr int Inner = Index_ % Kernel::AssignmentTraits::InnerSizeAtCompileTime;  // inner position of the flat index
+  static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;  // each step advances by one whole packet
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
 
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
-    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(Outer, Inner);  // one packet, then recurse
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
   }
 };
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}  // index == Stop: recursion ends, nothing to assign
 };
 
 template <typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling {
-  typedef typename Kernel::PacketType PacketType;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;
+
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
     kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
-    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
     copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel,
                                                                                                            outer);
   }
@@ -288,7 +271,34 @@ struct copy_using_evaluator_innervec_InnerUnrolling {
 
 template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}  // inner index == Stop: recursion ends
+};
+
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment, bool UsePacketSegment>
+struct copy_using_evaluator_innervec_segment {  // primary template: covers inner range [Start, Stop) with one packet-segment call
+  using PacketType = typename Kernel::PacketType;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
+    kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Start, 0,
+                                                                                            Stop - Start);  // presumably (outer, inner, begin, count) -- confirm against the kernel
+  }
+};
+
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Start, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ false>
+    : copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Start, Stop> {};  // no packet segments: inherit the unrolled scalar copy
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}  // Start == Stop: empty range, no-op
+};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}  // Start == Stop: empty range, no-op
 };
 
 /***************************************************************************
@@ -299,7 +309,21 @@ struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlign
 
 template <typename Kernel, int Traversal = Kernel::AssignmentTraits::Traversal,
           int Unrolling = Kernel::AssignmentTraits::Unrolling>
-struct dense_assignment_loop;
+struct dense_assignment_loop_impl;  // the per-(Traversal, Unrolling) loops are specializations of this template
+
+template <typename Kernel, int Traversal = Kernel::AssignmentTraits::Traversal,
+          int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop {  // entry point: forwards run() to a dense_assignment_loop_impl specialization
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+#ifdef __cpp_lib_is_constant_evaluated
+    if (internal::is_constant_evaluated())  // constant evaluation: DefaultTraversal + NoUnrolling (AllAtOnceTraversal is kept)
+      dense_assignment_loop_impl<Kernel, Traversal == AllAtOnceTraversal ? AllAtOnceTraversal : DefaultTraversal,
+                                 NoUnrolling>::run(kernel);
+    else
+#endif
+      dense_assignment_loop_impl<Kernel, Traversal, Unrolling>::run(kernel);  // otherwise: the requested Traversal/Unrolling
+  }
+};
 
 /************************
 ***** Special Cases *****
@@ -307,10 +331,11 @@ struct dense_assignment_loop;
 
 // Zero-sized assignment is a no-op.
 template <typename Kernel, int Unrolling>
-struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling> {
-  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) {
-    EIGEN_STATIC_ASSERT(int(Kernel::DstEvaluatorType::XprType::SizeAtCompileTime) == 0,
-                        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
+struct dense_assignment_loop_impl<Kernel, AllAtOnceTraversal, Unrolling> {
+  static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;  // now read from the traits, not the dst xpr
+
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& /*kernel*/) {
+    EIGEN_STATIC_ASSERT(SizeAtCompileTime == 0, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)  // compile-time guard only; no runtime work
   }
 };
 
@@ -319,8 +344,8 @@ struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling> {
 ************************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling> {
-  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& kernel) {
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, NoUnrolling> {
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& kernel) {
     for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
       for (Index inner = 0; inner < kernel.innerSize(); ++inner) {
         kernel.assignCoeffByOuterInner(outer, inner);
@@ -330,22 +355,22 @@ struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling> {
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, CompleteUnrolling> {
+  static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;  // total number of coefficients to unroll
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, SizeAtCompileTime>::run(kernel);  // unroll indices [0, Size)
   }
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, InnerUnrolling> {
+  static constexpr int InnerSizeAtCompileTime = Kernel::AssignmentTraits::InnerSizeAtCompileTime;  // unrolled inner extent
 
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {  // runtime outer loop, unrolled inner copy
     const Index outerSize = kernel.outerSize();
     for (Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel,
-                                                                                                               outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSizeAtCompileTime>::run(kernel, outer);
   }
 };
 
@@ -356,92 +381,134 @@ struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling> {
 // The goal of unaligned_dense_assignment_loop is simply to factorize the handling
 // of the non vectorizable beginning and ending parts
 
-template <bool IsAligned = false>
+template <typename PacketType, int DstAlignment, int SrcAlignment, bool UsePacketSegment, bool Skip>
 struct unaligned_dense_assignment_loop {
-  // if IsAligned = true, then do nothing
+  // Primary template, selected when Skip == true (both Skip == false cases are specialized below): does nothing.
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*start*/, Index /*end*/) {}  // linear-index overload
   template <typename Kernel>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*outer*/,
+                                                                  Index /*innerStart*/, Index /*innerEnd*/) {}  // (outer, inner range) overload
 };
 
-template <>
-struct unaligned_dense_assignment_loop<false> {
-  // MSVC must not inline this functions. If it does, it fails to optimize the
-  // packet access path.
-  // FIXME check which version exhibits this issue
-#if EIGEN_COMP_MSVC
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ true,
+                                       /*Skip*/ false> {
   template <typename Kernel>
-  static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end)
-#else
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {  // NOTE(review): the removed MSVC-only EIGEN_DONT_INLINE variant has no counterpart here
+    Index count = end - start;
+    eigen_assert(count <= unpacket_traits<PacketType>::size);  // the whole range must fit in a single packet
+    if (count > 0) kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(start, 0, count);  // empty range: no call
+  }
   template <typename Kernel>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end)
-#endif
-  {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index start, Index end) {
+    Index count = end - start;
+    eigen_assert(count <= unpacket_traits<PacketType>::size);  // the whole range must fit in a single packet
+    if (count > 0)
+      kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, start, 0, count);
+  }
+};
+
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ false,
+                                       /*Skip*/ false> {
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {  // scalar fallback over [start, end)
     for (Index index = start; index < end; ++index) kernel.assignCoeff(index);
   }
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index innerStart,
+                                                                  Index innerEnd) {
+    for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner);  // scalar fallback, one outer slice
+  }
 };
 
-template <typename Kernel, int Index, int Stop>
+template <typename Kernel, int Index_, int Stop>  // compile-time unrolled packet copy over linear indices [Index_, Stop)
 struct copy_using_evaluator_linearvec_CompleteUnrolling {
-  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
-  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
-  typedef typename DstEvaluatorType::XprType DstXprType;
-  typedef typename Kernel::PacketType PacketType;
-
-  enum { SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, DstAlignment = Kernel::AssignmentTraits::DstAlignment };
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+  static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;  // each step advances by one whole packet
 
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(Index);
-    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
+    kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(Index_);  // one packet, then recurse on NextIndex
     copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
   }
 };
 
 template <typename Kernel, int Stop>
 struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}  // index == Stop: recursion ends, nothing to assign
+};
+
+template <typename Kernel, int Index_, int Stop, bool UsePacketSegment>
+struct copy_using_evaluator_linearvec_segment {  // primary template: covers linear range [Index_, Stop) with one packet-segment call
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+    kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(Index_, 0, Stop - Index_);  // presumably (index, begin, count) -- confirm against the kernel
+  }
+};
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Index_, Stop, /*UsePacketSegment*/ false>
+    : copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_, Stop> {};  // no packet segments: inherit the unrolled scalar copy
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}  // Index_ == Stop: empty range, no-op
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}  // Index_ == Stop: empty range, no-op
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));  // never below alignof(Scalar)
+  static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
+  static constexpr bool Alignable =
+      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);  // already aligned, or the gap is a whole number of scalars
+  static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;  // dst alignment passed to the main packet loop
+  static constexpr bool DstIsAligned = DstAlignment >= Alignment;  // true => no scalar/segment head is needed
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  using head_loop =
+      unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;  // skipped when DstIsAligned
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, SrcAlignment, UsePacketSegment, false>;  // never skipped
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {  // head [0, alignedStart), packets, tail [alignedEnd, size)
     const Index size = kernel.size();
-    typedef typename Kernel::Scalar Scalar;
-    typedef typename Kernel::PacketType PacketType;
-    enum {
-      requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
-      packetSize = unpacket_traits<PacketType>::size,
-      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment) >= int(requestedAlignment),
-      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
-                                                            : int(Kernel::AssignmentTraits::DstAlignment),
-      srcAlignment = Kernel::AssignmentTraits::JointAlignment
-    };
-    const Index alignedStart =
-        dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
-    const Index alignedEnd = alignedStart + ((size - alignedStart) / packetSize) * packetSize;
+    const Index alignedStart = DstIsAligned ? 0 : first_aligned<Alignment>(kernel.dstDataPtr(), size);
+    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);  // replaces the removed ((size - alignedStart) / packetSize) * packetSize
 
-    unaligned_dense_assignment_loop<dstIsAligned != 0>::run(kernel, 0, alignedStart);
+    head_loop::run(kernel, 0, alignedStart);
 
-    for (Index index = alignedStart; index < alignedEnd; index += packetSize)
-      kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);
+    for (Index index = alignedStart; index < alignedEnd; index += PacketSize)
+      kernel.template assignPacket<Alignment, SrcAlignment, PacketType>(index);
 
-    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+    tail_loop::run(kernel, alignedEnd, size);
   }
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    typedef typename Kernel::PacketType PacketType;
-
-    enum {
-      size = DstXprType::SizeAtCompileTime,
-      packetSize = unpacket_traits<PacketType>::size,
-      alignedSize = (int(size) / packetSize) * packetSize
-    };
-
-    copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
-    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
+struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, CompleteUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
+  static constexpr int AlignedSize = numext::round_down(Size, PacketSize);  // replaces the removed (size / packetSize) * packetSize
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel);  // whole packets: [0, AlignedSize)
+    copy_using_evaluator_linearvec_segment<Kernel, AlignedSize, Size, UsePacketSegment>::run(kernel);  // remainder: [AlignedSize, Size)
   }
 };
 
@@ -450,36 +517,41 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
 **************************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> {
-  typedef typename Kernel::PacketType PacketType;
-  enum { SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, DstAlignment = Kernel::AssignmentTraits::DstAlignment };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, NoUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;  // note: the removed code used AssignmentTraits::SrcAlignment
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {  // packets only, no scalar remainder; presumably innerSize is a multiple of PacketSize -- verify in the traits
     const Index innerSize = kernel.innerSize();
     const Index outerSize = kernel.outerSize();
-    const Index packetSize = unpacket_traits<PacketType>::size;
     for (Index outer = 0; outer < outerSize; ++outer)
-      for (Index inner = 0; inner < innerSize; inner += packetSize)
+      for (Index inner = 0; inner < innerSize; inner += PacketSize)
         kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
   }
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
+  static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;  // total number of coefficients to unroll
+
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, SizeAtCompileTime>::run(kernel);  // packet-unroll indices [0, Size)
   }
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling> {
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, InnerUnrolling> {
+  static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;  // unrolled inner extent
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    typedef typename Kernel::AssignmentTraits Traits;
     const Index outerSize = kernel.outerSize();
     for (Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime, Traits::SrcAlignment,
-                                                   Traits::DstAlignment>::run(kernel, outer);
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(kernel,
+                                                                                                          outer);  // runtime outer loop, unrolled inner packets
   }
 };
 
@@ -488,18 +560,18 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling> {
 ***********************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
+struct dense_assignment_loop_impl<Kernel, LinearTraversal, NoUnrolling> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {  // runtime scalar loop over linear indices [0, size)
     const Index size = kernel.size();
     for (Index i = 0; i < size; ++i) kernel.assignCoeff(i);
   }
 };
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+struct dense_assignment_loop_impl<Kernel, LinearTraversal, CompleteUnrolling> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {  // unrolled scalar copy over linear indices [0, Size)
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::AssignmentTraits::SizeAtCompileTime>::run(
+        kernel);
   }
 };
 
@@ -508,64 +580,62 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling> {
 ***************************/
 
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
-    typedef typename Kernel::Scalar Scalar;
-    typedef typename Kernel::PacketType PacketType;
-    enum {
-      packetSize = unpacket_traits<PacketType>::size,
-      requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
-      alignable =
-          packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment) >= sizeof(Scalar),
-      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment) >= int(requestedAlignment),
-      dstAlignment = alignable ? int(requestedAlignment) : int(Kernel::AssignmentTraits::DstAlignment)
-    };
+struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;  // NOTE(review): not referenced below; the loops pass Unaligned for the source
+  static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));  // never below alignof(Scalar)
+  static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
+  static constexpr bool Alignable =
+      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);  // already aligned, or the gap is a whole number of scalars
+  static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;  // dst alignment passed to the main packet loop
+  static constexpr bool DstIsAligned = DstAlignment >= Alignment;
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  using head_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, Unaligned, UsePacketSegment, !Alignable>;  // skipped when !Alignable: alignedStart then stays 0
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, Unaligned, UsePacketSegment, false>;  // never skipped
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {  // NOTE(review): the removed runtime fallback to DefaultTraversal for a dst pointer not aligned on sizeof(Scalar) is gone -- confirm the alignof(Scalar) clamp covers it
     const Scalar* dst_ptr = kernel.dstDataPtr();
-    if ((!bool(dstIsAligned)) && (std::uintptr_t(dst_ptr) % sizeof(Scalar)) > 0) {
-      // the pointer is not aligned-on scalar, so alignment is not possible
-      return dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>::run(kernel);
-    }
-    const Index packetAlignedMask = packetSize - 1;
     const Index innerSize = kernel.innerSize();
     const Index outerSize = kernel.outerSize();
-    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart =
-        ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
+    const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0;  // shift applied to alignedStart after each outer slice
+    Index alignedStart = ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<Alignment>(dst_ptr, innerSize);
 
     for (Index outer = 0; outer < outerSize; ++outer) {
-      const Index alignedEnd = alignedStart + ((innerSize - alignedStart) & ~packetAlignedMask);
-      // do the non-vectorizable part of the assignment
-      for (Index inner = 0; inner < alignedStart; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+      const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize);  // replaces the removed mask with ~packetAlignedMask
+
+      head_loop::run(kernel, outer, 0, alignedStart);
 
       // do the vectorizable part of the assignment
-      for (Index inner = alignedStart; inner < alignedEnd; inner += packetSize)
-        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);
+      for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize)
+        kernel.template assignPacketByOuterInner<Alignment, Unaligned, PacketType>(outer, inner);
 
-      // do the non-vectorizable part of the assignment
-      for (Index inner = alignedEnd; inner < innerSize; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+      tail_loop::run(kernel, outer, alignedEnd, innerSize);
 
-      alignedStart = numext::mini((alignedStart + alignedStep) % packetSize, innerSize);
+      alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize);  // first packet-aligned inner index of the next slice
     }
   }
 };
 
 #if EIGEN_UNALIGNED_VECTORIZE
 template <typename Kernel>
-struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) {
-    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
-    typedef typename Kernel::PacketType PacketType;
-
-    enum {
-      innerSize = DstXprType::InnerSizeAtCompileTime,
-      packetSize = unpacket_traits<PacketType>::size,
-      vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize),
-      size = DstXprType::SizeAtCompileTime
-    };
-
+struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, InnerUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+  static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize);  // replaces the removed (innerSize / packetSize) * packetSize
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  using packet_loop = copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, Unaligned, Unaligned>;  // whole packets: [0, VectorizableSize)
+  using packet_segment_loop = copy_using_evaluator_innervec_segment<Kernel, VectorizableSize, InnerSize, Unaligned,
+                                                                    Unaligned, UsePacketSegment>;  // remainder: [VectorizableSize, InnerSize)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
     for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
-      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, innerSize>::run(kernel, outer);
+      packet_loop::run(kernel, outer);
+      packet_segment_loop::run(kernel, outer);  // scalar fallback when UsePacketSegment is false
     }
   }
 };
@@ -594,27 +664,28 @@ class generic_dense_assignment_kernel {
   typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
   typedef typename AssignmentTraits::PacketType PacketType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE generic_dense_assignment_kernel(DstEvaluatorType& dst,
-                                                                        const SrcEvaluatorType& src,
-                                                                        const Functor& func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr generic_dense_assignment_kernel(DstEvaluatorType& dst,
+                                                                                  const SrcEvaluatorType& src,
+                                                                                  const Functor& func,
+                                                                                  DstXprType& dstExpr)
       : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) {
 #ifdef EIGEN_DEBUG_ASSIGN
     AssignmentTraits::debug();
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerSize() const noexcept { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerSize() const noexcept { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_dstExpr.outerStride(); }
 
-  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
-  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() noexcept { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const noexcept { return m_src; }
 
   /// Assign src(row,col) to dst(row,col) through the assignment functor.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index row, Index col) {
     m_functor.assignCoeff(m_dst.coeffRef(row, col), m_src.coeff(row, col));
   }
 
@@ -624,7 +695,7 @@ class generic_dense_assignment_kernel {
   }
 
   /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeffByOuterInner(Index outer, Index inner) {
     Index row = rowIndexByOuterInner(outer, inner);
     Index col = colIndexByOuterInner(outer, inner);
     assignCoeff(row, col);
@@ -648,7 +719,28 @@ class generic_dense_assignment_kernel {
     assignPacket<StoreMode, LoadMode, Packet>(row, col);
   }
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) {
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+    m_functor.template assignPacketSegment<StoreMode>(
+        &m_dst.coeffRef(row, col), m_src.template packetSegment<LoadMode, Packet>(row, col, begin, count), begin,
+        count);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+    m_functor.template assignPacketSegment<StoreMode>(
+        &m_dst.coeffRef(index), m_src.template packetSegment<LoadMode, Packet>(index, begin, count), begin, count);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin,
+                                                                             Index count) {
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacketSegment<StoreMode, LoadMode, Packet>(row, col, begin, count);
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::RowsAtCompileTime) == 1          ? 0
            : int(Traits::ColsAtCompileTime) == 1        ? inner
@@ -656,7 +748,7 @@ class generic_dense_assignment_kernel {
                                                         : inner;
   }
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index colIndexByOuterInner(Index outer, Index inner) {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::ColsAtCompileTime) == 1          ? 0
            : int(Traits::RowsAtCompileTime) == 1        ? inner
@@ -700,16 +792,16 @@ class restricted_packet_dense_assignment_kernel
  ***************************************************************************/
 
 template <typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
-                                                             const Functor& /*func*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
+                                                                       const Functor& /*func*/) {
   EIGEN_ONLY_USED_FOR_DEBUG(dst);
   EIGEN_ONLY_USED_FOR_DEBUG(src);
   eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
 }
 
 template <typename DstXprType, typename SrcXprType, typename T1, typename T2>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
-                                                             const internal::assign_op<T1, T2>& /*func*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
+                                                                       const internal::assign_op<T1, T2>& /*func*/) {
   Index dstRows = src.rows();
   Index dstCols = src.cols();
   if (((dst.rows() != dstRows) || (dst.cols() != dstCols))) dst.resize(dstRows, dstCols);
@@ -717,9 +809,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize_if_allowed(DstXprType& dst, co
 }
 
 template <typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst,
-                                                                                      const SrcXprType& src,
-                                                                                      const Functor& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+                                                                                const Functor& func) {
   typedef evaluator<DstXprType> DstEvaluatorType;
   typedef evaluator<SrcXprType> SrcEvaluatorType;
 
@@ -737,18 +828,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment
   dense_assignment_loop<Kernel>::run(kernel);
 }
 
-// Specialization for filling the destination with a constant value.
-#ifndef EIGEN_GPU_COMPILE_PHASE
-template <typename DstXprType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(
-    DstXprType& dst,
-    const Eigen::CwiseNullaryOp<Eigen::internal::scalar_constant_op<typename DstXprType::Scalar>, DstXprType>& src,
-    const internal::assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>& func) {
-  resize_if_allowed(dst, src, func);
-  std::fill_n(dst.data(), dst.size(), src.functor()());
-}
-#endif
-
 template <typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
   call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>());
@@ -790,7 +869,7 @@ struct Assignment;
 // not has to bother about these annoying details.
 
 template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(Dst& dst, const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(Dst& dst, const Src& src) {
   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
 }
 template <typename Dst, typename Src>
@@ -800,14 +879,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(const Dst& dst, const
 
 // Deal with "assume-aliasing"
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(
     Dst& dst, const Src& src, const Func& func, std::enable_if_t<evaluator_assume_aliasing<Src>::value, void*> = 0) {
   typename plain_matrix_type<Src>::type tmp(src);
   call_assignment_no_alias(dst, tmp, func);
 }
 
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(
     Dst& dst, const Src& src, const Func& func, std::enable_if_t<!evaluator_assume_aliasing<Src>::value, void*> = 0) {
   call_assignment_no_alias(dst, src, func);
 }
@@ -815,14 +894,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(
 // by-pass "assume-aliasing"
 // When there is no aliasing, we require that 'dst' has been properly resized
 template <typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(NoAlias<Dst, StorageBase>& dst,
-                                                                           const Src& src, const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(NoAlias<Dst, StorageBase>& dst, const Src& src,
+                                                                     const Func& func) {
   call_assignment_no_alias(dst.expression(), src, func);
 }
 
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src,
-                                                                                    const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src,
+                                                                              const Func& func) {
   enum {
     NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
                        (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
@@ -861,14 +940,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_restricted_packet_assignment_no_
 }
 
 template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src) {
   call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
 }
 
 template <typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst,
-                                                                                                 const Src& src,
-                                                                                                 const Func& func) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src,
+                                                                                           const Func& func) {
   // TODO check whether this is the right place to perform these checks:
   EIGEN_STATIC_ASSERT_LVALUE(Dst)
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src)
@@ -877,8 +955,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_al
   Assignment<Dst, Src, Func>::run(dst, src, func);
 }
 template <typename Dst, typename Src>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst,
-                                                                                                 const Src& src) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) {
   call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
 }
 
@@ -891,15 +968,44 @@ EIGEN_DEVICE_FUNC void check_for_aliasing(const Dst& dst, const Src& src);
 // both partial specialization+SFINAE without ambiguous specialization
 template <typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak> {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(DstXprType& dst, const SrcXprType& src,
+                                                                  const Functor& func) {
 #ifndef EIGEN_NO_DEBUG
-    internal::check_for_aliasing(dst, src);
+    if (!internal::is_constant_evaluated()) {
+      internal::check_for_aliasing(dst, src);
+    }
 #endif
 
     call_dense_assignment_loop(dst, src, func);
   }
 };
 
+template <typename DstXprType, typename SrcPlainObject, typename Weak>
+struct Assignment<DstXprType, CwiseNullaryOp<scalar_constant_op<typename DstXprType::Scalar>, SrcPlainObject>,
+                  assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Dense2Dense, Weak> {
+  using Scalar = typename DstXprType::Scalar;
+  using NullaryOp = scalar_constant_op<Scalar>;
+  using SrcXprType = CwiseNullaryOp<NullaryOp, SrcPlainObject>;
+  using Functor = assign_op<Scalar, Scalar>;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const Functor& /*func*/) {
+    eigen_fill_impl<DstXprType>::run(dst, src);
+  }
+};
+
+template <typename DstXprType, typename SrcPlainObject, typename Weak>
+struct Assignment<DstXprType, CwiseNullaryOp<scalar_zero_op<typename DstXprType::Scalar>, SrcPlainObject>,
+                  assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Dense2Dense, Weak> {
+  using Scalar = typename DstXprType::Scalar;
+  using NullaryOp = scalar_zero_op<Scalar>;
+  using SrcXprType = CwiseNullaryOp<NullaryOp, SrcPlainObject>;
+  using Functor = assign_op<Scalar, Scalar>;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const Functor& /*func*/) {
+    eigen_zero_impl<DstXprType>::run(dst, src);
+  }
+};
+
 // Generic assignment through evalTo.
 // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
 // Note that the last template argument "Weak" is needed to make it possible to perform
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 5b566cd..ad11220 100644
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -89,7 +89,7 @@ class vml_assign_traits {
     static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE, EIGENTYPE> &func) { \
       resize_if_allowed(dst, src, func);                                                                   \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                  \
-      if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == LinearTraversal) {                     \
+      if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == (int)LinearTraversal) {                \
         VMLOP(dst.size(), (const VMLTYPE *)src.nestedExpression().data(),                                  \
               (VMLTYPE *)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                     \
       } else {                                                                                             \
diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h
index ca991ca..57b0322 100644
--- a/Eigen/src/Core/BandMatrix.h
+++ b/Eigen/src/Core/BandMatrix.h
@@ -200,16 +200,16 @@ class BandMatrix : public BandMatrixBase<BandMatrix<Scalar_, Rows, Cols, Supers,
       : m_coeffs(1 + supers + subs, cols), m_rows(rows), m_supers(supers), m_subs(subs) {}
 
   /** \returns the number of columns */
-  inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+  constexpr Index rows() const { return m_rows.value(); }
 
   /** \returns the number of rows */
-  inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+  constexpr Index cols() const { return m_coeffs.cols(); }
 
   /** \returns the number of super diagonals */
-  inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+  constexpr Index supers() const { return m_supers.value(); }
 
   /** \returns the number of sub diagonals */
-  inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+  constexpr Index subs() const { return m_subs.value(); }
 
   inline const CoefficientsType& coeffs() const { return m_coeffs; }
   inline CoefficientsType& coeffs() { return m_coeffs; }
@@ -260,16 +260,16 @@ class BandMatrixWrapper
   }
 
   /** \returns the number of columns */
-  inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+  constexpr Index rows() const { return m_rows.value(); }
 
   /** \returns the number of rows */
-  inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+  constexpr Index cols() const { return m_coeffs.cols(); }
 
   /** \returns the number of super diagonals */
-  inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+  constexpr Index supers() const { return m_supers.value(); }
 
   /** \returns the number of sub diagonals */
-  inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+  constexpr Index subs() const { return m_subs.value(); }
 
   inline const CoefficientsType& coeffs() const { return m_coeffs; }
 
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 9b16ed2..39abff7 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -278,7 +278,7 @@ class BlockImpl_dense : public internal::dense_xpr_base<Block<XprType, BlockRows
 
 #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \sa MapBase::data() */
-  EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const;
   EIGEN_DEVICE_FUNC inline Index innerStride() const;
   EIGEN_DEVICE_FUNC inline Index outerStride() const;
 #endif
@@ -289,13 +289,9 @@ class BlockImpl_dense : public internal::dense_xpr_base<Block<XprType, BlockRows
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT {
-    return m_startRow.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT {
-    return m_startCol.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
 
  protected:
   XprTypeNested m_xpr;
@@ -319,8 +315,7 @@ class BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel, true>
    * Adding an offset to nullptr is undefined behavior, so we must avoid it.
    */
   template <typename Scalar>
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base,
-                                                                                               Index offset) {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) {
     return base != nullptr ? base + offset : nullptr;
   }
 
@@ -378,30 +373,25 @@ class BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel, true>
     init();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const
-      EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const noexcept {
     return m_xpr;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
 
   /** \sa MapBase::innerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index innerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index innerStride() const noexcept {
     return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.innerStride() : m_xpr.outerStride();
   }
 
   /** \sa MapBase::outerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept {
     return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.outerStride() : m_xpr.innerStride();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startRow() const EIGEN_NOEXCEPT {
-    return m_startRow.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR StorageIndex startCol() const EIGEN_NOEXCEPT {
-    return m_startCol.value();
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
 
 #ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index c629123..c414117 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -92,7 +92,7 @@ struct CommaInitializer {
 
   EIGEN_DEVICE_FUNC inline ~CommaInitializer()
 #if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
-      EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
+      noexcept(false)  // Eigen::eigen_assert_exception
 #endif
   {
     finished();
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index c620600..60857e2 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -124,8 +124,7 @@ struct evaluator_base {
   // noncopyable:
   // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
   // and make complex evaluator much larger than then should do.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator_base() = default;
 
  private:
   EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
@@ -143,23 +142,23 @@ struct evaluator_base {
 template <typename Scalar, int OuterStride>
 class plainobjectbase_evaluator_data {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
       : data(ptr) {
 #ifndef EIGEN_INTERNAL_DEBUGGING
     EIGEN_UNUSED_VARIABLE(outerStride);
 #endif
     eigen_internal_assert(outerStride == OuterStride);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept { return OuterStride; }
   const Scalar* data;
 };
 
 template <typename Scalar>
 class plainobjectbase_evaluator_data<Scalar, Dynamic> {
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
       : data(ptr), m_outerStride(outerStride) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outerStride() const { return m_outerStride; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const { return m_outerStride; }
   const Scalar* data;
 
  protected:
@@ -167,7 +166,7 @@ class plainobjectbase_evaluator_data<Scalar, Dynamic> {
 };
 
 template <typename Derived>
-struct evaluator<PlainObjectBase<Derived> > : evaluator_base<Derived> {
+struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
   typedef PlainObjectBase<Derived> PlainObjectType;
   typedef typename PlainObjectType::Scalar Scalar;
   typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
@@ -189,89 +188,106 @@ struct evaluator<PlainObjectBase<Derived> > : evaluator_base<Derived> {
                                                      : RowsAtCompileTime
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() : m_d(0, OuterStrideAtCompileTime) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() : m_d(0, OuterStrideAtCompileTime) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const PlainObjectType& m)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const PlainObjectType& m)
       : m_d(m.data(), IsVectorAtCompileTime ? 0 : m.outerStride()) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
-    if (IsRowMajor)
-      return m_d.data[row * m_d.outerStride() + col];
-    else
-      return m_d.data[row + col * m_d.outerStride()];
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
+    return coeff(getIndex(row, col));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
-    if (IsRowMajor)
-      return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
-    else
-      return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
+    return coeffRef(getIndex(row, col));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return const_cast<Scalar*>(m_d.data)[index]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
+    return const_cast<Scalar*>(m_d.data)[index];
+  }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    if (IsRowMajor)
-      return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
-    else
-      return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return packet<LoadMode, PacketType>(getIndex(row, col));
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
     return ploadt<PacketType, LoadMode>(m_d.data + index);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
-    if (IsRowMajor)
-      return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
-    else
-      return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    writePacket<StoreMode, PacketType>(getIndex(row, col), x);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(getIndex(row, col), begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return ploadtSegment<PacketType, LoadMode>(m_d.data + index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    writePacketSegment<StoreMode, PacketType>(getIndex(row, col), x, begin, count);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
-    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    pstoretSegment<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x, begin, count);
   }
 
  protected:
   plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d;
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index constexpr getIndex(Index row, Index col) const {
+    return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride();
+  }
 };
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
-    : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > > {
+struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
+    : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
   typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m)
-      : evaluator<PlainObjectBase<XprType> >(m) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+      : evaluator<PlainObjectBase<XprType>>(m) {}
 };
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
-    : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > > {
+struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
+    : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
   typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& m)
-      : evaluator<PlainObjectBase<XprType> >(m) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+      : evaluator<PlainObjectBase<XprType>>(m) {}
 };
 
 // -------------------- Transpose --------------------
 
 template <typename ArgType>
-struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpose<ArgType> > {
+struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpose<ArgType>> {
   typedef Transpose<ArgType> XprType;
 
   enum {
@@ -298,25 +314,47 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpos
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     return m_argImpl.template packet<LoadMode, PacketType>(col, row);
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
     return m_argImpl.template packet<LoadMode, PacketType>(index);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
     m_argImpl.template writePacket<StoreMode, PacketType>(col, row, x);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
     m_argImpl.template writePacket<StoreMode, PacketType>(index, x);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(col, row, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode, PacketType>(col, row, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode, PacketType>(index, x, begin, count);
+  }
+
  protected:
   evaluator<ArgType> m_argImpl;
 };
@@ -404,7 +442,7 @@ struct nullary_wrapper<Scalar, NullaryOp, false, false, false> {};
 #if 0 && EIGEN_COMP_MSVC > 0
 // Disable this ugly workaround. This is now handled in traits<Ref>::match,
 // but this piece of code might still become handly if some other weird compilation
-// erros pop up again.
+// errors pop up again.
 
 // MSVC exhibits a weird compilation error when
 // compiling:
@@ -460,13 +498,13 @@ struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
 #endif  // MSVC workaround
 
 template <typename NullaryOp, typename PlainObjectType>
-struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType> >
-    : evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType> > {
+struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
+    : evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> {
   typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
-  typedef internal::remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
+  typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
 
   enum {
-    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+    CoeffReadCost = functor_traits<NullaryOp>::Cost,
 
     Flags = (evaluator<PlainObjectTypeCleaned>::Flags &
              (HereditaryBits | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0) |
@@ -492,24 +530,36 @@ struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType> >
   }
 
   template <int LoadMode, typename PacketType, typename IndexType>
-  EIGEN_STRONG_INLINE PacketType packet(IndexType row, IndexType col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(IndexType row, IndexType col) const {
     return m_wrapper.template packetOp<PacketType>(m_functor, row, col);
   }
 
   template <int LoadMode, typename PacketType, typename IndexType>
-  EIGEN_STRONG_INLINE PacketType packet(IndexType index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(IndexType index) const {
     return m_wrapper.template packetOp<PacketType>(m_functor, index);
   }
 
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/,
+                                                                 Index /*count*/) const {
+    return packet<LoadMode, PacketType, IndexType>(row, col);
+  }
+
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/,
+                                                                 Index /*count*/) const {
+    return packet<LoadMode, PacketType, IndexType>(index);
+  }
+
  protected:
   const NullaryOp m_functor;
-  const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
+  const nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
 };
 
 // -------------------- CwiseUnaryOp --------------------
 
 template <typename UnaryOp, typename ArgType>
-struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> > {
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType>> {
   typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
 
   enum {
@@ -536,15 +586,25 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_b
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(row, col));
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
     return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
@@ -599,16 +659,11 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
   template <typename DstPacketType>
   using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (8 * SrcPacketSize), bool>;
 
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index, Index col, Index packetSize) const {
-    return col + packetSize <= cols();
-  }
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index, Index packetSize) const {
-    return row + packetSize <= rows();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const {
+    return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows());
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index packetSize) const {
-    return index + packetSize <= size();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const {
+    return index + count + begin <= size();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const {
@@ -629,49 +684,94 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
   }
 
   template <int LoadMode, typename PacketType = SrcPacketType>
-  EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const {
     constexpr int PacketSize = unpacket_traits<PacketType>::size;
-    Index actualRow = IsRowMajor ? row : row + (offset * PacketSize);
-    Index actualCol = IsRowMajor ? col + (offset * PacketSize) : col;
-    eigen_assert(check_array_bounds(actualRow, actualCol, PacketSize) && "Array index out of bounds");
+    Index packetOffset = offset * PacketSize;
+    Index actualRow = IsRowMajor ? row : row + packetOffset;
+    Index actualCol = IsRowMajor ? col + packetOffset : col;
+    eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds");
     return m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol);
   }
   template <int LoadMode, typename PacketType = SrcPacketType>
-  EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const {
     constexpr int PacketSize = unpacket_traits<PacketType>::size;
-    Index actualIndex = index + (offset * PacketSize);
-    eigen_assert(check_array_bounds(actualIndex, PacketSize) && "Array index out of bounds");
+    Index packetOffset = offset * PacketSize;
+    Index actualIndex = index + packetOffset;
+    eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds");
     return m_argImpl.template packet<LoadMode, PacketType>(actualIndex);
   }
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count,
+                                                                    Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualRow = IsRowMajor ? row : row + packetOffset;
+    Index actualCol = IsRowMajor ? col + packetOffset : col;
+    eigen_assert(check_array_bounds(actualRow, actualCol, begin, count) && "Array index out of bounds");
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count);
+  }
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count,
+                                                                    Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualIndex = index + packetOffset;
+    eigen_assert(check_array_bounds(actualIndex, begin, count) && "Array index out of bounds");
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count);
+  }
+
+  template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index row, Index col,
+                                                                                                   Index begin,
+                                                                                                   Index count) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<PacketType, NumPackets> packets;
+    for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
+    Index offset = begin / SrcPacketSize;
+    Index actualBegin = begin % SrcPacketSize;
+    for (; offset < NumPackets; offset++) {
+      Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+      packets.packet[offset] = srcPacketSegment<SrcLoadMode>(row, col, actualBegin, actualCount, offset);
+      if (count == actualCount) break;
+      actualBegin = 0;
+      count -= actualCount;
+    }
+    return packets;
+  }
+  template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index index,
+                                                                                                   Index begin,
+                                                                                                   Index count) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<PacketType, NumPackets> packets;
+    for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
+    Index offset = begin / SrcPacketSize;
+    Index actualBegin = begin % SrcPacketSize;
+    for (; offset < NumPackets; offset++) {
+      Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+      packets.packet[offset] = srcPacketSegment<SrcLoadMode>(index, actualBegin, actualCount, offset);
+      if (count == actualCount) break;
+      actualBegin = 0;
+      count -= actualCount;
+    }
+    return packets;
+  }
 
   // There is no source packet type with equal or fewer elements than DstPacketType.
   // This is problematic as the evaluation loop may attempt to access data outside the bounds of the array.
   // For example, consider the cast utilizing pcast<Packet4f,Packet2d> with an array of size 4: {0.0f,1.0f,2.0f,3.0f}.
-  // The first iteration of the evaulation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which
+  // The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which
   // is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array.
-
-  // Instead, perform runtime check to determine if the load would access data outside the bounds of the array.
-  // If not, perform full load. Otherwise, revert to a scalar loop to perform a partial load.
-  // In either case, perform a vectorized cast of the source packet.
   template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
     constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
     constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
     constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
-    SrcPacketType src;
-    if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) {
-      src = srcPacket<SrcLoadMode>(row, col, 0);
-    } else {
-      Array<SrcType, SrcPacketSize, 1> srcArray;
-      for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k);
-      for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
-      src = pload<SrcPacketType>(srcArray.data());
-    }
-    return pcast<SrcPacketType, DstPacketType>(src);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, 0, DstPacketSize, 0));
   }
   // Use the source packet type with the same size as DstPacketType, if it exists
   template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
     constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
     using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
     constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
@@ -680,14 +780,14 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
   }
   // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
   template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
     constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
     return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(row, col, 0),
                                                srcPacket<SrcLoadMode>(row, col, 1));
   }
   // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
   template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
     constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
     return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(row, col, 0), srcPacket<SrcLoadMode>(row, col, 1),
                                                srcPacket<SrcLoadMode>(row, col, 2),
@@ -695,7 +795,7 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
   }
   // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
   template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
     constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
     return pcast<SrcPacketType, DstPacketType>(
         srcPacket<SrcLoadMode>(row, col, 0), srcPacket<SrcLoadMode>(row, col, 1), srcPacket<SrcLoadMode>(row, col, 2),
@@ -703,25 +803,70 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
         srcPacket<SrcLoadMode>(row, col, 6), srcPacket<SrcLoadMode>(row, col, 7));
   }
 
-  // Analagous routines for linear access.
+  // packetSegment variants
   template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
     constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
     constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
     constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
-    SrcPacketType src;
-    if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) {
-      src = srcPacket<SrcLoadMode>(index, 0);
-    } else {
-      Array<SrcType, SrcPacketSize, 1> srcArray;
-      for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k);
-      for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0);
-      src = pload<SrcPacketType>(srcArray.data());
-    }
-    return pcast<SrcPacketType, DstPacketType>(src);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, begin, count, 0));
+  }
+  // Use the source packet type with the same size as DstPacketType, if it exists
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(
+        srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(row, col, begin, count, 0));
+  }
+  // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 2;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+  }
+  // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 4;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3]);
+  }
+  // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 8;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3], packets.packet[4], packets.packet[5],
+                                               packets.packet[6], packets.packet[7]);
+  }
+
+  // Analogous routines for linear access.
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, 0, DstPacketSize, 0));
   }
   template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
     constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
     using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
     constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
@@ -729,18 +874,18 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
     return pcast<SizedSrcPacketType, DstPacketType>(srcPacket<SrcLoadMode, SizedSrcPacketType>(index, 0));
   }
   template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
     constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
     return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(index, 0), srcPacket<SrcLoadMode>(index, 1));
   }
   template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
     constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
     return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(index, 0), srcPacket<SrcLoadMode>(index, 1),
                                                srcPacket<SrcLoadMode>(index, 2), srcPacket<SrcLoadMode>(index, 3));
   }
   template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
-  EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
     constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
     return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(index, 0), srcPacket<SrcLoadMode>(index, 1),
                                                srcPacket<SrcLoadMode>(index, 2), srcPacket<SrcLoadMode>(index, 3),
@@ -748,6 +893,55 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
                                                srcPacket<SrcLoadMode>(index, 6), srcPacket<SrcLoadMode>(index, 7));
   }
 
+  // packetSegment variants
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, begin, count, 0));
+  }
+  // Use the source packet type with the same size as DstPacketType, if it exists
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(
+        srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(index, begin, count, 0));
+  }
+  // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 2;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+  }
+  // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 4;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3]);
+  }
+  // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 8;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3], packets.packet[4], packets.packet[5],
+                                               packets.packet[6], packets.packet[7]);
+  }
+
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; }
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; }
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; }
@@ -762,17 +956,17 @@ struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, In
 
 // this is a ternary expression
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
-struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
-    : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
+struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>>
+    : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> {
   typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
-  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
+  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> Base;
 
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
 struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
-    : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
+    : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> {
   typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
 
   enum {
@@ -812,19 +1006,33 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode, PacketType>(row, col),
                                m_d.arg2Impl.template packet<LoadMode, PacketType>(row, col),
                                m_d.arg3Impl.template packet<LoadMode, PacketType>(row, col));
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
     return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode, PacketType>(index),
                                m_d.arg2Impl.template packet<LoadMode, PacketType>(index),
                                m_d.arg3Impl.template packet<LoadMode, PacketType>(index));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
@@ -840,20 +1048,52 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
   Data m_d;
 };
 
+template <typename Arg1, typename Arg2, typename Scalar, typename CmpLhsType, typename CmpRhsType, ComparisonName cmp>
+struct scalar_boolean_select_spec {
+  using DummyTernaryOp = scalar_boolean_select_op<Scalar, Scalar, bool>;
+  using DummyArg3 = CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, false>, CmpLhsType, CmpRhsType>;
+  using DummyXprType = CwiseTernaryOp<DummyTernaryOp, Arg1, Arg2, DummyArg3>;
+
+  // only use the typed comparison if it is vectorized
+  static constexpr bool UseTyped = functor_traits<scalar_cmp_op<Scalar, Scalar, cmp, true>>::PacketAccess;
+  using CondScalar = std::conditional_t<UseTyped, Scalar, bool>;
+
+  using TernaryOp = scalar_boolean_select_op<Scalar, Scalar, CondScalar>;
+  using Arg3 = CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, UseTyped>, CmpLhsType, CmpRhsType>;
+  using XprType = CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>;
+
+  using Base = ternary_evaluator<XprType>;
+};
+
+// specialization for expressions like (a < b).select(c, d) to enable full vectorization
+template <typename Arg1, typename Arg2, typename Scalar, typename CmpLhsType, typename CmpRhsType, ComparisonName cmp>
+struct evaluator<CwiseTernaryOp<scalar_boolean_select_op<Scalar, Scalar, bool>, Arg1, Arg2,
+                                CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, false>, CmpLhsType, CmpRhsType>>>
+    : public scalar_boolean_select_spec<Arg1, Arg2, Scalar, CmpLhsType, CmpRhsType, cmp>::Base {
+  using Helper = scalar_boolean_select_spec<Arg1, Arg2, Scalar, CmpLhsType, CmpRhsType, cmp>;
+  using Base = typename Helper::Base;
+  using DummyXprType = typename Helper::DummyXprType;
+  using Arg3 = typename Helper::Arg3;
+  using XprType = typename Helper::XprType;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const DummyXprType& xpr)
+      : Base(XprType(xpr.arg1(), xpr.arg2(), Arg3(xpr.arg3().lhs(), xpr.arg3().rhs()))) {}
+};
+
 // -------------------- CwiseBinaryOp --------------------
 
 // this is a binary expression
 template <typename BinaryOp, typename Lhs, typename Rhs>
-struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > {
+struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> {
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
-  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
+  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> Base;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 template <typename BinaryOp, typename Lhs, typename Rhs>
 struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
-    : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > {
+    : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> {
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
 
   enum {
@@ -889,17 +1129,29 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode, PacketType>(row, col),
                                m_d.rhsImpl.template packet<LoadMode, PacketType>(row, col));
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
     return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode, PacketType>(index),
                                m_d.rhsImpl.template packet<LoadMode, PacketType>(index));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
  protected:
   // this helper permits to completely eliminate the functor if it is empty
   struct Data {
@@ -918,7 +1170,7 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
 
 template <typename UnaryOp, typename ArgType, typename StrideType>
 struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType, StrideType>, IndexBased>
-    : evaluator_base<CwiseUnaryView<UnaryOp, ArgType, StrideType> > {
+    : evaluator_base<CwiseUnaryView<UnaryOp, ArgType, StrideType>> {
   typedef CwiseUnaryView<UnaryOp, ArgType, StrideType> XprType;
 
   enum {
@@ -991,7 +1243,7 @@ struct mapbase_evaluator : evaluator_base<Derived> {
         m_innerStride(map.innerStride()),
         m_outerStride(map.outerStride()) {
     EIGEN_STATIC_ASSERT(check_implication((evaluator<Derived>::Flags & PacketAccessBit) != 0,
-                                          internal::inner_stride_at_compile_time<Derived>::ret == 1),
+                                          inner_stride_at_compile_time<Derived>::ret == 1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -1011,42 +1263,66 @@ struct mapbase_evaluator : evaluator_base<Derived> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_data[index * m_innerStride.value()]; }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     PointerType ptr = m_data + row * rowStride() + col * colStride();
-    return internal::ploadt<PacketType, LoadMode>(ptr);
+    return ploadt<PacketType, LoadMode>(ptr);
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
     PointerType ptr = m_data + row * rowStride() + col * colStride();
-    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+    pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return ploadtSegment<PacketType, LoadMode>(ptr, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return ploadtSegment<PacketType, LoadMode>(m_data + index * m_innerStride.value(), begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    pstoretSegment<Scalar, PacketType, StoreMode>(ptr, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    pstoretSegment<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x, begin, count);
   }
 
  protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowStride() const noexcept {
     return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colStride() const noexcept {
     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
   }
 
   PointerType m_data;
-  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
-  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
+  const variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
 template <typename PlainObjectType, int MapOptions, typename StrideType>
-struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
+struct evaluator<Map<PlainObjectType, MapOptions, StrideType>>
     : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType> {
   typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
   typedef typename XprType::Scalar Scalar;
@@ -1079,13 +1355,13 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
 // -------------------- Ref --------------------
 
 template <typename PlainObjectType, int RefOptions, typename StrideType>
-struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
+struct evaluator<Ref<PlainObjectType, RefOptions, StrideType>>
     : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType> {
   typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
 
   enum {
-    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
-    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType>>::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType>>::Alignment
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& ref)
@@ -1095,11 +1371,11 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
 // -------------------- Block --------------------
 
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
-          bool HasDirectAccess = internal::has_direct_access<ArgType>::ret>
+          bool HasDirectAccess = has_direct_access<ArgType>::ret>
 struct block_evaluator;
 
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
-struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>>
     : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
   typedef typename XprType::Scalar Scalar;
@@ -1150,7 +1426,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
 // no direct-access => dispatch to a unary evaluator
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
-    : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> > {
+    : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>> {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit block_evaluator(const XprType& block)
@@ -1159,7 +1435,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
 
 template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
-    : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> > {
+    : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel>> {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& block)
@@ -1198,12 +1474,12 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     return m_argImpl.template packet<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col);
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
     if (ForwardLinearAccess)
       return m_argImpl.template packet<LoadMode, PacketType>(m_linear_offset.value() + index);
     else
@@ -1211,12 +1487,12 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
     return m_argImpl.template writePacket<StoreMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
     if (ForwardLinearAccess)
       return m_argImpl.template writePacket<StoreMode, PacketType>(m_linear_offset.value() + index, x);
     else
@@ -1224,6 +1500,39 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
                                                 x);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col,
+                                                                  begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    if (ForwardLinearAccess)
+      return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
+    else
+      return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
+                                                 begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_startRow.value() + row,
+                                                                        m_startCol.value() + col, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
+                                                                          count);
+    else
+      return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                                       RowsAtCompileTime == 1 ? index : 0, x, begin, count);
+  }
+
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
   linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const {
@@ -1267,60 +1576,16 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
   }
 };
 
-// -------------------- Select --------------------
-// NOTE shall we introduce a ternary_evaluator?
-
-// TODO enable vectorization for Select
-template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
-    : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > {
-  typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
-  enum {
-    CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost +
-                    plain_enum_max(evaluator<ThenMatrixType>::CoeffReadCost, evaluator<ElseMatrixType>::CoeffReadCost),
-
-    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
-
-    Alignment = plain_enum_min(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& select)
-      : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) {
-    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-  }
-
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
-    if (m_conditionImpl.coeff(row, col))
-      return m_thenImpl.coeff(row, col);
-    else
-      return m_elseImpl.coeff(row, col);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    if (m_conditionImpl.coeff(index))
-      return m_thenImpl.coeff(index);
-    else
-      return m_elseImpl.coeff(index);
-  }
-
- protected:
-  evaluator<ConditionMatrixType> m_conditionImpl;
-  evaluator<ThenMatrixType> m_thenImpl;
-  evaluator<ElseMatrixType> m_elseImpl;
-};
-
 // -------------------- Replicate --------------------
 
 template <typename ArgType, int RowFactor, int ColFactor>
-struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
-    : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> > {
+struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
+    : evaluator_base<Replicate<ArgType, RowFactor, ColFactor>> {
   typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor };
-  typedef typename internal::nested_eval<ArgType, Factor>::type ArgTypeNested;
-  typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
+  typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested;
+  typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
 
   enum {
     CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
@@ -1339,19 +1604,15 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     // try to avoid using modulo; this is a pure optimization strategy
-    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
-                             : RowFactor == 1                                  ? row
-                                                                               : row % m_rows.value();
-    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
-                             : ColFactor == 1                                  ? col
-                                                                               : col % m_cols.value();
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
 
     return m_argImpl.coeff(actual_row, actual_col);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     // try to avoid using modulo; this is a pure optimization strategy
-    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
                                    ? (ColFactor == 1 ? index : index % m_cols.value())
                                    : (RowFactor == 1 ? index : index % m_rows.value());
 
@@ -1359,26 +1620,39 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime == 1 ? 0
-                             : RowFactor == 1                                  ? row
-                                                                               : row % m_rows.value();
-    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime == 1 ? 0
-                             : ColFactor == 1                                  ? col
-                                                                               : col % m_cols.value();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
 
     return m_argImpl.template packet<LoadMode, PacketType>(actual_row, actual_col);
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
-    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime == 1
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
                                    ? (ColFactor == 1 ? index : index % m_cols.value())
                                    : (RowFactor == 1 ? index : index % m_rows.value());
 
     return m_argImpl.template packet<LoadMode, PacketType>(actual_index);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
+
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_row, actual_col, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
+                                   ? (ColFactor == 1 ? index : index % m_cols.value())
+                                   : (RowFactor == 1 ? index : index % m_rows.value());
+
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_index, begin, count);
+  }
+
  protected:
   const ArgTypeNested m_arg;
   evaluator<ArgTypeNestedCleaned> m_argImpl;
@@ -1416,43 +1690,65 @@ struct evaluator_wrapper_base : evaluator_base<XprType> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     return m_argImpl.template packet<LoadMode, PacketType>(row, col);
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
     return m_argImpl.template packet<LoadMode, PacketType>(index);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
     m_argImpl.template writePacket<StoreMode>(row, col, x);
   }
 
   template <int StoreMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
     m_argImpl.template writePacket<StoreMode>(index, x);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode>(row, col, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode>(index, x, begin, count);
+  }
+
  protected:
   evaluator<ArgType> m_argImpl;
 };
 
 template <typename TArgType>
-struct unary_evaluator<MatrixWrapper<TArgType> > : evaluator_wrapper_base<MatrixWrapper<TArgType> > {
+struct unary_evaluator<MatrixWrapper<TArgType>> : evaluator_wrapper_base<MatrixWrapper<TArgType>> {
   typedef MatrixWrapper<TArgType> XprType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
-      : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression()) {}
+      : evaluator_wrapper_base<MatrixWrapper<TArgType>>(wrapper.nestedExpression()) {}
 };
 
 template <typename TArgType>
-struct unary_evaluator<ArrayWrapper<TArgType> > : evaluator_wrapper_base<ArrayWrapper<TArgType> > {
+struct unary_evaluator<ArrayWrapper<TArgType>> : evaluator_wrapper_base<ArrayWrapper<TArgType>> {
   typedef ArrayWrapper<TArgType> XprType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
-      : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression()) {}
+      : evaluator_wrapper_base<ArrayWrapper<TArgType>>(wrapper.nestedExpression()) {}
 };
 
 // -------------------- Reverse --------------------
@@ -1462,7 +1758,7 @@ template <typename PacketType, bool ReversePacket>
 struct reverse_packet_cond;
 
 template <typename ArgType, int Direction>
-struct unary_evaluator<Reverse<ArgType, Direction> > : evaluator_base<Reverse<ArgType, Direction> > {
+struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<ArgType, Direction>> {
   typedef Reverse<ArgType, Direction> XprType;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -1513,42 +1809,98 @@ struct unary_evaluator<Reverse<ArgType, Direction> > : evaluator_base<Reverse<Ar
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    enum {
-      PacketSize = unpacket_traits<PacketType>::size,
-      OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
-      OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
-    };
-    typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
-    return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(
-        ReverseRow ? m_rows.value() - row - OffsetRow : row, ReverseCol ? m_cols.value() - col - OffsetCol : col));
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+    return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol));
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
-    enum { PacketSize = unpacket_traits<PacketType>::size };
-    return preverse(
-        m_argImpl.template packet<LoadMode, PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+    return preverse(m_argImpl.template packet<LoadMode, PacketType>(actualIndex));
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
-    // FIXME we could factorize some code with packet(i,j)
-    enum {
-      PacketSize = unpacket_traits<PacketType>::size,
-      OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
-      OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1
-    };
-    typedef internal::reverse_packet_cond<PacketType, ReversePacket> reverse_packet;
-    m_argImpl.template writePacket<LoadMode>(ReverseRow ? m_rows.value() - row - OffsetRow : row,
-                                             ReverseCol ? m_cols.value() - col - OffsetCol : col,
-                                             reverse_packet::run(x));
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+    m_argImpl.template writePacket<LoadMode>(actualRow, actualCol, reverse_packet::run(x));
   }
 
   template <int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
-    enum { PacketSize = unpacket_traits<PacketType>::size };
-    m_argImpl.template writePacket<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+    m_argImpl.template writePacket<LoadMode>(actualIndex, preverse(x));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+    Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+    return reverse_packet::run(
+        m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, actualBegin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+    Index actualBegin = PacketSize - count - begin;
+
+    return preverse(m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, actualBegin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+    Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+    m_argImpl.template writePacketSegment<LoadMode>(actualRow, actualCol, reverse_packet::run(x), actualBegin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+    Index actualBegin = PacketSize - count - begin;
+
+    m_argImpl.template writePacketSegment<LoadMode>(actualIndex, preverse(x), actualBegin, count);
   }
 
  protected:
@@ -1563,7 +1915,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> > : evaluator_base<Reverse<Ar
 // -------------------- Diagonal --------------------
 
 template <typename ArgType, int DiagIndex>
-struct evaluator<Diagonal<ArgType, DiagIndex> > : evaluator_base<Diagonal<ArgType, DiagIndex> > {
+struct evaluator<Diagonal<ArgType, DiagIndex>> : evaluator_base<Diagonal<ArgType, DiagIndex>> {
   typedef Diagonal<ArgType, DiagIndex> XprType;
 
   enum {
@@ -1599,13 +1951,13 @@ struct evaluator<Diagonal<ArgType, DiagIndex> > : evaluator_base<Diagonal<ArgTyp
 
  protected:
   evaluator<ArgType> m_argImpl;
-  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+  const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const {
     return m_index.value() > 0 ? 0 : -m_index.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const {
     return m_index.value() > 0 ? m_index.value() : 0;
   }
 };
@@ -1622,10 +1974,10 @@ template <typename ArgType>
 class EvalToTemp;
 
 template <typename ArgType>
-struct traits<EvalToTemp<ArgType> > : public traits<ArgType> {};
+struct traits<EvalToTemp<ArgType>> : public traits<ArgType> {};
 
 template <typename ArgType>
-class EvalToTemp : public dense_xpr_base<EvalToTemp<ArgType> >::type {
+class EvalToTemp : public dense_xpr_base<EvalToTemp<ArgType>>::type {
  public:
   typedef typename dense_xpr_base<EvalToTemp>::type Base;
   EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
@@ -1634,16 +1986,16 @@ class EvalToTemp : public dense_xpr_base<EvalToTemp<ArgType> >::type {
 
   const ArgType& arg() const { return m_arg; }
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_arg.rows(); }
+  constexpr Index rows() const noexcept { return m_arg.rows(); }
 
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_arg.cols(); }
+  constexpr Index cols() const noexcept { return m_arg.cols(); }
 
  private:
   const ArgType& m_arg;
 };
 
 template <typename ArgType>
-struct evaluator<EvalToTemp<ArgType> > : public evaluator<typename ArgType::PlainObject> {
+struct evaluator<EvalToTemp<ArgType>> : public evaluator<typename ArgType::PlainObject> {
   typedef EvalToTemp<ArgType> XprType;
   typedef typename ArgType::PlainObject PlainObject;
   typedef evaluator<PlainObject> Base;
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index aa79b60..e2b2da5 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -108,12 +108,12 @@ class CwiseBinaryOp : public CwiseBinaryOpImpl<BinaryOp, LhsType, RhsType,
     eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept {
     // return the fixed size type if available to enable compile time optimizations
     return internal::traits<internal::remove_all_t<LhsNested>>::RowsAtCompileTime == Dynamic ? m_rhs.rows()
                                                                                              : m_lhs.rows();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept {
     // return the fixed size type if available to enable compile time optimizations
     return internal::traits<internal::remove_all_t<LhsNested>>::ColsAtCompileTime == Dynamic ? m_rhs.cols()
                                                                                              : m_lhs.cols();
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 39c33cf..e4c5fed 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -71,9 +71,13 @@ class CwiseNullaryOp : public internal::dense_xpr_base<CwiseNullaryOp<NullaryOp,
     eigen_assert(rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) && cols >= 0 &&
                  (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
   }
+  EIGEN_DEVICE_FUNC CwiseNullaryOp(Index size, const NullaryOp& func = NullaryOp())
+      : CwiseNullaryOp(RowsAtCompileTime == 1 ? 1 : size, RowsAtCompileTime == 1 ? size : 1, func) {
+    EIGEN_STATIC_ASSERT(CwiseNullaryOp::IsVectorAtCompileTime, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const { return m_cols.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols.value(); }
 
   /** \returns the functor representing the nullary operation */
   EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; }
@@ -231,8 +235,7 @@ DenseBase<Derived>::Constant(const Scalar& value) {
  * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
  */
 template <typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<
-    Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low, high, size));
@@ -243,8 +246,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
  * \sa LinSpaced(const Scalar&, const Scalar&)
  */
 template <typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<
-    Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -283,7 +285,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 }
 
 /**
- * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
+ * \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&)
  * Special version for fixed size types which does not require the size parameter.
  */
 template <typename Derived>
@@ -343,7 +345,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar
  */
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val) {
-  return derived() = Constant(rows(), cols(), val);
+  internal::eigen_fill_impl<Derived>::run(derived(), val);
+  return derived();
 }
 
 /** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val.
@@ -479,9 +482,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setEqualSpace
  * \sa Zero(), Zero(Index)
  */
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType DenseBase<Derived>::Zero(
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ZeroReturnType DenseBase<Derived>::Zero(
     Index rows, Index cols) {
-  return Constant(rows, cols, Scalar(0));
+  return ZeroReturnType(rows, cols);
 }
 
 /** \returns an expression of a zero vector.
@@ -501,9 +504,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::Constan
  * \sa Zero(), Zero(Index,Index)
  */
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType DenseBase<Derived>::Zero(
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ZeroReturnType DenseBase<Derived>::Zero(
     Index size) {
-  return Constant(size, Scalar(0));
+  return ZeroReturnType(size);
 }
 
 /** \returns an expression of a fixed-size zero matrix or vector.
@@ -517,8 +520,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::Constan
  * \sa Zero(Index), Zero(Index,Index)
  */
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType DenseBase<Derived>::Zero() {
-  return Constant(Scalar(0));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ZeroReturnType DenseBase<Derived>::Zero() {
+  return ZeroReturnType(RowsAtCompileTime, ColsAtCompileTime);
 }
 
 /** \returns true if *this is approximately equal to the zero matrix,
@@ -547,7 +550,8 @@ EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
  */
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero() {
-  return setConstant(Scalar(0));
+  internal::eigen_zero_impl<Derived>::run(derived());
+  return derived();
 }
 
 /** Resizes to the given \a size, and sets all coefficients in this expression to zero.
@@ -562,7 +566,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero() {
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(Index newSize) {
   resize(newSize);
-  return setConstant(Scalar(0));
+  return setZero();
 }
 
 /** Resizes to the given size, and sets all coefficients in this expression to zero.
@@ -578,7 +582,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(Index rows, Index cols) {
   resize(rows, cols);
-  return setConstant(Scalar(0));
+  return setZero();
 }
 
 /** Resizes to the given size, changing only the number of columns, and sets all
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 42ed459..94ec1a0 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -60,8 +60,8 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_xpr.cols(); }
 
   /** \returns the functor representing the unary operation */
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 49b1410..7dd7623 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -66,13 +66,13 @@ class CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType, Dense, false>
 
   EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeffRef(0)); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
     return StrideType::InnerStrideAtCompileTime != 0 ? int(StrideType::InnerStrideAtCompileTime)
                                                      : derived().nestedExpression().innerStride() *
                                                            sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return StrideType::OuterStrideAtCompileTime != 0 ? int(StrideType::OuterStrideAtCompileTime)
                                                      : derived().nestedExpression().outerStride() *
                                                            sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
@@ -145,8 +145,8 @@ class CwiseUnaryView : public internal::CwiseUnaryViewImpl<ViewOp, MatrixType, S
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** \returns the functor representing unary operation */
   EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 5ab54ef..c81e1d1 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -208,7 +208,7 @@ class DenseBase
    * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
    * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
    * column-major matrix, and the number of rows for a row-major matrix. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerSize() const {
     return IsVectorAtCompileTime ? 1 : int(IsRowMajor) ? this->rows() : this->cols();
   }
 
@@ -217,7 +217,7 @@ class DenseBase
    * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
    * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
    * column-major matrix, and the number of columns for a row-major matrix. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerSize() const {
     return IsVectorAtCompileTime ? this->size() : int(IsRowMajor) ? this->cols() : this->rows();
   }
 
@@ -243,6 +243,8 @@ class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   /** \internal Represents a matrix with all coefficients equal to one another*/
   typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> ConstantReturnType;
+  /** \internal Represents a matrix with all coefficients equal to zero*/
+  typedef CwiseNullaryOp<internal::scalar_zero_op<Scalar>, PlainObject> ZeroReturnType;
   /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
   EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>, PlainObject> SequentialLinSpacedReturnType;
   /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
@@ -304,12 +306,12 @@ class DenseBase
   EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(Index size, const Scalar& value);
   EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(const Scalar& value);
 
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size,
-                                                                                            const Scalar& low,
-                                                                                            const Scalar& high);
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t,
-                                                                                            const Scalar& low,
-                                                                                            const Scalar& high);
+  EIGEN_DEPRECATED_WITH_REASON("The method may result in accuracy loss. Use .EqualSpaced() instead.")
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size, const Scalar& low,
+                                                                           const Scalar& high);
+  EIGEN_DEPRECATED_WITH_REASON("The method may result in accuracy loss. Use .EqualSpaced() instead.")
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, const Scalar& low,
+                                                                           const Scalar& high);
 
   EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Index size, const Scalar& low,
                                                                            const Scalar& high);
@@ -328,9 +330,9 @@ class DenseBase
   template <typename CustomNullaryOp>
   EIGEN_DEVICE_FUNC static const CwiseNullaryOp<CustomNullaryOp, PlainObject> NullaryExpr(const CustomNullaryOp& func);
 
-  EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
-  EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size);
-  EIGEN_DEVICE_FUNC static const ConstantReturnType Zero();
+  EIGEN_DEVICE_FUNC static const ZeroReturnType Zero(Index rows, Index cols);
+  EIGEN_DEVICE_FUNC static const ZeroReturnType Zero(Index size);
+  EIGEN_DEVICE_FUNC static const ZeroReturnType Zero();
   EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
   EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
   EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();
@@ -365,7 +367,12 @@ class DenseBase
   EIGEN_DEVICE_FUNC inline bool allFinite() const;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other);
+  template <bool Enable = !internal::is_same<Scalar, RealScalar>::value, typename = std::enable_if_t<Enable>>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const RealScalar& other);
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other);
+  template <bool Enable = !internal::is_same<Scalar, RealScalar>::value, typename = std::enable_if_t<Enable>>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const RealScalar& other);
 
   typedef internal::add_const_on_value_type_t<typename internal::eval<Derived>::type> EvalReturnType;
   /** \returns the matrix or vector obtained by evaluating this expression.
@@ -595,6 +602,13 @@ class DenseBase
   inline const_iterator end() const;
   inline const_iterator cend() const;
 
+  using RealViewReturnType = std::conditional_t<NumTraits<Scalar>::IsComplex, RealView<Derived>, Derived&>;
+  using ConstRealViewReturnType =
+      std::conditional_t<NumTraits<Scalar>::IsComplex, RealView<const Derived>, const Derived&>;
+
+  EIGEN_DEVICE_FUNC RealViewReturnType realView();
+  EIGEN_DEVICE_FUNC ConstRealViewReturnType realView() const;
+
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
 #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
@@ -621,17 +635,19 @@ class DenseBase
  protected:
   EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
   /** Default constructor. Do nothing. */
+#ifdef EIGEN_INTERNAL_DEBUGGING
   EIGEN_DEVICE_FUNC constexpr DenseBase() {
     /* Just checks for self-consistency of the flags.
      * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
      */
-#ifdef EIGEN_INTERNAL_DEBUGGING
     EIGEN_STATIC_ASSERT(
         (internal::check_implication(MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1, int(IsRowMajor)) &&
          internal::check_implication(MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1, int(!IsRowMajor))),
         INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION)
-#endif
   }
+#else
+  EIGEN_DEVICE_FUNC constexpr DenseBase() = default;
+#endif
 
  private:
   EIGEN_DEVICE_FUNC explicit DenseBase(int);
@@ -640,6 +656,18 @@ class DenseBase
   EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
 };
 
+/** Free-function swap.
+ */
+template <typename DerivedA, typename DerivedB>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    // Use forwarding references to capture all combinations of cv-qualified l+r-value cases.
+    std::enable_if_t<std::is_base_of<DenseBase<std::decay_t<DerivedA>>, std::decay_t<DerivedA>>::value &&
+                         std::is_base_of<DenseBase<std::decay_t<DerivedB>>, std::decay_t<DerivedB>>::value,
+                     void>
+    swap(DerivedA&& a, DerivedB&& b) {
+  a.swap(b);
+}
+
 }  // end namespace Eigen
 
 #endif  // EIGEN_DENSEBASE_H
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 48c6d73..377df57 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -45,10 +45,16 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
   // - This is the return type of the coeff() method.
   // - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
   // to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
+  // - The DirectAccessBit means exactly that the underlying data of coefficients can be directly accessed as a plain
+  // strided array, which means exactly that the underlying data of coefficients does exist in memory, which means
+  // exactly that the coefficients are const-referenceable, which means exactly that we can have coeff() return a
+  // const reference. For example, Map<const Matrix> has DirectAccessBit but not LvalueBit, so
+  // Map<const Matrix>::coeff() returns a const Scalar& that exists in memory, but coeffRef() is not available as it
+  // would not provide an lvalue. Note that DirectAccessBit and LvalueBit are mutually orthogonal.
   // - The is_arithmetic check is required since "const int", "const double", etc. will cause warnings on some systems
   // while the declaration of "const T", where T is a non arithmetic type does not. Always returning "const Scalar&" is
   // not possible, since the underlying expressions might not offer a valid address the reference could be referring to.
-  typedef std::conditional_t<bool(internal::traits<Derived>::Flags& LvalueBit), const Scalar&,
+  typedef std::conditional_t<bool(internal::traits<Derived>::Flags&(LvalueBit | DirectAccessBit)), const Scalar&,
                              std::conditional_t<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>>
       CoeffReturnType;
 
@@ -89,12 +95,12 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    *
    * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
     eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return internal::evaluator<Derived>(derived()).coeff(row, col);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
     return coeff(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
   }
 
@@ -102,7 +108,7 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    *
    * \sa operator()(Index,Index), operator[](Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index row, Index col) const {
     eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return coeff(row, col);
   }
@@ -122,7 +128,7 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const {
     EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
                         THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
     eigen_internal_assert(index >= 0 && index < size());
@@ -137,7 +143,7 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    * z() const, w() const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType operator[](Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator[](Index index) const {
     EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                         THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
     eigen_assert(index >= 0 && index < size());
@@ -154,32 +160,32 @@ class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
    * z() const, w() const
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType operator()(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index index) const {
     eigen_assert(index >= 0 && index < size());
     return coeff(index);
   }
 
   /** equivalent to operator[](0).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType x() const { return (*this)[0]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType x() const { return (*this)[0]; }
 
   /** equivalent to operator[](1).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType y() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType y() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
     return (*this)[1];
   }
 
   /** equivalent to operator[](2).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType z() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType z() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
     return (*this)[2];
   }
 
   /** equivalent to operator[](3).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType w() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType w() const {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
     return (*this)[3];
   }
@@ -297,7 +303,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    *
    * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
     eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return internal::evaluator<Derived>(derived()).coeffRef(row, col);
   }
@@ -311,7 +317,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    * \sa operator[](Index)
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index row, Index col) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index row, Index col) {
     eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
     return coeffRef(row, col);
   }
@@ -331,7 +337,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
     EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
                         THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
     eigen_internal_assert(index >= 0 && index < size());
@@ -345,7 +351,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator[](Index index) {
     EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                         THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
     eigen_assert(index >= 0 && index < size());
@@ -361,32 +367,32 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
    */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index index) {
     eigen_assert(index >= 0 && index < size());
     return coeffRef(index);
   }
 
   /** equivalent to operator[](0).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& x() { return (*this)[0]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& x() { return (*this)[0]; }
 
   /** equivalent to operator[](1).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& y() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& y() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
     return (*this)[1];
   }
 
   /** equivalent to operator[](2).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& z() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& z() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
     return (*this)[2];
   }
 
   /** equivalent to operator[](3).  */
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& w() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& w() {
     EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
     return (*this)[3];
   }
@@ -420,33 +426,29 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
    *
    * \sa outerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return derived().innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return derived().innerStride(); }
 
   /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
    *          in a column-major matrix).
    *
    * \sa innerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return derived().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return derived().outerStride(); }
 
   // FIXME shall we remove it ?
-  EIGEN_CONSTEXPR inline Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
+  constexpr Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
 
   /** \returns the pointer increment between two consecutive rows.
    *
    * \sa innerStride(), outerStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const {
-    return Derived::IsRowMajor ? outerStride() : innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rowStride() const { return Derived::IsRowMajor ? outerStride() : innerStride(); }
 
   /** \returns the pointer increment between two consecutive columns.
    *
    * \sa innerStride(), outerStride(), rowStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const {
-    return Derived::IsRowMajor ? innerStride() : outerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index colStride() const { return Derived::IsRowMajor ? innerStride() : outerStride(); }
 };
 
 /** \brief Base class providing direct read/write coefficient access to matrices and arrays.
@@ -477,25 +479,23 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors> : public DenseCoeffsBase<De
    *
    * \sa outerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
 
   /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
    *          in a column-major matrix).
    *
    * \sa innerStride(), rowStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
 
   // FIXME shall we remove it ?
-  EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT {
-    return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
-  }
+  constexpr Index stride() const noexcept { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
 
   /** \returns the pointer increment between two consecutive rows.
    *
    * \sa innerStride(), outerStride(), colStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rowStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rowStride() const noexcept {
     return Derived::IsRowMajor ? outerStride() : innerStride();
   }
 
@@ -503,7 +503,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors> : public DenseCoeffsBase<De
    *
    * \sa innerStride(), outerStride(), rowStride()
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index colStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index colStride() const noexcept {
     return Derived::IsRowMajor ? innerStride() : outerStride();
   }
 };
@@ -512,7 +512,7 @@ namespace internal {
 
 template <int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl {
-  static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT { return 0; }
+  static constexpr Index run(const Derived&) noexcept { return 0; }
 };
 
 template <int Alignment, typename Derived>
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index f616939..d62586c 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -27,622 +27,550 @@ namespace Eigen {
 
 namespace internal {
 
-struct constructor_without_unaligned_array_assert {};
+#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(Alignment)
+#else  // assertion enabled: expands to an eigen_assert on the address of the in-scope `array`
+#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(Alignment)                                        \
+  eigen_assert((is_constant_evaluated() || (std::uintptr_t(array) % Alignment == 0)) &&     \
+               "this assertion is explained here: "                                         \
+               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
+               " **** READ THIS WEB PAGE !!! ****");
+#endif  // EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
 
-template <typename T, int Size>
-EIGEN_DEVICE_FUNC constexpr void check_static_allocation_size() {
-// if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
 #if EIGEN_STACK_ALLOCATION_LIMIT
-  EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+#define EIGEN_MAKE_STACK_ALLOCATION_ASSERT(X) \
+  EIGEN_STATIC_ASSERT(X <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG)
+#else
+#define EIGEN_MAKE_STACK_ALLOCATION_ASSERT(X)
 #endif
-}
 
 /** \internal
  * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
  * to 16 bytes boundary if the total size is a multiple of 16 bytes.
  */
+
 template <typename T, int Size, int MatrixOrArrayOptions,
           int Alignment = (MatrixOrArrayOptions & DontAlign) ? 0 : compute_default_alignment<T, Size>::value>
 struct plain_array {
-  T array[Size];
-
-  EIGEN_DEVICE_FUNC constexpr plain_array() { check_static_allocation_size<T, Size>(); }
-
-  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
-    check_static_allocation_size<T, Size>();
-  }
-};
-
-#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
-#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
+  EIGEN_ALIGN_TO_BOUNDARY(Alignment) T array[Size];
+#if defined(EIGEN_NO_DEBUG) || defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
 #else
-#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)                                                \
-  eigen_assert((internal::is_constant_evaluated() || (std::uintptr_t(array) & (sizemask)) == 0) && \
-               "this assertion is explained here: "                                                \
-               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html"        \
-               " **** READ THIS WEB PAGE !!! ****");
-#endif
-
-template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 8> {
-  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
-
-  EIGEN_DEVICE_FUNC constexpr plain_array() {
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
-    check_static_allocation_size<T, Size>();
-  }
-
-  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
-    check_static_allocation_size<T, Size>();
-  }
-};
-
-template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 16> {
-  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
-
-  EIGEN_DEVICE_FUNC constexpr plain_array() {
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
-    check_static_allocation_size<T, Size>();
-  }
-
-  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
-    check_static_allocation_size<T, Size>();
-  }
-};
-
-template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 32> {
-  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
-
-  EIGEN_DEVICE_FUNC constexpr plain_array() {
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
-    check_static_allocation_size<T, Size>();
-  }
-
-  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
-    check_static_allocation_size<T, Size>();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(Alignment)
+    EIGEN_MAKE_STACK_ALLOCATION_ASSERT(Size * sizeof(T))
   }
+#endif
 };
 
 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 64> {
-  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
-
-  EIGEN_DEVICE_FUNC constexpr plain_array() {
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
-    check_static_allocation_size<T, Size>();
-  }
-
-  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {
-    check_static_allocation_size<T, Size>();
-  }
+struct plain_array<T, Size, MatrixOrArrayOptions, 0> {
+  T array[Size];
+#if defined(EIGEN_NO_DEBUG) || defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
+#else
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() { EIGEN_MAKE_STACK_ALLOCATION_ASSERT(Size * sizeof(T)) }
+#endif
 };
 
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment> {
   T array[1];
-  EIGEN_DEVICE_FUNC constexpr plain_array() {}
-  EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
 };
 
-struct plain_array_helper {
-  template <typename T, int Size, int MatrixOrArrayOptions, int Alignment>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void copy(
-      const plain_array<T, Size, MatrixOrArrayOptions, Alignment>& src, const Eigen::Index size,
-      plain_array<T, Size, MatrixOrArrayOptions, Alignment>& dst) {
-    smart_copy(src.array, src.array + size, dst.array);
-  }
-
-  template <typename T, int Size, int MatrixOrArrayOptions, int Alignment>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void swap(plain_array<T, Size, MatrixOrArrayOptions, Alignment>& a,
-                                                         const Eigen::Index a_size,
-                                                         plain_array<T, Size, MatrixOrArrayOptions, Alignment>& b,
-                                                         const Eigen::Index b_size) {
-    if (a_size < b_size) {
-      std::swap_ranges(b.array, b.array + a_size, a.array);
-      smart_move(b.array + a_size, b.array + b_size, a.array + a_size);
-    } else if (a_size > b_size) {
-      std::swap_ranges(a.array, a.array + b_size, b.array);
-      smart_move(a.array + b_size, a.array + a_size, b.array + b_size);
-    } else {
-      std::swap_ranges(a.array, a.array + a_size, b.array);
-    }
-  }
-};
-
-}  // end namespace internal
-
-/** \internal
- *
- * \class DenseStorage
- * \ingroup Core_Module
- *
- * \brief Stores the data of a matrix
- *
- * This class stores the data of fixed-size, dynamic-size or mixed matrices
- * in a way as compact as possible.
- *
- * \sa Matrix
- */
-template <typename T, int Size, int Rows_, int Cols_, int Options_>
-class DenseStorage;
+template <typename T, int Size, int Options, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap_plain_array(plain_array<T, Size, Options, Alignment>& a,
+                                                                      plain_array<T, Size, Options, Alignment>& b,
+                                                                      Index a_size, Index b_size) {
+  Index common_size = numext::mini(a_size, b_size);  // length of the leading range exchanged element-wise below
+  std::swap_ranges(a.array, a.array + common_size, b.array);  // swap a[0, common_size) with b[0, common_size)
+  if (a_size > b_size)  // a has extra elements: its range [common_size, a_size) goes to the same offsets in b
+    smart_copy(a.array + common_size, a.array + a_size, b.array + common_size);
+  else if (b_size > a_size)  // mirror case: b's range [common_size, b_size) goes to a; equal sizes need no tail step
+    smart_copy(b.array + common_size, b.array + b_size, a.array + common_size);
+}
 
-// purely fixed-size matrix
-template <typename T, int Size, int Rows_, int Cols_, int Options_>
-class DenseStorage {
-  internal::plain_array<T, Size, Options_> m_data;
+template <typename T, int Size, int Rows, int Cols, int Options>
+class DenseStorage_impl {
+  plain_array<T, Size, Options> m_data;
 
  public:
-  constexpr EIGEN_DEVICE_FUNC DenseStorage(){EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(
-      Index size =
-          Size)} EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()) {}
-#if defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other)
-      : m_data(other.m_data){EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)}
+#ifndef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
 #else
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    smart_copy(other.m_data.array, other.m_data.array + Size, m_data.array);
+  }
 #endif
-        EIGEN_DEVICE_FUNC constexpr DenseStorage
-        &
-        operator=(const DenseStorage&) = default;
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(DenseStorage&&) = default;
-  EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(DenseStorage&&) = default;
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols) {
-    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-    eigen_internal_assert(size == rows * cols && rows == Rows_ && cols == Cols_);
-    EIGEN_UNUSED_VARIABLE(size);
-    EIGEN_UNUSED_VARIABLE(rows);
-    EIGEN_UNUSED_VARIABLE(cols);
-  }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { numext::swap(m_data, other.m_data); }
-  EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT { return Rows_; }
-  EIGEN_DEVICE_FUNC static constexpr Index cols(void) EIGEN_NOEXCEPT { return Cols_; }
-  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index, Index) {}
-  EIGEN_DEVICE_FUNC constexpr void resize(Index, Index, Index) {}
-  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
-  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
-};
-
-// null matrix
-template <typename T, int Rows_, int Cols_, int Options_>
-class DenseStorage<T, 0, Rows_, Cols_, Options_> {
- public:
-  static_assert(Rows_ * Cols_ == 0, "The fixed number of rows times columns must equal the storage size.");
-  EIGEN_DEVICE_FUNC constexpr DenseStorage() {}
-  EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) {}
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) {}
-  EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(const DenseStorage&) { return *this; }
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(Index, Index, Index) {}
-  EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage&) {}
-  EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT { return Rows_; }
-  EIGEN_DEVICE_FUNC static constexpr Index cols(void) EIGEN_NOEXCEPT { return Cols_; }
-  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index, Index) {}
-  EIGEN_DEVICE_FUNC constexpr void resize(Index, Index, Index) {}
-  EIGEN_DEVICE_FUNC constexpr const T* data() const { return 0; }
-  EIGEN_DEVICE_FUNC constexpr T* data() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    numext::swap(m_data, other.m_data);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/,
+                                                                          Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
 };
-
-// more specializations for null matrices; these are necessary to resolve ambiguities
-template <typename T, int Options_>
-class DenseStorage<T, 0, Dynamic, Dynamic, Options_> {
-  Index m_rows;
-  Index m_cols;
+template <typename T, int Size, int Cols, int Options>
+class DenseStorage_impl<T, Size, Dynamic, Cols, Options> {
+  plain_array<T, Size, Options> m_data;
+  Index m_rows = 0;
 
  public:
-  EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
-  EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {}
-  EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_rows(other.m_rows), m_cols(other.m_cols) {}
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_rows(other.m_rows) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index /*cols*/)
+      : m_rows(rows) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+    EIGEN_UNUSED_VARIABLE(size)
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
     m_rows = other.m_rows;
-    m_cols = other.m_cols;
     return *this;
   }
-  EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {
-    eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size.");
-  }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    swap_plain_array(m_data, other.m_data, size(), other.size());
     numext::swap(m_rows, other.m_rows);
-    numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rows; }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_cols; }
-  EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) {
     m_rows = rows;
-    m_cols = cols;
-    eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size.");
   }
-  EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) {
     m_rows = rows;
-    m_cols = cols;
-    eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size.");
   }
-  EIGEN_DEVICE_FUNC const T* data() const { return nullptr; }
-  EIGEN_DEVICE_FUNC T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
 };
-
-template <typename T, int Rows_, int Options_>
-class DenseStorage<T, 0, Rows_, Dynamic, Options_> {
-  Index m_cols;
+template <typename T, int Size, int Rows, int Options>
+class DenseStorage_impl<T, Size, Rows, Dynamic, Options> {
+  plain_array<T, Size, Options> m_data;
+  Index m_cols = 0;
 
  public:
-  EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
-  EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {}
-  EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_cols(other.m_cols) {}
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_cols(other.m_cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index /*rows*/, Index cols)
+      : m_cols(cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+    EIGEN_UNUSED_VARIABLE(size)
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
     m_cols = other.m_cols;
     return *this;
   }
-  EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {
-    eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size.");
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    swap_plain_array(m_data, other.m_data, size(), other.size());
+    numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { numext::swap(m_cols, other.m_cols); }
-  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT { return Rows_; }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT { return m_cols; }
-  EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) {
     m_cols = cols;
-    eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size.");
   }
-  EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) {
     m_cols = cols;
-    eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size.");
   }
-  EIGEN_DEVICE_FUNC const T* data() const { return nullptr; }
-  EIGEN_DEVICE_FUNC T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
 };
-
-template <typename T, int Cols_, int Options_>
-class DenseStorage<T, 0, Dynamic, Cols_, Options_> {
-  Index m_rows;
+template <typename T, int Size, int Options>
+class DenseStorage_impl<T, Size, Dynamic, Dynamic, Options> {
+  plain_array<T, Size, Options> m_data;
+  Index m_rows = 0;
+  Index m_cols = 0;
 
  public:
-  EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
-  EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {}
-  EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_rows(other.m_rows) {}
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_rows(other.m_rows), m_cols(other.m_cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index cols)
+      : m_rows(rows), m_cols(cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+    EIGEN_UNUSED_VARIABLE(size)
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
     m_rows = other.m_rows;
+    m_cols = other.m_cols;
     return *this;
   }
-  EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {
-    eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size.");
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    swap_plain_array(m_data, other.m_data, size(), other.size());
+    numext::swap(m_rows, other.m_rows);
+    numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { numext::swap(m_rows, other.m_rows); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT { return m_rows; }
-  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT { return Cols_; }
-  EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
     m_rows = rows;
-    eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size.");
+    m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index cols) {
     m_rows = rows;
-    eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size.");
+    m_cols = cols;  // only the stored dimensions change; m_data is not touched
   }
-  EIGEN_DEVICE_FUNC const T* data() const { return nullptr; }
-  EIGEN_DEVICE_FUNC T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }  // run-time row count
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }  // run-time column count
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }  // number of elements currently in use
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }  // start of the member array
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }  // read-only overload
 };
-
-// dynamic-size matrix with fixed-size storage
-template <typename T, int Size, int Options_>
-class DenseStorage<T, Size, Dynamic, Dynamic, Options_> {
-  internal::plain_array<T, Size, Options_> m_data;
-  Index m_rows;
-  Index m_cols;
+// null matrix variants: Size == 0, so these specializations hold no element buffer and data() returns nullptr
+template <typename T, int Rows, int Cols, int Options>
+class DenseStorage_impl<T, 0, Rows, Cols, Options> {  // both dimensions are template constants: no data members at all
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}  // all arguments ignored
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl&) {}  // no state to exchange
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/,
+                                                                          Index /*cols*/) {}  // no-op
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}  // no-op
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }  // template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }  // template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }  // there is no buffer to point at
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+};
+template <typename T, int Cols, int Options>
+class DenseStorage_impl<T, 0, Dynamic, Cols, Options> {  // Size == 0 with a run-time row count: only m_rows is stored
+  Index m_rows = 0;  // run-time number of rows
 
  public:
-  EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(), m_rows(0), m_cols(0) {}
-  EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols) {
-    internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
-    if (this != &other) {
-      m_rows = other.m_rows;
-      m_cols = other.m_cols;
-      internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
-    }
-    return *this;
-  }
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-    internal::plain_array_helper::swap(m_data, m_rows * m_cols, other.m_data, other.m_rows * other.m_cols);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index rows, Index /*cols*/)
+      : m_rows(rows) {}  // size and cols are ignored; cols() reports the template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {  // the row count is the only state
     numext::swap(m_rows, other.m_rows);
-    numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_rows; }
-  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_cols; }
-  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) {  // no buffer: just record rows
     m_rows = rows;
-    m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC constexpr void resize(Index, Index rows, Index cols) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) {  // no buffer: just record rows
     m_rows = rows;
-    m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
-  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }  // template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }  // run-time rows times compile-time Cols
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }  // this specialization has no element buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
 };
-
-// dynamic-size matrix with fixed-size storage and fixed width
-template <typename T, int Size, int Cols_, int Options_>
-class DenseStorage<T, Size, Dynamic, Cols_, Options_> {
-  internal::plain_array<T, Size, Options_> m_data;
-  Index m_rows;
+template <typename T, int Rows, int Options>
+class DenseStorage_impl<T, 0, Rows, Dynamic, Options> {  // Size == 0 with a run-time column count: only m_cols is stored
+  Index m_cols = 0;  // run-time number of columns
 
  public:
-  EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_rows(0) {}
-  EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows) {
-    internal::plain_array_helper::copy(other.m_data, m_rows * Cols_, m_data);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index cols)
+      : m_cols(cols) {}  // size and rows are ignored; rows() reports the template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_cols, other.m_cols);  // the column count is the only state
   }
-
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
-    if (this != &other) {
-      m_rows = other.m_rows;
-      internal::plain_array_helper::copy(other.m_data, m_rows * Cols_, m_data);
-    }
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) {
+    m_cols = cols;  // no buffer exists, so only the column count is recorded
   }
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-    internal::plain_array_helper::swap(m_data, m_rows * Cols_, other.m_data, other.m_rows * Cols_);
-    numext::swap(m_rows, other.m_rows);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) {
+    m_cols = cols;  // no buffer exists, so only the column count is recorded
   }
-  EIGEN_DEVICE_FUNC constexpr Index rows(void) const EIGEN_NOEXCEPT { return m_rows; }
-  EIGEN_DEVICE_FUNC constexpr Index cols(void) const EIGEN_NOEXCEPT { return Cols_; }
-  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
-  EIGEN_DEVICE_FUNC constexpr void resize(Index, Index rows, Index) { m_rows = rows; }
-  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
-  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }  // template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }  // compile-time Rows times run-time columns
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }  // this specialization has no element buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
 };
-
-// dynamic-size matrix with fixed-size storage and fixed height
-template <typename T, int Size, int Rows_, int Options_>
-class DenseStorage<T, Size, Rows_, Dynamic, Options_> {
-  internal::plain_array<T, Size, Options_> m_data;
-  Index m_cols;
+template <typename T, int Options>
+class DenseStorage_impl<T, 0, Dynamic, Dynamic, Options> {  // Size == 0 with both dimensions chosen at run time
+  Index m_rows = 0;  // run-time number of rows
+  Index m_cols = 0;  // run-time number of columns
 
  public:
-  EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_cols(0) {}
-  EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols) {
-    internal::plain_array_helper::copy(other.m_data, Rows_ * m_cols, m_data);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
-    if (this != &other) {
-      m_cols = other.m_cols;
-      internal::plain_array_helper::copy(other.m_data, Rows_ * m_cols, m_data);
-    }
-    return *this;
-  }
-  EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-    internal::plain_array_helper::swap(m_data, Rows_ * m_cols, other.m_data, Rows_ * other.m_cols);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index rows, Index cols)
+      : m_rows(rows), m_cols(cols) {}  // size is ignored; only the two dimensions are recorded
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_rows, other.m_rows);  // the two dimensions are the only state
     numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC constexpr Index rows(void) const EIGEN_NOEXCEPT { return Rows_; }
-  EIGEN_DEVICE_FUNC constexpr Index cols(void) const EIGEN_NOEXCEPT { return m_cols; }
-  EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-  EIGEN_DEVICE_FUNC constexpr void resize(Index, Index, Index cols) { m_cols = cols; }
-  EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; }
-  EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
+    m_rows = rows;  // no buffer exists, so only the dimensions are recorded
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index cols) {
+    m_rows = rows;  // no buffer exists, so only the dimensions are recorded
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }  // this specialization has no element buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
 };
-
-// purely dynamic matrix.
-template <typename T, int Options_>
-class DenseStorage<T, Dynamic, Dynamic, Dynamic, Options_> {
-  T* m_data;
-  Index m_rows;
-  Index m_cols;
+// fixed-size matrix (Rows and Cols both fixed) with dynamic memory allocation (Size == Dynamic) is not currently supported
+template <typename T, int Rows, int Cols, int Options>
+class DenseStorage_impl<T, Dynamic, Rows, Cols, Options> {};  // intentionally empty: declares no members or accessors
+// dynamic-sized variants: Size == Dynamic, elements live in a buffer obtained from conditional_aligned_new_auto
+template <typename T, int Cols, int Options>
+class DenseStorage_impl<T, Dynamic, Dynamic, Cols, Options> {  // run-time rows, compile-time Cols
+  static constexpr bool Align = (Options & DontAlign) == 0;  // aligned allocation unless the DontAlign bit is set
+  T* m_data = nullptr;  // owned buffer, released in the destructor
+  Index m_rows = 0;  // run-time number of rows
 
  public:
-  EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-  EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(0), m_rows(0), m_cols(0) {}
-  EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
-      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size)),
-        m_rows(rows),
-        m_cols(cols) {
+  static constexpr int Size = Dynamic;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_rows(other.m_rows) {  // deep copy: fresh buffer for other.size() elements
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);  // then copy every element across
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index /*cols*/)
+      : m_data(conditional_aligned_new_auto<T, Align>(size)), m_rows(rows) {  // NOTE(review): the replaced constructor asserted size == rows * cols; no such check here
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-    eigen_internal_assert(size == rows * cols && rows >= 0 && cols >= 0);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
-      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(other.m_rows * other.m_cols)),
-        m_rows(other.m_rows),
-        m_cols(other.m_cols) {
-    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows * m_cols)
-    internal::smart_copy(other.m_data, other.m_data + other.m_rows * other.m_cols, m_data);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
-    if (this != &other) {
-      DenseStorage tmp(other);
-      this->swap(tmp);
-    }
-    return *this;
   }
-  EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT : m_data(std::move(other.m_data)),
-                                                                        m_rows(std::move(other.m_rows)),
-                                                                        m_cols(std::move(other.m_cols)) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+      : m_data(other.m_data), m_rows(other.m_rows) {  // take over other's buffer; other is reset to a null, zero-row state below
     other.m_data = nullptr;
     other.m_rows = 0;
-    other.m_cols = 0;
   }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT {
-    numext::swap(m_data, other.m_data);
-    numext::swap(m_rows, other.m_rows);
-    numext::swap(m_cols, other.m_cols);
+  EIGEN_DEVICE_FUNC ~DenseStorage_impl() { conditional_aligned_delete_auto<T, Align>(m_data, size()); }  // releases m_data, passing the current element count
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    resize(other.size(), other.rows(), other.cols());  // reallocates only when the element count differs
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);  // NOTE(review): no this != &other guard (the replaced operator= had one); on self-assignment the ranges coincide
     return *this;
   }
-  EIGEN_DEVICE_FUNC ~DenseStorage() {
-    internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data, m_rows * m_cols);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+    this->swap(other);  // move-assign by exchange: other ends up holding this object's previous buffer
+    return *this;
   }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {  // exchanges buffer pointer and row count
     numext::swap(m_data, other.m_data);
     numext::swap(m_rows, other.m_rows);
-    numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT { return m_rows; }
-  EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT { return m_cols; }
-  void conservativeResize(Index size, Index rows, Index cols) {
-    m_data =
-        internal::conditional_aligned_realloc_new_auto<T, (Options_ & DontAlign) == 0>(m_data, size, m_rows * m_cols);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index size, Index rows, Index /*cols*/) {
+    m_data = conditional_aligned_realloc_new_auto<T, Align>(m_data, size, this->size());  // passes new and old element counts; presumably keeps existing elements - confirm in the helper
     m_rows = rows;
-    m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC void resize(Index size, Index rows, Index cols) {
-    if (size != m_rows * m_cols) {
-      internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data, m_rows * m_cols);
-      if (size > 0)  // >0 and not simply !=0 to let the compiler knows that size cannot be negative
-        m_data = internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size);
-      else
-        m_data = 0;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index size, Index rows, Index /*cols*/) {
+    Index oldSize = this->size();
+    if (oldSize != size) {  // reallocate only when the element count changes
+      conditional_aligned_delete_auto<T, Align>(m_data, oldSize);  // old contents are released, not copied
+      m_data = conditional_aligned_new_auto<T, Align>(size);  // NOTE(review): the replaced code special-cased size <= 0; here the helper receives size as-is
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
     }
     m_rows = rows;
-    m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC const T* data() const { return m_data; }
-  EIGEN_DEVICE_FUNC T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }  // template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }  // run-time rows times compile-time Cols
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
 };
-
-// matrix with dynamic width and fixed height (so that matrix has dynamic size).
-template <typename T, int Rows_, int Options_>
-class DenseStorage<T, Dynamic, Rows_, Dynamic, Options_> {
-  T* m_data;
-  Index m_cols;
+template <typename T, int Rows, int Options>
+class DenseStorage_impl<T, Dynamic, Rows, Dynamic, Options> {  // Size == Dynamic: compile-time Rows, run-time columns
+  static constexpr bool Align = (Options & DontAlign) == 0;  // aligned allocation unless the DontAlign bit is set
+  T* m_data = nullptr;  // owned buffer, released in the destructor
+  Index m_cols = 0;  // run-time number of columns
 
  public:
-  EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_cols(0) {}
-  explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-  EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
-      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size)), m_cols(cols) {
+  static constexpr int Size = Dynamic;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_cols(other.m_cols) {  // deep copy: fresh buffer for other.size() elements
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);  // then copy every element across
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index /*rows*/, Index cols)
+      : m_data(conditional_aligned_new_auto<T, Align>(size)), m_cols(cols) {  // NOTE(review): the replaced constructor asserted size == rows * cols and rows == Rows_; no such check here
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-    eigen_internal_assert(size == rows * cols && rows == Rows_ && cols >= 0);
-    EIGEN_UNUSED_VARIABLE(rows);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
-      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(Rows_ * other.m_cols)),
-        m_cols(other.m_cols) {
-    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols * Rows_)
-    internal::smart_copy(other.m_data, other.m_data + Rows_ * m_cols, m_data);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
-    if (this != &other) {
-      DenseStorage tmp(other);
-      this->swap(tmp);
-    }
-    return *this;
   }
-  EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT : m_data(std::move(other.m_data)),
-                                                                        m_cols(std::move(other.m_cols)) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+      : m_data(other.m_data), m_cols(other.m_cols) {  // take over other's buffer; other is reset to a null, zero-column state below
     other.m_data = nullptr;
     other.m_cols = 0;
   }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT {
-    numext::swap(m_data, other.m_data);
-    numext::swap(m_cols, other.m_cols);
+  EIGEN_DEVICE_FUNC ~DenseStorage_impl() { conditional_aligned_delete_auto<T, Align>(m_data, size()); }  // releases m_data, passing the current element count
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    resize(other.size(), other.rows(), other.cols());  // reallocates only when the element count differs
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);  // NOTE(review): no this != &other guard (the replaced operator= had one); on self-assignment the ranges coincide
     return *this;
   }
-  EIGEN_DEVICE_FUNC ~DenseStorage() {
-    internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data, Rows_ * m_cols);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+    this->swap(other);  // move-assign by exchange: other ends up holding this object's previous buffer
+    return *this;
   }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {  // exchanges buffer pointer and column count
     numext::swap(m_data, other.m_data);
     numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT { return Rows_; }
-  EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT { return m_cols; }
-  EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) {
-    m_data =
-        internal::conditional_aligned_realloc_new_auto<T, (Options_ & DontAlign) == 0>(m_data, size, Rows_ * m_cols);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index size, Index /*rows*/, Index cols) {
+    m_data = conditional_aligned_realloc_new_auto<T, Align>(m_data, size, this->size());  // passes new and old element counts; presumably keeps existing elements - confirm in the helper
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols) {
-    if (size != Rows_ * m_cols) {
-      internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data, Rows_ * m_cols);
-      if (size > 0)  // >0 and not simply !=0 to let the compiler knows that size cannot be negative
-        m_data = internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size);
-      else
-        m_data = 0;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index size, Index /*rows*/, Index cols) {
+    Index oldSize = this->size();
+    if (oldSize != size) {  // reallocate only when the element count changes
+      conditional_aligned_delete_auto<T, Align>(m_data, oldSize);  // old contents are released, not copied
+      m_data = conditional_aligned_new_auto<T, Align>(size);  // NOTE(review): the replaced code special-cased size <= 0; here the helper receives size as-is
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
     }
     m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC const T* data() const { return m_data; }
-  EIGEN_DEVICE_FUNC T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }  // template constant
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }  // compile-time Rows times run-time columns
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
 };
-
-// matrix with dynamic height and fixed width (so that matrix has dynamic size).
-template <typename T, int Cols_, int Options_>
-class DenseStorage<T, Dynamic, Dynamic, Cols_, Options_> {
-  T* m_data;
-  Index m_rows;
+template <typename T, int Options>
+class DenseStorage_impl<T, Dynamic, Dynamic, Dynamic, Options> {  // Size == Dynamic with both dimensions chosen at run time
+  static constexpr bool Align = (Options & DontAlign) == 0;  // aligned allocation unless the DontAlign bit is set
+  T* m_data = nullptr;  // owned buffer, released in the destructor
+  Index m_rows = 0;  // run-time number of rows
+  Index m_cols = 0;  // run-time number of columns
 
  public:
-  EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_rows(0) {}
-  explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-  EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols)
-      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size)), m_rows(rows) {
+  static constexpr int Size = Dynamic;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_rows(other.m_rows), m_cols(other.m_cols) {  // deep copy into a fresh buffer
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);  // then copy every element across
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index cols)
+      : m_data(conditional_aligned_new_auto<T, Align>(size)), m_rows(rows), m_cols(cols) {  // NOTE(review): nothing here checks that size == rows * cols
     EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
-    eigen_internal_assert(size == rows * cols && rows >= 0 && cols == Cols_);
-    EIGEN_UNUSED_VARIABLE(cols);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
-      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(other.m_rows * Cols_)),
-        m_rows(other.m_rows) {
-    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows * Cols_)
-    internal::smart_copy(other.m_data, other.m_data + other.m_rows * Cols_, m_data);
-  }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) {
-    if (this != &other) {
-      DenseStorage tmp(other);
-      this->swap(tmp);
-    }
-    return *this;
   }
-  EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT : m_data(std::move(other.m_data)),
-                                                                        m_rows(std::move(other.m_rows)) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+      : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {  // take over other's buffer; other is reset to a null, 0x0 state below
     other.m_data = nullptr;
     other.m_rows = 0;
+    other.m_cols = 0;
   }
-  EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT {
-    numext::swap(m_data, other.m_data);
-    numext::swap(m_rows, other.m_rows);
+  EIGEN_DEVICE_FUNC ~DenseStorage_impl() { conditional_aligned_delete_auto<T, Align>(m_data, size()); }  // releases m_data, passing the current element count
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    resize(other.size(), other.rows(), other.cols());  // reallocates only when the element count differs
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);  // NOTE(review): no this != &other guard (the replaced operator= had one); on self-assignment the ranges coincide
     return *this;
   }
-  EIGEN_DEVICE_FUNC ~DenseStorage() {
-    internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data, Cols_ * m_rows);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+    this->swap(other);  // move-assign by exchange: other ends up holding this object's previous buffer
+    return *this;
   }
-  EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {  // exchanges buffer pointer and both dimensions
     numext::swap(m_data, other.m_data);
     numext::swap(m_rows, other.m_rows);
+    numext::swap(m_cols, other.m_cols);
   }
-  EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT { return m_rows; }
-  EIGEN_DEVICE_FUNC static constexpr Index cols(void) { return Cols_; }
-  void conservativeResize(Index size, Index rows, Index) {
-    m_data =
-        internal::conditional_aligned_realloc_new_auto<T, (Options_ & DontAlign) == 0>(m_data, size, m_rows * Cols_);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index size, Index rows, Index cols) {
+    m_data = conditional_aligned_realloc_new_auto<T, Align>(m_data, size, this->size());  // passes new and old element counts; presumably keeps existing elements - confirm in the helper
     m_rows = rows;
+    m_cols = cols;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index) {
-    if (size != m_rows * Cols_) {
-      internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data, Cols_ * m_rows);
-      if (size > 0)  // >0 and not simply !=0 to let the compiler knows that size cannot be negative
-        m_data = internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size);
-      else
-        m_data = 0;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index size, Index rows, Index cols) {
+    Index oldSize = this->size();
+    if (oldSize != size) {  // reallocate only when the element count changes
+      conditional_aligned_delete_auto<T, Align>(m_data, oldSize);  // old contents are released, not copied
+      m_data = conditional_aligned_new_auto<T, Align>(size);  // NOTE(review): the replaced code special-cased size <= 0; here the helper receives size as-is
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
     }
     m_rows = rows;
+    m_cols = cols;  // dimensions are updated even when the buffer is reused
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
+};
+template <typename T, int Size, int Rows, int Cols>
+struct use_default_move {  // trait supplying the default for DenseStorage's Trivial template parameter
+  static constexpr bool DynamicObject = Size == Dynamic;  // the Size == Dynamic storage classes declare their own move operations
+  static constexpr bool TrivialObject =
+      (!NumTraits<T>::RequireInitialization) && (Rows >= 0) && (Cols >= 0) && (Size == Rows * Cols);  // T needs no initialization and Size equals the fixed Rows * Cols
+  static constexpr bool value = DynamicObject || TrivialObject;  // true if either condition holds
+};
+}  // end namespace internal
+
+/** \internal
+ *
+ * \class DenseStorage_impl
+ * \ingroup Core_Module
+ *
+ * \brief Stores the data of a matrix
+ *
+ * This class stores the data of fixed-size, dynamic-size or mixed matrices
+ * as compactly as possible.
+ *
+ * \sa Matrix
+ */
+template <typename T, int Size, int Rows, int Cols, int Options,
+          bool Trivial = internal::use_default_move<T, Size, Rows, Cols>::value>  // false selects the specialization with copy-based moves
+class DenseStorage : public internal::DenseStorage_impl<T, Size, Rows, Cols, Options> {  // primary template: move operations are defaulted
+  using Base = internal::DenseStorage_impl<T, Size, Rows, Cols, Options>;
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(Index size, Index rows, Index cols)
+      : Base(size, rows, cols) {}  // forwards all three arguments to the storage implementation
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(const DenseStorage&) = default;
+  // if DenseStorage meets the requirements of use_default_move, then use the move construction and move assignment
+  // operation defined in DenseStorage_impl, or the compiler-generated version if none is defined
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(DenseStorage&&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(DenseStorage&&) = default;
+};
+template <typename T, int Size, int Rows, int Cols, int Options>
+class DenseStorage<T, Size, Rows, Cols, Options, false>  // chosen when the Trivial template argument is false
+    : public internal::DenseStorage_impl<T, Size, Rows, Cols, Options> {
+  using Base = internal::DenseStorage_impl<T, Size, Rows, Cols, Options>;
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(Index size, Index rows, Index cols)
+      : Base(size, rows, cols) {}  // forwards all three arguments to the storage implementation
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(const DenseStorage&) = default;
+  // if DenseStorage does not meet the requirements of use_default_move, then defer to the copy construction and copy
+  // assignment behavior
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(DenseStorage&& other)
+      : DenseStorage(static_cast<const DenseStorage&>(other)) {}  // the cast to const& delegates to the copy constructor
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(DenseStorage&& other) {
+    *this = other;  // other is an lvalue expression here, so this resolves to copy assignment
+    return *this;
   }
-  EIGEN_DEVICE_FUNC const T* data() const { return m_data; }
-  EIGEN_DEVICE_FUNC T* data() { return m_data; }
 };
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/DeviceWrapper.h b/Eigen/src/Core/DeviceWrapper.h
new file mode 100644
index 0000000..012dce1
--- /dev/null
+++ b/Eigen/src/Core/DeviceWrapper.h
@@ -0,0 +1,153 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DEVICEWRAPPER_H
+#define EIGEN_DEVICEWRAPPER_H
+
+namespace Eigen {
+template <typename Derived, typename Device>
+struct DeviceWrapper {
+  using Base = EigenBase<internal::remove_all_t<Derived>>;
+  using Scalar = typename Derived::Scalar;
+
+  EIGEN_DEVICE_FUNC DeviceWrapper(Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}
+  EIGEN_DEVICE_FUNC DeviceWrapper(const Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived>& other) {
+    using AssignOp = internal::assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), AssignOp());
+    return m_xpr;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const EigenBase<OtherDerived>& other) {
+    using AddAssignOp = internal::add_assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), AddAssignOp());
+    return m_xpr;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const EigenBase<OtherDerived>& other) {
+    using SubAssignOp = internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), SubAssignOp());
+    return m_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return m_xpr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Device& device() { return m_device; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NoAlias<DeviceWrapper, EigenBase> noalias() {
+    return NoAlias<DeviceWrapper, EigenBase>(*this);
+  }
+
+  Derived& m_xpr;
+  Device& m_device;
+};
+
+namespace internal {
+
+// this is where we differentiate between lazy assignment and specialized kernels (e.g. matrix products)
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device,
+          typename Kind = typename AssignmentKind<typename evaluator_traits<DstXprType>::Shape,
+                                                  typename evaluator_traits<SrcXprType>::Shape>::Kind,
+          typename EnableIf = void>
+struct AssignmentWithDevice;
+
+// unless otherwise specified, use the default product implementation
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Functor, typename Device,
+          typename Weak>
+struct AssignmentWithDevice<DstXprType, Product<Lhs, Rhs, Options>, Functor, Device, Dense2Dense, Weak> {
+  using SrcXprType = Product<Lhs, Rhs, Options>;
+  using Base = Assignment<DstXprType, SrcXprType, Functor>;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
+                                                        Device&) {
+    Base::run(dst, src, func);
+  }
+};
+
+// specialization for coefficient-wise assignment
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device, typename Weak>
+struct AssignmentWithDevice<DstXprType, SrcXprType, Functor, Device, Dense2Dense, Weak> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
+                                                        Device& device) {
+#ifndef EIGEN_NO_DEBUG
+    internal::check_for_aliasing(dst, src);
+#endif
+
+    call_dense_assignment_loop(dst, src, func, device);
+  }
+};
+
+// this allows us to use the default evaluation scheme if it is not specialized for the device
+template <typename Kernel, typename Device, int Traversal = Kernel::AssignmentTraits::Traversal,
+          int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop_with_device {
+  using Base = dense_assignment_loop<Kernel, Traversal, Unrolling>;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Device&) { Base::run(kernel); }
+};
+
+// entry point for a generic expression with device
+template <typename Dst, typename Src, typename Func, typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
+                                                                              const Src& src, const Func& func) {
+  enum {
+    NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
+                       (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
+                      int(Dst::SizeAtCompileTime) != 1
+  };
+
+  using ActualDstTypeCleaned = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst>;
+  using ActualDstType = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&>;
+  ActualDstType actualDst(dst.derived());
+
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned, Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename ActualDstTypeCleaned::Scalar, typename Src::Scalar);
+
+  // this provides a mechanism for specializing simple assignments, matrix products, etc
+  AssignmentWithDevice<ActualDstTypeCleaned, Src, Func, Device>::run(actualDst, src, func, dst.device());
+}
+
+// copied from AssignEvaluator, except that the device is forwarded to the assignment loop
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+                                                                                const Functor& func, Device& device) {
+  using DstEvaluatorType = evaluator<DstXprType>;
+  using SrcEvaluatorType = evaluator<SrcXprType>;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
+
+  using Kernel = generic_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Functor>;
+
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  dense_assignment_loop_with_device<Kernel, Device>::run(kernel, device);
+}
+
+}  // namespace internal
+
+template <typename Derived>
+template <typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> EigenBase<Derived>::device(Device& device) {
+  return DeviceWrapper<Derived, Device>(derived(), device);
+}
+
+template <typename Derived>
+template <typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> EigenBase<Derived>::device(
+    Device& device) const {
+  return DeviceWrapper<const Derived, Device>(derived(), device);
+}
+}  // namespace Eigen
+#endif
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 8d27857..ff8611c 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -83,13 +83,11 @@ class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_
                                : numext::mini<Index>(m_matrix.rows(), m_matrix.cols() - m_index.value());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return 1; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_matrix.outerStride() + 1;
-  }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.outerStride() + 1; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 0; }
 
   typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
 
@@ -134,13 +132,13 @@ class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_
 
  private:
   // some compilers may fail to optimize std::max etc in case of compile-time constants...
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index absDiagIndex() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index absDiagIndex() const noexcept {
     return m_index.value() > 0 ? m_index.value() : -m_index.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowOffset() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const noexcept {
     return m_index.value() > 0 ? 0 : -m_index.value();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index colOffset() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const noexcept {
     return m_index.value() > 0 ? m_index.value() : 0;
   }
   // trigger a compile-time error if someone try to call packet
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index fd61bb7..52630d9 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -76,9 +76,9 @@ class DiagonalBase : public EigenBase<Derived> {
   }
 
   /** \returns the number of rows. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return diagonal().size(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return diagonal().size(); }
   /** \returns the number of columns. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return diagonal().size(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return diagonal().size(); }
 
   /** \returns the diagonal matrix product of \c *this by the dense matrix, \a matrix */
   template <typename MatrixDerived>
@@ -256,10 +256,13 @@ class DiagonalMatrix : public DiagonalBase<DiagonalMatrix<Scalar_, SizeAtCompile
   typedef DiagonalWrapper<const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, DiagonalVectorType>>
       InitializeReturnType;
 
+  typedef DiagonalWrapper<const CwiseNullaryOp<internal::scalar_zero_op<Scalar>, DiagonalVectorType>>
+      ZeroInitializeReturnType;
+
   /** Initializes a diagonal matrix of size SizeAtCompileTime with coefficients set to zero */
-  EIGEN_DEVICE_FUNC static const InitializeReturnType Zero() { return DiagonalVectorType::Zero().asDiagonal(); }
+  EIGEN_DEVICE_FUNC static const ZeroInitializeReturnType Zero() { return DiagonalVectorType::Zero().asDiagonal(); }
   /** Initializes a diagonal matrix of size dim with coefficients set to zero */
-  EIGEN_DEVICE_FUNC static const InitializeReturnType Zero(Index size) {
+  EIGEN_DEVICE_FUNC static const ZeroInitializeReturnType Zero(Index size) {
     return DiagonalVectorType::Zero(size).asDiagonal();
   }
   /** Initializes a identity matrix of size SizeAtCompileTime */
@@ -386,8 +389,9 @@ struct AssignmentKind<DenseShape, DiagonalShape> {
 // Diagonal matrix to Dense assignment
 template <typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense> {
-  static void run(DstXprType& dst, const SrcXprType& src,
-                  const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+  static EIGEN_DEVICE_FUNC void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
     Index dstRows = src.rows();
     Index dstCols = src.cols();
     if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
@@ -396,13 +400,15 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense> {
     dst.diagonal() = src.diagonal();
   }
 
-  static void run(DstXprType& dst, const SrcXprType& src,
-                  const internal::add_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+  static EIGEN_DEVICE_FUNC void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::add_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
     dst.diagonal() += src.diagonal();
   }
 
-  static void run(DstXprType& dst, const SrcXprType& src,
-                  const internal::sub_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+  static EIGEN_DEVICE_FUNC void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::sub_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
     dst.diagonal() -= src.diagonal();
   }
 };
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 82eb9c7..059527c 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -17,28 +17,18 @@ namespace Eigen {
 
 namespace internal {
 
-// helper function for dot(). The problem is that if we put that in the body of dot(), then upon calling dot
-// with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE
-// looking at the static assertions. Thus this is a trick to get better compile errors.
-template <typename T, typename U,
-          bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime &&
-                                 ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) ||
-                                  (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))>
-struct dot_nocheck {
-  typedef scalar_conj_product_op<typename traits<T>::Scalar, typename traits<U>::Scalar> conj_prod;
-  typedef typename conj_prod::result_type ResScalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) {
-    return a.template binaryExpr<conj_prod>(b).sum();
+template <typename Derived, typename Scalar = typename traits<Derived>::Scalar>
+struct squared_norm_impl {
+  using Real = typename NumTraits<Scalar>::Real;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Real run(const Derived& a) {
+    Scalar result = a.unaryExpr(squared_norm_functor<Scalar>()).sum();
+    return numext::real(result) + numext::imag(result);
   }
 };
 
-template <typename T, typename U>
-struct dot_nocheck<T, U, true> {
-  typedef scalar_conj_product_op<typename traits<T>::Scalar, typename traits<U>::Scalar> conj_prod;
-  typedef typename conj_prod::result_type ResScalar;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) {
-    return a.transpose().template binaryExpr<conj_prod>(b).sum();
-  }
+template <typename Derived>
+struct squared_norm_impl<Derived, bool> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Derived& a) { return a.any(); }
 };
 
 }  // end namespace internal
@@ -60,18 +50,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
                                   typename internal::traits<OtherDerived>::Scalar>::ReturnType
     MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const {
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived, OtherDerived)
-#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
-  EIGEN_CHECK_BINARY_COMPATIBILIY(
-      Eigen::internal::scalar_conj_product_op<Scalar EIGEN_COMMA typename OtherDerived::Scalar>, Scalar,
-      typename OtherDerived::Scalar);
-#endif
-
-  eigen_assert(size() == other.size());
-
-  return internal::dot_nocheck<Derived, OtherDerived>::run(*this, other);
+  return internal::dot_impl<Derived, OtherDerived>::run(derived(), other.derived());
 }
 
 //---------- implementation of L2 norm and related functions ----------
@@ -85,7 +64,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::squaredNorm() const {
-  return numext::real((*this).cwiseAbs2().sum());
+  return internal::squared_norm_impl<Derived>::run(derived());
 }
 
 /** \returns, for vectors, the \em l2 norm of \c *this, and for matrices the Frobenius norm.
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index f485016..c9a6e88 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -46,22 +46,22 @@ struct EigenBase {
   typedef typename internal::traits<Derived>::StorageKind StorageKind;
 
   /** \returns a reference to the derived object */
-  EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast<Derived*>(this); }
+  EIGEN_DEVICE_FUNC constexpr Derived& derived() { return *static_cast<Derived*>(this); }
   /** \returns a const reference to the derived object */
-  EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
-  EIGEN_DEVICE_FUNC inline Derived& const_cast_derived() const {
+  EIGEN_DEVICE_FUNC inline constexpr Derived& const_cast_derived() const {
     return *static_cast<Derived*>(const_cast<EigenBase*>(this));
   }
   EIGEN_DEVICE_FUNC inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
    * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
   template <typename Dest>
@@ -104,6 +104,11 @@ struct EigenBase {
     // derived class can reimplement it in a more optimized way.
     dst = this->derived() * dst;
   }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> device(Device& device);
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> device(Device& device) const;
 };
 
 /***************************************************************************
diff --git a/Eigen/src/Core/Fill.h b/Eigen/src/Core/Fill.h
new file mode 100644
index 0000000..f40d56d
--- /dev/null
+++ b/Eigen/src/Core/Fill.h
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Charles Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FILL_H
+#define EIGEN_FILL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Xpr>
+struct eigen_fill_helper : std::false_type {};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct eigen_fill_helper<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct eigen_fill_helper<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+
+template <typename Xpr, int BlockRows, int BlockCols>
+struct eigen_fill_helper<Block<Xpr, BlockRows, BlockCols, /*InnerPanel*/ true>> : eigen_fill_helper<Xpr> {};
+
+template <typename Xpr, int BlockRows, int BlockCols>
+struct eigen_fill_helper<Block<Xpr, BlockRows, BlockCols, /*InnerPanel*/ false>>
+    : std::integral_constant<bool, eigen_fill_helper<Xpr>::value &&
+                                       (Xpr::IsRowMajor ? (BlockRows == 1) : (BlockCols == 1))> {};
+
+template <typename Xpr, int Options>
+struct eigen_fill_helper<Map<Xpr, Options, Stride<0, 0>>> : eigen_fill_helper<Xpr> {};
+
+template <typename Xpr, int Options, int OuterStride_>
+struct eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 0>>>
+    : std::integral_constant<bool, eigen_fill_helper<Xpr>::value &&
+                                       enum_eq_not_dynamic(OuterStride_, Xpr::InnerSizeAtCompileTime)> {};
+
+template <typename Xpr, int Options, int OuterStride_>
+struct eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 1>>>
+    : eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 0>>> {};
+
+template <typename Xpr, int Options, int InnerStride_>
+struct eigen_fill_helper<Map<Xpr, Options, InnerStride<InnerStride_>>>
+    : eigen_fill_helper<Map<Xpr, Options, Stride<0, InnerStride_>>> {};
+
+template <typename Xpr, int Options, int OuterStride_>
+struct eigen_fill_helper<Map<Xpr, Options, OuterStride<OuterStride_>>>
+    : eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 0>>> {};
+
+template <typename Xpr>
+struct eigen_fill_impl<Xpr, /*use_fill*/ false> {
+  using Scalar = typename Xpr::Scalar;
+  using Func = scalar_constant_op<Scalar>;
+  using PlainObject = typename Xpr::PlainObject;
+  using Constant = typename PlainObject::ConstantReturnType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const Scalar& val) {
+    const Constant src(dst.rows(), dst.cols(), val);
+    run(dst, src);
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
+    call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
+  }
+};
+
+#if EIGEN_COMP_MSVC || defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename Xpr>
+struct eigen_fill_impl<Xpr, /*use_fill*/ true> : eigen_fill_impl<Xpr, /*use_fill*/ false> {};
+#else
+template <typename Xpr>
+struct eigen_fill_impl<Xpr, /*use_fill*/ true> {
+  using Scalar = typename Xpr::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const Scalar& val) {
+    const Scalar val_copy = val;  // NOTE(review): presumably copied in case val refers to an element of dst
+    using std::fill_n;
+    fill_n(dst.data(), dst.size(), val_copy);  // writes dst.size() scalars starting at dst.data()
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
+    resize_if_allowed(dst, src, assign_op<Scalar, Scalar>());
+    const Scalar& val = src.functor()();  // src's functor, called with no arguments, yields the fill value
+    run(dst, val);
+  }
+};
+#endif
+
+template <typename Xpr>
+struct eigen_memset_helper {
+  static constexpr bool value =
+      std::is_trivially_copyable<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
+};
+
+template <typename Xpr>
+struct eigen_zero_impl<Xpr, /*use_memset*/ false> {
+  using Scalar = typename Xpr::Scalar;
+  using PlainObject = typename Xpr::PlainObject;
+  using Zero = typename PlainObject::ZeroReturnType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst) {
+    const Zero src(dst.rows(), dst.cols());
+    run(dst, src);
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
+    call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
+  }
+};
+
+template <typename Xpr>
+struct eigen_zero_impl<Xpr, /*use_memset*/ true> {
+  using Scalar = typename Xpr::Scalar;
+  static constexpr size_t max_bytes = (std::numeric_limits<std::ptrdiff_t>::max)();
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst) {
+    const size_t num_bytes = dst.size() * sizeof(Scalar);
+    if (num_bytes == 0) return;
+    void* dst_ptr = static_cast<void*>(dst.data());
+#ifndef EIGEN_NO_DEBUG
+    if (num_bytes > max_bytes) throw_std_bad_alloc();
+    eigen_assert((dst_ptr != nullptr) && "null pointer dereference error!");
+#endif
+    EIGEN_USING_STD(memset);
+    memset(dst_ptr, 0, num_bytes);
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
+    resize_if_allowed(dst, src, assign_op<Scalar, Scalar>());
+    run(dst);
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_FILL_H
diff --git a/Eigen/src/Core/FindCoeff.h b/Eigen/src/Core/FindCoeff.h
new file mode 100644
index 0000000..0102e8a
--- /dev/null
+++ b/Eigen/src/Core/FindCoeff.h
@@ -0,0 +1,464 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIND_COEFF_H
+#define EIGEN_FIND_COEFF_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct max_coeff_functor {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return candidate > incumbent;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pcmp_lt(incumbent, candidate);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_max(a);
+  }
+};
+
+template <typename Scalar>
+struct max_coeff_functor<Scalar, PropagateNaN, false> {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return (candidate > incumbent) || ((candidate != candidate) && (incumbent == incumbent));  // NaN beats numbers
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_max<PropagateNaN>(a);
+  }
+};
+
+template <typename Scalar>
+struct max_coeff_functor<Scalar, PropagateNumbers, false> {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return (candidate > incumbent) || ((candidate == candidate) && (incumbent != incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(candidate));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_max<PropagateNumbers>(a);
+  }
+};
+
+template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct min_coeff_functor {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return candidate < incumbent;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pcmp_lt(candidate, incumbent);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_min(a);
+  }
+};
+
+template <typename Scalar>
+struct min_coeff_functor<Scalar, PropagateNaN, false> {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return (candidate < incumbent) || ((candidate != candidate) && (incumbent == incumbent));  // NaN beats numbers
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_min<PropagateNaN>(a);
+  }
+};
+
+template <typename Scalar>
+struct min_coeff_functor<Scalar, PropagateNumbers, false> {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return (candidate < incumbent) || ((candidate == candidate) && (incumbent != incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(candidate));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_min<PropagateNumbers>(a);
+  }
+};
+
+template <typename Scalar>
+struct min_max_traits {
+  static constexpr bool PacketAccess = packet_traits<Scalar>::Vectorizable;
+};
+template <typename Scalar, int NaNPropagation>
+struct functor_traits<max_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
+template <typename Scalar, int NaNPropagation>
+struct functor_traits<min_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
+
+template <typename Evaluator, typename Func, bool Linear, bool Vectorize>
+struct find_coeff_loop;
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ false> {
+  using Scalar = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& outer, Index& inner) {
+    const Index numOuter = eval.outerSize();
+    const Index numInner = eval.innerSize();
+
+    // Scalar scan over (outer, inner) pairs. The caller is expected to have
+    // seeded the running state before this is called:
+    //   res = eval.coeff(0, 0);
+    //   outer = 0;
+    //   inner = 0;
+
+    for (Index o = 0; o < numOuter; ++o) {
+      for (Index k = 0; k < numInner; ++k) {
+        Scalar candidate = eval.coeffByOuterInner(o, k);
+        if (func.compareCoeff(res, candidate)) {
+          outer = o;
+          inner = k;
+          res = candidate;
+        }
+      }
+    }
+  }
+};
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ false> {
+  using Scalar = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& index) {
+    const Index count = eval.size();
+
+    // Scalar scan by linear index. The caller is expected to have seeded the
+    // running state before this is called:
+    //   res = eval.coeff(0);
+    //   index = 0;
+
+    for (Index pos = 0; pos < count; ++pos) {
+      Scalar candidate = eval.coeff(pos);
+      if (func.compareCoeff(res, candidate)) {
+        index = pos;
+        res = candidate;
+      }
+    }
+  }
+};
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ true> {
+  using ScalarImpl = find_coeff_loop<Evaluator, Func, false, false>;
+  using Scalar = typename Evaluator::Scalar;
+  using Packet = typename Evaluator::Packet;
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& outer,
+                                           Index& inner) {
+    Index outerSize = eval.outerSize();
+    Index innerSize = eval.innerSize();
+    Index packetEnd = numext::round_down(innerSize, PacketSize);  // end of the whole-packet range
+
+    /* initialization performed in calling function */
+    /* result = eval.coeff(0, 0); */
+    /* outer = 0; */
+    /* inner = 0; */
+
+    bool checkPacket = false;  // true when the best value came from a packet; winning lane not yet located
+
+    for (Index j = 0; j < outerSize; j++) {
+      Packet resultPacket = pset1<Packet>(result);
+      for (Index i = 0; i < packetEnd; i += PacketSize) {
+        Packet xprPacket = eval.template packetByOuterInner<Unaligned, Packet>(j, i);
+        if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
+          outer = j;
+          inner = i;  // packet start only; the exact lane is resolved after the loops
+          result = func.predux(xprPacket);
+          resultPacket = pset1<Packet>(result);
+          checkPacket = true;
+        }
+      }
+
+      for (Index i = packetEnd; i < innerSize; i++) {  // scalar tail past the last whole packet
+        Scalar xprCoeff = eval.coeffByOuterInner(j, i);
+        if (func.compareCoeff(result, xprCoeff)) {
+          outer = j;
+          inner = i;
+          result = xprCoeff;
+          checkPacket = false;  // position is exact, no lane search needed
+        }
+      }
+    }
+
+    if (checkPacket) {  // rescan the winning packet's PacketSize lanes to pin down inner
+      result = eval.coeffByOuterInner(outer, inner);
+      Index i_end = inner + PacketSize;
+      for (Index i = inner; i < i_end; i++) {
+        Scalar xprCoeff = eval.coeffByOuterInner(outer, i);
+        if (func.compareCoeff(result, xprCoeff)) {
+          inner = i;
+          result = xprCoeff;
+        }
+      }
+    }
+  }
+};
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ true> {
+  using ScalarImpl = find_coeff_loop<Evaluator, Func, true, false>;
+  using Scalar = typename Evaluator::Scalar;
+  using Packet = typename Evaluator::Packet;
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static constexpr int Alignment = Evaluator::Alignment;
+
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& index) {
+    Index size = eval.size();
+    Index packetEnd = numext::round_down(size, PacketSize);
+
+    /* Vectorized linear scan. Initialization is performed in the calling function: result = eval.coeff(0); index = 0; */
+    /* Packets are loaded with Evaluator::Alignment up to packetEnd; the remainder is read coefficient by coefficient. */
+    /* A packet hit records only the packet's first index; checkPacket requests the lane rescan at the end. */
+
+    Packet resultPacket = pset1<Packet>(result);
+    bool checkPacket = false;
+
+    for (Index k = 0; k < packetEnd; k += PacketSize) {
+      Packet xprPacket = eval.template packet<Alignment, Packet>(k);
+      if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
+        index = k;
+        result = func.predux(xprPacket);
+        resultPacket = pset1<Packet>(result);
+        checkPacket = true;
+      }
+    }
+
+    for (Index k = packetEnd; k < size; k++) {
+      Scalar xprCoeff = eval.coeff(k);
+      if (func.compareCoeff(result, xprCoeff)) {
+        index = k;
+        result = xprCoeff;
+        checkPacket = false;  // position is exact, no rescan needed
+      }
+    }
+
+    if (checkPacket) {  // the last update came from a packet: locate the exact lane within it
+      result = eval.coeff(index);
+      Index k_end = index + PacketSize;
+      for (Index k = index; k < k_end; k++) {
+        Scalar xprCoeff = eval.coeff(k);
+        if (func.compareCoeff(result, xprCoeff)) {
+          index = k;
+          result = xprCoeff;
+        }
+      }
+    }
+  }
+};
+
+template <typename Derived>
+struct find_coeff_evaluator : public evaluator<Derived> {
+  using Base = evaluator<Derived>;
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename packet_traits<Scalar>::type;
+  static constexpr int Flags = Base::Flags;
+  static constexpr bool IsRowMajor = bool(Flags & RowMajorBit);
+  EIGEN_DEVICE_FUNC inline find_coeff_evaluator(const Derived& xpr) : Base(xpr), m_xpr(xpr) {}
+  // Extends evaluator<Derived> with (outer, inner) accessors: indices are mapped to (row, col) by storage order.
+  EIGEN_DEVICE_FUNC inline Scalar coeffByOuterInner(Index outer, Index inner) const {
+    Index row = IsRowMajor ? outer : inner;
+    Index col = IsRowMajor ? inner : outer;
+    return Base::coeff(row, col);
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC inline PacketType packetByOuterInner(Index outer, Index inner) const {
+    Index row = IsRowMajor ? outer : inner;
+    Index col = IsRowMajor ? inner : outer;
+    return Base::template packet<LoadMode, PacketType>(row, col);
+  }
+  // Sizes are forwarded from the expression itself.
+  EIGEN_DEVICE_FUNC inline Index innerSize() const { return m_xpr.innerSize(); }
+  EIGEN_DEVICE_FUNC inline Index outerSize() const { return m_xpr.outerSize(); }
+  EIGEN_DEVICE_FUNC inline Index size() const { return m_xpr.size(); }
+  // Held by reference: the expression must outlive this evaluator.
+  const Derived& m_xpr;
+};
+
+template <typename Derived, typename Func>
+struct find_coeff_impl {
+  using Evaluator = find_coeff_evaluator<Derived>;
+  static constexpr int Flags = Evaluator::Flags;
+  static constexpr int Alignment = Evaluator::Alignment;
+  static constexpr bool IsRowMajor = Derived::IsRowMajor;
+  static constexpr int MaxInnerSizeAtCompileTime =
+      IsRowMajor ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime;
+  static constexpr int MaxSizeAtCompileTime = Derived::MaxSizeAtCompileTime;
+
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename Evaluator::Packet;
+  // Linearize follows LinearAccessBit; Vectorize also needs PacketAccessBit, functor packet support and !DontVectorize.
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static constexpr bool Linearize = bool(Flags & LinearAccessBit);
+  static constexpr bool DontVectorize =
+      enum_lt_not_dynamic(Linearize ? MaxSizeAtCompileTime : MaxInnerSizeAtCompileTime, PacketSize);
+  static constexpr bool Vectorize =
+      !DontVectorize && bool(Flags & PacketAccessBit) && functor_traits<Func>::PacketAccess;
+  // Loop is the find_coeff_loop specialization matching the Linearize / Vectorize decisions above.
+  using Loop = find_coeff_loop<Evaluator, Func, Linearize, Vectorize>;
+  // Two-index overload selected when linear access is unavailable: forwards directly to the 2D loop.
+  template <bool ForwardLinearAccess = Linearize, std::enable_if_t<!ForwardLinearAccess, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
+                                                        Index& inner) {
+    Evaluator eval(xpr);
+    Loop::run(eval, func, res, outer, inner);
+  }
+  template <bool ForwardLinearAccess = Linearize, std::enable_if_t<ForwardLinearAccess, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
+                                                        Index& inner) {
+    // where possible, use the linear loop and back-calculate the outer and inner indices
+    Index index = 0;
+    run(xpr, func, res, index);
+    outer = index / xpr.innerSize();
+    inner = index % xpr.innerSize();
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& index) {
+    Evaluator eval(xpr);
+    Loop::run(eval, func, res, index);
+  }
+};
+
+template <typename Derived, typename IndexType, typename Func>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
+                                                                       IndexType* rowPtr, IndexType* colPtr) {
+  eigen_assert(mat.rows() > 0 && mat.cols() > 0 && "you are using an empty matrix");
+  using Scalar = typename DenseBase<Derived>::Scalar;
+  using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
+  Index outer = 0;
+  Index inner = 0;
+  Scalar res = mat.coeff(0, 0);  // seed: the loop implementations expect res/outer/inner to be pre-initialized
+  FindCoeffImpl::run(mat.derived(), func, res, outer, inner);
+  *rowPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? outer : inner);  // rowPtr must not be null
+  if (colPtr) *colPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? inner : outer);  // colPtr is optional
+  return res;
+}
+
+template <typename Derived, typename IndexType, typename Func>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
+                                                                       IndexType* indexPtr) {
+  eigen_assert(mat.size() > 0 && "you are using an empty matrix");
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)  // the single-index overload is restricted to vectors
+  using Scalar = typename DenseBase<Derived>::Scalar;
+  using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
+  Index index = 0;
+  Scalar res = mat.coeff(0);  // seed: the loop implementations expect res/index to be pre-initialized
+  FindCoeffImpl::run(mat.derived(), func, res, index);
+  *indexPtr = internal::convert_index<IndexType>(index);  // indexPtr must not be null
+  return res;
+}
+
+}  // namespace internal
+
+/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+ * \returns the minimum of all coefficients of *this and puts in *rowId and *colId its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowPtr,
+                                                                                          IndexType* colPtr) const {
+  using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, rowPtr, colPtr);
+}
+
+/** \returns the minimum of all coefficients of *this and puts in *index its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(),
+ * DenseBase::minCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* indexPtr) const {
+  using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, indexPtr);
+}
+
+/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
+ * \returns the maximum of all coefficients of *this and puts in *rowId and *colId its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr,
+                                                                                          IndexType* colPtr) const {
+  using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, rowPtr, colPtr);
+}
+
+/** \returns the maximum of all coefficients of *this and puts in *index its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(),
+ * DenseBase::maxCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* indexPtr) const {
+  using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, indexPtr);
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIND_COEFF_H
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index a91b0da..55beab3 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -41,14 +41,10 @@ class ForceAlignedAccess : public internal::dense_xpr_base<ForceAlignedAccess<Ex
 
   EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
-    return m_expression.outerStride();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
-    return m_expression.innerStride();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
 
   EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const {
     return m_expression.coeff(row, col);
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 1220073..e4c51d2 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -229,7 +229,7 @@ struct gemv_static_vector_if;
 
 template <typename Scalar, int Size, int MaxSize>
 struct gemv_static_vector_if<Scalar, Size, MaxSize, false> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Scalar* data() {
     eigen_internal_assert(false && "should never be called");
     return 0;
   }
@@ -237,23 +237,21 @@ struct gemv_static_vector_if<Scalar, Size, MaxSize, false> {
 
 template <typename Scalar, int Size>
 struct gemv_static_vector_if<Scalar, Size, Dynamic, true> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Scalar* data() { return 0; }
 };
 
 template <typename Scalar, int Size, int MaxSize>
 struct gemv_static_vector_if<Scalar, Size, MaxSize, true> {
 #if EIGEN_MAX_STATIC_ALIGN_BYTES != 0
-  internal::plain_array<Scalar, internal::min_size_prefer_fixed(Size, MaxSize), 0, AlignedMax>
-      m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  internal::plain_array<Scalar, internal::min_size_prefer_fixed(Size, MaxSize), 0, AlignedMax> m_data;
+  EIGEN_STRONG_INLINE constexpr Scalar* data() { return m_data.array; }
 #else
   // Some architectures cannot align on the stack,
   // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
-  internal::plain_array<
-      Scalar, internal::min_size_prefer_fixed(Size, MaxSize) + EIGEN_MAX_ALIGN_BYTES, 0>
-      m_data;
-  EIGEN_STRONG_INLINE Scalar* data() {
-    return reinterpret_cast<Scalar*>((std::uintptr_t(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES - 1))) + EIGEN_MAX_ALIGN_BYTES);
+  internal::plain_array<Scalar, internal::min_size_prefer_fixed(Size, MaxSize) + EIGEN_MAX_ALIGN_BYTES, 0> m_data;
+  EIGEN_STRONG_INLINE constexpr Scalar* data() {
+    return reinterpret_cast<Scalar*>((std::uintptr_t(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES - 1))) +
+                                     EIGEN_MAX_ALIGN_BYTES);
   }
 #endif
 };
@@ -329,6 +327,7 @@ struct gemv_dense_selector<OnTheRight, ColMajor, true> {
 
       if (!evalToDest) {
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        constexpr int Size = Dest::SizeAtCompileTime;
         Index size = dest.size();
         EIGEN_DENSE_STORAGE_CTOR_PLUGIN
 #endif
@@ -393,6 +392,7 @@ struct gemv_dense_selector<OnTheRight, RowMajor, true> {
 
     if (!DirectlyUseRhs) {
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = ActualRhsTypeCleaned::SizeAtCompileTime;
       Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
 #endif
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 58a197f..64e1123 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -57,22 +57,22 @@ struct default_packet_traits {
     HasConj = 1,
     HasSetLinear = 1,
     HasSign = 1,
+    // By default, the nearest integer functions (rint, round, floor, ceil, trunc) are enabled for all scalar and packet
+    // types
+    HasRound = 1,
 
     HasArg = 0,
     HasAbsDiff = 0,
     HasBlend = 0,
     // This flag is used to indicate whether packet comparison is supported.
-    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    // pcmp_eq and pcmp_lt should be defined for it to be true.
     HasCmp = 0,
-    HasRound = 0,
-    HasRint = 0,
-    HasFloor = 0,
-    HasCeil = 0,
 
     HasDiv = 0,
     HasReciprocal = 0,
     HasSqrt = 0,
     HasRsqrt = 0,
+    HasCbrt = 0,
     HasExp = 0,
     HasExpm1 = 0,
     HasLog = 0,
@@ -135,7 +135,14 @@ template <typename T>
 struct unpacket_traits {
   typedef T type;
   typedef T half;
-  enum { size = 1, alignment = 1, vectorizable = false, masked_load_available = false, masked_store_available = false };
+  typedef typename numext::get_integer_by_size<sizeof(T)>::signed_type integer_packet;
+  enum {
+    size = 1,
+    alignment = alignof(T),
+    vectorizable = false,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
 template <typename T>
@@ -188,7 +195,7 @@ struct is_half {
   static constexpr int Size = unpacket_traits<Packet>::size;
   using DefaultPacket = typename packet_traits<Scalar>::type;
   static constexpr int DefaultSize = unpacket_traits<DefaultPacket>::size;
-  static constexpr bool value = Size < DefaultSize;
+  static constexpr bool value = Size != 1 && Size < DefaultSize;
 };
 
 template <typename Src, typename Tgt>
@@ -201,7 +208,7 @@ struct type_casting_traits {
   };
 };
 
-// provides a succint template to define vectorized casting traits with respect to the largest accessible packet types
+// provides a succinct template to define vectorized casting traits with respect to the largest accessible packet types
 template <typename Src, typename Tgt>
 struct vectorized_type_casting_traits {
   enum : int {
@@ -246,6 +253,12 @@ struct preinterpret_generic<Packet, Packet, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& a) { return a; }
 };
 
+template <typename ComplexPacket>
+struct preinterpret_generic<typename unpacket_traits<ComplexPacket>::as_real, ComplexPacket, false> {
+  using RealPacket = typename unpacket_traits<ComplexPacket>::as_real;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealPacket run(const ComplexPacket& a) { return a.v; }
+};
+
 /** \internal \returns reinterpret_cast<Target>(a) */
 template <typename Target, typename Packet>
 EIGEN_DEVICE_FUNC inline Target preinterpret(const Packet& a) {
@@ -335,12 +348,9 @@ EIGEN_DEVICE_FUNC inline Packet psub(const Packet& a, const Packet& b) {
 /** \internal \returns -a (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pnegate(const Packet& a) {
-  return -a;
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline bool pnegate(const bool& a) {
-  return !a;
+  EIGEN_STATIC_ASSERT((!is_same<typename unpacket_traits<Packet>::type, bool>::value),
+                      NEGATE IS NOT DEFINED FOR BOOLEAN TYPES)
+  return numext::negate(a);
 }
 
 /** \internal \returns conj(a) (coeff-wise) */
@@ -365,8 +375,13 @@ template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) {
   return a / b;
 }
+// Avoid compiler warning for boolean algebra.
+template <>
+EIGEN_DEVICE_FUNC inline bool pdiv(const bool& a, const bool& b) {
+  return a && b;
+}
 
-// In the generic case, memset to all one bits.
+// In the generic packet case, memset to all one bits.
 template <typename Packet, typename EnableIf = void>
 struct ptrue_impl {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
@@ -376,19 +391,16 @@ struct ptrue_impl {
   }
 };
 
+// Use a value of one for scalars.
+template <typename Scalar>
+struct ptrue_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar&) { return Scalar(1); }
+};
+
 // For booleans, we can only directly set a valid `bool` value to avoid UB.
 template <>
 struct ptrue_impl<bool, void> {
-  static EIGEN_DEVICE_FUNC inline bool run(const bool& /*a*/) { return true; }
-};
-
-// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value).
-// Although this is technically not a valid bitmask, the scalar path for pselect
-// uses a comparison to zero, so this should still work in most cases. We don't
-// have another option, since the scalar type requires initialization.
-template <typename T>
-struct ptrue_impl<T, std::enable_if_t<is_scalar<T>::value && NumTraits<T>::RequireInitialization>> {
-  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { return T(1); }
+  static EIGEN_DEVICE_FUNC inline bool run(const bool&) { return true; }
 };
 
 /** \internal \returns one bits. */
@@ -397,7 +409,7 @@ EIGEN_DEVICE_FUNC inline Packet ptrue(const Packet& a) {
   return ptrue_impl<Packet>::run(a);
 }
 
-// In the general case, memset to zero.
+// In the general packet case, memset to zero.
 template <typename Packet, typename EnableIf = void>
 struct pzero_impl {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
@@ -420,74 +432,44 @@ EIGEN_DEVICE_FUNC inline Packet pzero(const Packet& a) {
   return pzero_impl<Packet>::run(a);
 }
 
-/** \internal \returns a <= b as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_le(const Packet& a, const Packet& b) {
-  return a <= b ? ptrue(a) : pzero(a);
-}
-
-/** \internal \returns a < b as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_lt(const Packet& a, const Packet& b) {
-  return a < b ? ptrue(a) : pzero(a);
-}
-
-/** \internal \returns a == b as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_eq(const Packet& a, const Packet& b) {
-  return a == b ? ptrue(a) : pzero(a);
-}
-
-/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) {
-  return a >= b ? pzero(a) : ptrue(a);
-}
-
 template <typename T>
 struct bit_and {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
 };
 
 template <typename T>
 struct bit_or {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; }
 };
 
 template <typename T>
 struct bit_xor {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; }
 };
 
 template <typename T>
 struct bit_not {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; }
 };
 
 template <>
 struct bit_and<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
-    return a && b;
-  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a && b; }
 };
 
 template <>
 struct bit_or<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
-    return a || b;
-  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a || b; }
 };
 
 template <>
 struct bit_xor<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const {
-    return a != b;
-  }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a != b; }
 };
 
 template <>
 struct bit_not<bool> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; }
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; }
 };
 
 // Use operators &, |, ^, ~.
@@ -576,14 +558,32 @@ EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) {
   return pand(a, pnot(b));
 }
 
-/** \internal \returns isnan(a) */
+/** \internal \returns a < b as a bit mask */
 template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pisnan(const Packet& a) {
-  return pandnot(ptrue(a), pcmp_eq(a, a));
+EIGEN_DEVICE_FUNC inline Packet pcmp_lt(const Packet& a, const Packet& b) {
+  return a < b ? ptrue(a) : pzero(a);
+}
+
+/** \internal \returns a == b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_eq(const Packet& a, const Packet& b) {
+  return a == b ? ptrue(a) : pzero(a);
+}
+
+/** \internal \returns a <= b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_le(const Packet& a, const Packet& b) {
+  return por(pcmp_eq(a, b), pcmp_lt(a, b));
+}
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) {
+  return a >= b ? pzero(a) : ptrue(a);
 }
 
 // In the general case, use bitwise select.
-template <typename Packet, typename EnableIf = void>
+template <typename Packet, bool is_scalar = is_scalar<Packet>::value>
 struct pselect_impl {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
     return por(pand(a, mask), pandnot(b, mask));
@@ -592,9 +592,9 @@ struct pselect_impl {
 
 // For scalars, use ternary select.
 template <typename Packet>
-struct pselect_impl<Packet, std::enable_if_t<is_scalar<Packet>::value>> {
+struct pselect_impl<Packet, true> {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
-    return numext::equal_strict(mask, Packet(0)) ? b : a;
+    return numext::select(mask, a, b);
   }
 };
 
@@ -611,7 +611,7 @@ EIGEN_DEVICE_FUNC inline bool pselect<bool>(const bool& cond, const bool& a, con
 
 /** \internal \returns the min or of \a a and \a b (coeff-wise)
     If either \a a or \a b are NaN, the result is implementation defined. */
-template <int NaNPropagation>
+template <int NaNPropagation, bool IsInteger>
 struct pminmax_impl {
   template <typename Packet, typename Op>
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
@@ -622,7 +622,7 @@ struct pminmax_impl {
 /** \internal \returns the min or max of \a a and \a b (coeff-wise)
     If either \a a or \a b are NaN, NaN is returned. */
 template <>
-struct pminmax_impl<PropagateNaN> {
+struct pminmax_impl<PropagateNaN, false> {
   template <typename Packet, typename Op>
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
     Packet not_nan_mask_a = pcmp_eq(a, a);
@@ -635,7 +635,7 @@ struct pminmax_impl<PropagateNaN> {
     If both \a a and \a b are NaN, NaN is returned.
     Equivalent to std::fmin(a, b).  */
 template <>
-struct pminmax_impl<PropagateNumbers> {
+struct pminmax_impl<PropagateNumbers, false> {
   template <typename Packet, typename Op>
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
     Packet not_nan_mask_a = pcmp_eq(a, a);
@@ -644,7 +644,7 @@ struct pminmax_impl<PropagateNumbers> {
   }
 };
 
-#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& a, const Type& b) { return Func(a, b); }
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& aa, const Type& bb) { return Func(aa, bb); }
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise).
     If \a a or \b b is NaN, the return value is implementation defined. */
@@ -657,7 +657,8 @@ EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
     NaNPropagation determines the NaN propagation semantics. */
 template <int NaNPropagation, typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
-  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
+  constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
+  return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
 }
 
 /** \internal \returns the max of \a a and \a b  (coeff-wise)
@@ -671,7 +672,8 @@ EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
     NaNPropagation determines the NaN propagation semantics. */
 template <int NaNPropagation, typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
-  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmax<Packet>)));
+  constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
+  return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmax<Packet>)));
 }
 
 /** \internal \returns the absolute value of \a a */
@@ -706,33 +708,21 @@ EIGEN_DEVICE_FUNC inline Packet parg(const Packet& a) {
 }
 
 /** \internal \returns \a a arithmetically shifted by N bits to the right */
-template <int N>
-EIGEN_DEVICE_FUNC inline int parithmetic_shift_right(const int& a) {
-  return a >> N;
-}
-template <int N>
-EIGEN_DEVICE_FUNC inline long int parithmetic_shift_right(const long int& a) {
-  return a >> N;
+template <int N, typename T>
+EIGEN_DEVICE_FUNC inline T parithmetic_shift_right(const T& a) {
+  return numext::arithmetic_shift_right(a, N);
 }
 
 /** \internal \returns \a a logically shifted by N bits to the right */
-template <int N>
-EIGEN_DEVICE_FUNC inline int plogical_shift_right(const int& a) {
-  return static_cast<int>(static_cast<unsigned int>(a) >> N);
-}
-template <int N>
-EIGEN_DEVICE_FUNC inline long int plogical_shift_right(const long int& a) {
-  return static_cast<long>(static_cast<unsigned long>(a) >> N);
+template <int N, typename T>
+EIGEN_DEVICE_FUNC inline T plogical_shift_right(const T& a) {
+  return numext::logical_shift_right(a, N);
 }
 
 /** \internal \returns \a a shifted by N bits to the left */
-template <int N>
-EIGEN_DEVICE_FUNC inline int plogical_shift_left(const int& a) {
-  return a << N;
-}
-template <int N>
-EIGEN_DEVICE_FUNC inline long int plogical_shift_left(const long int& a) {
-  return a << N;
+template <int N, typename T>
+EIGEN_DEVICE_FUNC inline T plogical_shift_left(const T& a) {
+  return numext::logical_shift_left(a, N);
 }
 
 /** \internal \returns the significant and exponent of the underlying floating point numbers
@@ -888,17 +878,29 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_trait
   return a;
 }
 
+template <typename Packet, typename EnableIf = void>  // generic (packet) implementation behind peven_mask()
+struct peven_mask_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet&) {  // argument is unused; it only fixes Packet
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    const size_t n = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+    for (size_t i = 0; i < n; ++i) {
+      memset(elements + i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));  // even index: every byte 0xff; odd index: zero
+    }
+    return ploadu<Packet>(elements);
+  }
+};
+
+template <typename Scalar>  // specialization selected when is_scalar<Scalar>::value is true
+struct peven_mask_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar&) { return Scalar(1); }  // yields Scalar(1), not an all-ones bit pattern
+};
+
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0),
      where x is the value of all 1-bits. */
 template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet peven_mask(const Packet& /*a*/) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  const size_t n = unpacket_traits<Packet>::size;
-  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
-  for (size_t i = 0; i < n; ++i) {
-    memset(elements + i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
-  }
-  return ploadu<Packet>(elements);
+EIGEN_DEVICE_FUNC inline Packet peven_mask(const Packet& a) {
+  return peven_mask_impl<Packet>::run(a);
 }
 
 /** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
@@ -1011,6 +1013,20 @@ EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) {
  * Special math functions
  ***************************/
 
+/** \internal \returns isnan(a) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pisnan(const Packet& a) {
+  return pandnot(ptrue(a), pcmp_eq(a, a));  // built from the self-comparison a == a, which fails only for NaN
+}
+
+/** \internal \returns isinf(a) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pisinf(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  constexpr Scalar inf = NumTraits<Scalar>::infinity();  // NOTE(review): requires Scalar to be usable in a constant expression
+  return pcmp_eq(pabs(a), pset1<Packet>(inf));  // compares |a| with +inf, so both +inf and -inf match
+}
+
 /** \internal \returns the sine of \a a (coeff-wise) */
 template <typename Packet>
 EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin(const Packet& a) {
@@ -1084,8 +1100,13 @@ EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh(const Packet&
 /** \internal \returns the exp of \a a (coeff-wise) */
 template <typename Packet>
 EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet& a) {
-  EIGEN_USING_STD(exp);
-  return exp(a);
+  return numext::exp(a);
+}
+
+/** \internal \returns the exp2 of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp2(const Packet& a) {
+  return numext::exp2(a);
 }
 
 /** \internal \returns the expm1 of \a a (coeff-wise) */
@@ -1114,11 +1135,12 @@ EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog10(const Packet&
   return log10(a);
 }
 
-/** \internal \returns the log10 of \a a (coeff-wise) */
+/** \internal \returns the log2 of \a a (coeff-wise) */
 template <typename Packet>
 EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2(const Packet& a) {
-  typedef typename internal::unpacket_traits<Packet>::type Scalar;
-  return pmul(pset1<Packet>(Scalar(EIGEN_LOG2E)), plog(a));
+  using Scalar = typename internal::unpacket_traits<Packet>::type;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  return pmul(pset1<Packet>(Scalar(RealScalar(EIGEN_LOG2E))), plog(a));
 }
 
 /** \internal \returns the square-root of \a a (coeff-wise) */
@@ -1133,33 +1155,45 @@ EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt(const Packet&
   return numext::cbrt(a);
 }
 
+template <typename Packet, bool IsScalar = is_scalar<Packet>::value,
+          bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
+struct nearest_integer_packetop_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return numext::floor(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return numext::ceil(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return numext::rint(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return numext::round(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return numext::trunc(x); }
+};
+
 /** \internal \returns the rounded value of \a a (coeff-wise) */
 template <typename Packet>
-EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pround(const Packet& a) {
-  using numext::round;
-  return round(a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pround(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_round(a);
 }
 
 /** \internal \returns the floor of \a a (coeff-wise) */
 template <typename Packet>
-EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pfloor(const Packet& a) {
-  using numext::floor;
-  return floor(a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pfloor(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_floor(a);
 }
 
 /** \internal \returns the rounded value of \a a (coeff-wise) with current
  * rounding mode */
 template <typename Packet>
-EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet print(const Packet& a) {
-  using numext::rint;
-  return rint(a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet print(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_rint(a);
 }
 
 /** \internal \returns the ceil of \a a (coeff-wise) */
 template <typename Packet>
-EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) {
-  using numext::ceil;
-  return ceil(a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pceil(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_ceil(a);
+}
+
+/** \internal \returns the truncation of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ptrunc(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_trunc(a);
 }
 
 template <typename Packet, typename EnableIf = void>
@@ -1227,26 +1261,46 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
   typedef typename unpacket_traits<Packet>::type Scalar;
-  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<PropagateFast, Scalar>)));
-}
-
-template <int NaNPropagation, typename Packet>
-EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<Scalar>)));
 }
 
-/** \internal \returns the min of the elements of \a a */
+/** \internal \returns the max of the elements of \a a */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
   typedef typename unpacket_traits<Packet>::type Scalar;
-  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<PropagateFast, Scalar>)));
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>  // compile-time dispatch used by predux_min/predux_max<NaNPropagation>
+struct predux_min_max_helper_impl {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  static constexpr bool UsePredux_ = NaNPropagation == PropagateFast || NumTraits<Scalar>::IsInteger;  // true: forward to predux_min/predux_max
+  template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>  // overload enabled when UsePredux_ is false
+  static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
+    return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));  // reduce with the NaNPropagation-specific scalar pmin
+  }
+  template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>  // overload enabled when UsePredux_ is false
+  static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
+    return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));  // reduce with the NaNPropagation-specific scalar pmax
+  }
+  template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>  // overload enabled when UsePredux_ is true
+  static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
+    return predux_min(a);
+  }
+  template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>  // overload enabled when UsePredux_ is true
+  static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
+    return predux_max(a);
+  }
+};
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
+  return predux_min_max_helper_impl<NaNPropagation, Packet>::run_min(a);
 }
 
 template <int NaNPropagation, typename Packet>
 EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
+  return predux_min_max_helper_impl<NaNPropagation, Packet>::run_max(a);
 }
 
 #undef EIGEN_BINARY_OP_NAN_PROPAGATION
@@ -1277,29 +1331,61 @@ EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) {
  * The following functions might not have to be overwritten for vectorized types
  ***************************************************************************/
 
-// FMA instructions.
+template <typename Packet, typename EnableIf = void>  // generic multiply-add family composed from pmul/padd/psub/pnegate
+struct pmadd_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {  // a * b + c
+    return padd(pmul(a, b), c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {  // a * b - c
+    return psub(pmul(a, b), c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {  // c - a * b
+    return psub(c, pmul(a, b));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {  // -(a * b + c)
+    return pnegate(pmadd(a, b, c));  // pmadd here is the static member declared above in this struct
+  }
+};
+
+template <typename Scalar>  // specialization for scalar types with NumTraits<Scalar>::IsSigned; all variants go through numext::madd
+struct pmadd_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value && NumTraits<Scalar>::IsSigned>> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::madd<Scalar>(a, b, c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::madd<Scalar>(a, b, Scalar(-c));  // a * b - c expressed as madd(a, b, -c)
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::madd<Scalar>(Scalar(-a), b, c);  // -(a * b) + c expressed as madd(-a, b, c)
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return -Scalar(numext::madd<Scalar>(a, b, c));  // -(a * b + c): negate the madd result
+  }
+};
+
+// Multiply-add instructions.
 /** \internal \returns a * b + c (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
-  return padd(pmul(a, b), c);
+  return pmadd_impl<Packet>::pmadd(a, b, c);
 }
 
 /** \internal \returns a * b - c (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {
-  return psub(pmul(a, b), c);
+  return pmadd_impl<Packet>::pmsub(a, b, c);
 }
 
 /** \internal \returns -(a * b) + c (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {
-  return padd(pnegate(pmul(a, b)), c);
+  return pmadd_impl<Packet>::pnmadd(a, b, c);
 }
 
-/** \internal \returns -(a * b) - c (coeff-wise) */
+/** \internal \returns -(a * b + c) (coeff-wise) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
-  return psub(pnegate(pmul(a, b)), c);
+  return pmadd_impl<Packet>::pnmsub(a, b, c);
 }
 
 /** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned
@@ -1508,6 +1594,107 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) {
   return (Packet)pand(result, peven_mask(result));  // atan2 0    atan2 0    ...
 }
 
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined. \a *from does not need to be aligned, and can be null if \a count is zero.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                              Index count) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  constexpr Index PacketSize = unpacket_traits<Packet>::size;
+  eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+  Scalar aux[PacketSize] = {};  // this generic version value-initializes the lanes outside the range
+  for (Index k = begin; k < begin + count; k++) {
+    aux[k] = from[k];  // lane k is read from from[k]: `from` addresses lane 0, not lane `begin`
+  }
+  return ploadu<Packet>(aux);
+}
+
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined. \a *from must be aligned, and cannot be null.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                             Index count) {
+  return ploaduSegment<Packet>(from, begin, count);  // generic version simply forwards to the unaligned load
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined. \a *to does not need to be aligned, and can be
+null if \a count is zero.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  constexpr Index PacketSize = unpacket_traits<Packet>::size;
+  eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+  Scalar aux[PacketSize];
+  pstoreu<Scalar, Packet>(aux, from);  // spill the whole packet to a local buffer first
+  for (Index k = begin; k < begin + count; k++) {
+    to[k] = aux[k];  // this generic version writes only to[begin, begin + count); lane k goes to to[k]
+  }
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined. \a *to must be aligned, and cannot be
+null.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  return pstoreuSegment(to, from, begin, count);  // generic version simply forwards to the unaligned store
+}
+
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined.*/
+template <typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                              Index count) {
+  constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+  if (Alignment >= RequiredAlignment) {  // both operands are compile-time constants
+    return ploadSegment<Packet>(from, begin, count);  // Alignment template argument meets the packet's requirement
+  } else {
+    return ploaduSegment<Packet>(from, begin, count);  // otherwise use the unaligned variant
+  }
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined.*/
+template <typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+  if (Alignment >= RequiredAlignment) {  // both operands are compile-time constants
+    pstoreSegment<Scalar, Packet>(to, from, begin, count);  // Alignment template argument meets the packet's requirement
+  } else {
+    pstoreuSegment<Scalar, Packet>(to, from, begin, count);  // otherwise use the unaligned variant
+  }
+}
+
+#ifndef EIGEN_NO_IO
+
+template <typename Packet>  // wrapper that snapshots a packet's lanes so they can be written to a std::ostream
+class StreamablePacket {
+ public:
+  using Scalar = typename unpacket_traits<Packet>::type;
+  StreamablePacket(const Packet& packet) { pstoreu(v_, packet); }  // copies the lanes into v_ at construction
+
+  friend std::ostream& operator<<(std::ostream& os, const StreamablePacket& packet) {  // prints "{v0,v1,...}"
+    os << "{" << packet.v_[0];
+    for (int i = 1; i < unpacket_traits<Packet>::size; ++i) {
+      os << "," << packet.v_[i];
+    }
+    os << "}";
+    return os;
+  }
+
+ private:
+  Scalar v_[unpacket_traits<Packet>::size];  // one slot per packet lane
+};
+
+/**
+ * \internal \returns an intermediary that can be used to ostream packets, e.g. for debugging.
+ */
+template <typename Packet>
+StreamablePacket<Packet> postream(const Packet& packet) {
+  return StreamablePacket<Packet>(packet);  // usage sketch: os << postream(p)
+}
+
+#endif  // EIGEN_NO_IO
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index f0ae5a8..df1098e 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -76,6 +76,7 @@ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf, scalar_erf_op, error function,\sa ArrayBas
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc, scalar_erfc_op, complement error function,\sa ArrayBase::erfc)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri, scalar_ndtri_op, inverse normal distribution function,\sa ArrayBase::ndtri)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp, scalar_exp_op, exponential,\sa ArrayBase::exp)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp2, scalar_exp2_op, exponential,\sa ArrayBase::exp2)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1, scalar_expm1_op, exponential of a value minus 1,\sa ArrayBase::expm1)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log, scalar_log_op, natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p, scalar_log1p_op, natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
@@ -98,9 +99,12 @@ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint, scalar_rint_op,
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round, scalar_round_op,
                                  nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
-    floor, scalar_floor_op, nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
+    floor, scalar_floor_op, nearest integer not greater than the given value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
-    ceil, scalar_ceil_op, nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+    ceil, scalar_ceil_op, nearest integer not less than the given value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(trunc, scalar_trunc_op,
+                                 nearest integer not greater in magnitude than the given value,\sa Eigen::trunc DOXCOMMA
+                                     ArrayBase::trunc)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
     isnan, scalar_isnan_op, not -a - number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
 EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index ca5f247..0a1b583 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -68,8 +68,8 @@ struct IOFormat {
     // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
     // don't add rowSpacer if columns are not to be aligned
     if ((flags & DontAlignCols)) return;
-    int i = int(matSuffix.length()) - 1;
-    while (i >= 0 && matSuffix[i] != '\n') {
+    int i = int(matPrefix.length()) - 1;
+    while (i >= 0 && matPrefix[i] != '\n') {
       rowSpacer += ' ';
       i--;
     }
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index b90ecb1..10562c1 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -20,8 +20,8 @@ namespace internal {
 template <typename XprType, typename RowIndices, typename ColIndices>
 struct traits<IndexedView<XprType, RowIndices, ColIndices>> : traits<XprType> {
   enum {
-    RowsAtCompileTime = int(array_size<RowIndices>::value),
-    ColsAtCompileTime = int(array_size<ColIndices>::value),
+    RowsAtCompileTime = int(IndexedViewHelper<RowIndices>::SizeAtCompileTime),
+    ColsAtCompileTime = int(IndexedViewHelper<ColIndices>::SizeAtCompileTime),
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
 
@@ -30,8 +30,8 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices>> : traits<XprType> {
                  : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1) ? 0
                                                                             : XprTypeIsRowMajor,
 
-    RowIncr = int(get_compile_time_incr<RowIndices>::value),
-    ColIncr = int(get_compile_time_incr<ColIndices>::value),
+    RowIncr = int(IndexedViewHelper<RowIndices>::IncrAtCompileTime),
+    ColIncr = int(IndexedViewHelper<ColIndices>::IncrAtCompileTime),
     InnerIncr = IsRowMajor ? ColIncr : RowIncr,
     OuterIncr = IsRowMajor ? RowIncr : ColIncr,
 
@@ -47,24 +47,23 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices>> : traits<XprType> {
                     is_same<AllRange<InnerSize>, std::conditional_t<XprTypeIsRowMajor, ColIndices, RowIndices>>::value,
 
     InnerStrideAtCompileTime =
-        InnerIncr < 0 || InnerIncr == DynamicIndex || XprInnerStride == Dynamic || InnerIncr == UndefinedIncr
+        InnerIncr < 0 || InnerIncr == DynamicIndex || XprInnerStride == Dynamic || InnerIncr == Undefined
             ? Dynamic
             : XprInnerStride * InnerIncr,
     OuterStrideAtCompileTime =
-        OuterIncr < 0 || OuterIncr == DynamicIndex || XprOuterstride == Dynamic || OuterIncr == UndefinedIncr
+        OuterIncr < 0 || OuterIncr == DynamicIndex || XprOuterstride == Dynamic || OuterIncr == Undefined
             ? Dynamic
             : XprOuterstride * OuterIncr,
 
-    ReturnAsScalar = is_same<RowIndices, SingleRange>::value && is_same<ColIndices, SingleRange>::value,
+    ReturnAsScalar = is_single_range<RowIndices>::value && is_single_range<ColIndices>::value,
     ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
     ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
 
     // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
     // but this is too strict regarding negative strides...
-    DirectAccessMask =
-        (int(InnerIncr) != UndefinedIncr && int(OuterIncr) != UndefinedIncr && InnerIncr >= 0 && OuterIncr >= 0)
-            ? DirectAccessBit
-            : 0,
+    DirectAccessMask = (int(InnerIncr) != Undefined && int(OuterIncr) != Undefined && InnerIncr >= 0 && OuterIncr >= 0)
+                           ? DirectAccessBit
+                           : 0,
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
     FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
@@ -153,10 +152,10 @@ class IndexedViewImpl : public internal::generic_xpr_base<IndexedView<XprType, R
       : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices) {}
 
   /** \returns number of rows */
-  Index rows() const { return internal::index_list_size(m_rowIndices); }
+  Index rows() const { return IndexedViewHelper<RowIndices>::size(m_rowIndices); }
 
   /** \returns number of columns */
-  Index cols() const { return internal::index_list_size(m_colIndices); }
+  Index cols() const { return IndexedViewHelper<ColIndices>::size(m_colIndices); }
 
   /** \returns the nested expression */
   const internal::remove_all_t<XprType>& nestedExpression() const { return m_xpr; }
@@ -198,16 +197,16 @@ class IndexedViewImpl<XprType, RowIndices, ColIndices, StorageKind, true>
   IndexedViewImpl(XprType& xpr, const T0& rowIndices, const T1& colIndices) : Base(xpr, rowIndices, colIndices) {}
 
   Index rowIncrement() const {
-    if (traits<Derived>::RowIncr != DynamicIndex && traits<Derived>::RowIncr != UndefinedIncr) {
+    if (traits<Derived>::RowIncr != DynamicIndex && traits<Derived>::RowIncr != Undefined) {
       return traits<Derived>::RowIncr;
     }
-    return get_runtime_incr(this->rowIndices());
+    return IndexedViewHelper<RowIndices>::incr(this->rowIndices());
   }
   Index colIncrement() const {
-    if (traits<Derived>::ColIncr != DynamicIndex && traits<Derived>::ColIncr != UndefinedIncr) {
+    if (traits<Derived>::ColIncr != DynamicIndex && traits<Derived>::ColIncr != Undefined) {
       return traits<Derived>::ColIncr;
     }
-    return get_runtime_incr(this->colIndices());
+    return IndexedViewHelper<ColIndices>::incr(this->colIndices());
   }
 
   Index innerIncrement() const { return traits<Derived>::IsRowMajor ? colIncrement() : rowIncrement(); }
@@ -226,14 +225,14 @@ class IndexedViewImpl<XprType, RowIndices, ColIndices, StorageKind, true>
     return this->nestedExpression().data() + row_offset + col_offset;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept {
     if (traits<Derived>::InnerStrideAtCompileTime != Dynamic) {
       return traits<Derived>::InnerStrideAtCompileTime;
     }
     return innerIncrement() * this->nestedExpression().innerStride();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept {
     if (traits<Derived>::OuterStrideAtCompileTime != Dynamic) {
       return traits<Derived>::OuterStrideAtCompileTime;
     }
@@ -309,6 +308,12 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
   const XprType& m_xpr;
 };
 
+// Catch assignments to an IndexedView.
+template <typename ArgType, typename RowIndices, typename ColIndices>
+struct evaluator_assume_aliasing<IndexedView<ArgType, RowIndices, ColIndices>> {
+  static const bool value = true;  // NOTE(review): presumably forces evaluation through a temporary; confirm against the primary template
+};
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/InnerProduct.h b/Eigen/src/Core/InnerProduct.h
new file mode 100644
index 0000000..9849d9b
--- /dev/null
+++ b/Eigen/src/Core/InnerProduct.h
@@ -0,0 +1,254 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INNER_PRODUCT_EVAL_H
+#define EIGEN_INNER_PRODUCT_EVAL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// recursively searches for the largest simd type that does not exceed Size, or the smallest if no such type exists
+template <typename Scalar, int Size, typename Packet = typename packet_traits<Scalar>::type,
+          bool Stop =
+              (unpacket_traits<Packet>::size <= Size) || is_same<Packet, typename unpacket_traits<Packet>::half>::value>
+struct find_inner_product_packet_helper;  // Stop: Packet fits in Size, or Packet is its own half (no smaller type left)
+
+template <typename Scalar, int Size, typename Packet>
+struct find_inner_product_packet_helper<Scalar, Size, Packet, false> {  // keep searching: recurse on the half packet
+  using type = typename find_inner_product_packet_helper<Scalar, Size, typename unpacket_traits<Packet>::half>::type;
+};
+
+template <typename Scalar, int Size, typename Packet>
+struct find_inner_product_packet_helper<Scalar, Size, Packet, true> {  // search finished: use this packet
+  using type = Packet;
+};
+
+template <typename Scalar, int Size>  // fixed Size: run the recursive search starting from packet_traits<Scalar>::type
+struct find_inner_product_packet : find_inner_product_packet_helper<Scalar, Size> {};
+
+template <typename Scalar>  // Dynamic size: no search, take packet_traits<Scalar>::type directly
+struct find_inner_product_packet<Scalar, Dynamic> {
+  using type = typename packet_traits<Scalar>::type;
+};
+
+template <typename Lhs, typename Rhs>  // compile-time and (debug-only) run-time operand checks for inner products
+struct inner_product_assert {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Lhs)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Rhs)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Lhs, Rhs)
+#ifndef EIGEN_NO_DEBUG
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, const Rhs& rhs) {
+    eigen_assert((lhs.size() == rhs.size()) && "Inner product: lhs and rhs vectors must have same size");
+  }
+#else
+  static EIGEN_DEVICE_FUNC void run(const Lhs&, const Rhs&) {}  // no-op when EIGEN_NO_DEBUG is defined
+#endif
+};
+
+template <typename Func, typename Lhs, typename Rhs>  // pairs evaluators of both operands with the combining functor Func
+struct inner_product_evaluator {
+  static constexpr int LhsFlags = evaluator<Lhs>::Flags;
+  static constexpr int RhsFlags = evaluator<Rhs>::Flags;
+  static constexpr int SizeAtCompileTime = size_prefer_fixed(Lhs::SizeAtCompileTime, Rhs::SizeAtCompileTime);
+  static constexpr int MaxSizeAtCompileTime =
+      min_size_prefer_fixed(Lhs::MaxSizeAtCompileTime, Rhs::MaxSizeAtCompileTime);
+  static constexpr int LhsAlignment = evaluator<Lhs>::Alignment;
+  static constexpr int RhsAlignment = evaluator<Rhs>::Alignment;
+
+  using Scalar = typename Func::result_type;
+  using Packet = typename find_inner_product_packet<Scalar, SizeAtCompileTime>::type;
+
+  static constexpr bool Vectorize =  // needs packet access on both operands and in Func, plus a packet no larger than the max size
+      bool(LhsFlags & RhsFlags & PacketAccessBit) && Func::PacketAccess &&
+      ((MaxSizeAtCompileTime == Dynamic) || (unpacket_traits<Packet>::size <= MaxSizeAtCompileTime));
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit inner_product_evaluator(const Lhs& lhs, const Rhs& rhs,
+                                                                         Func func = Func())
+      : m_func(func), m_lhs(lhs), m_rhs(rhs), m_size(lhs.size()) {  // size is taken from lhs
+    inner_product_assert<Lhs, Rhs>::run(lhs, rhs);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_size.value(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index index) const {  // first term: Func applied to the pair at `index`
+    return m_func.coeff(m_lhs.coeff(index), m_rhs.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& value, Index index) const {  // accumulating form: `value` is passed through to Func
+    return m_func.coeff(value, m_lhs.coeff(index), m_rhs.coeff(index));
+  }
+
+  template <typename PacketType, int LhsMode = LhsAlignment, int RhsMode = RhsAlignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {  // packet counterpart of coeff(index)
+    return m_func.packet(m_lhs.template packet<LhsMode, PacketType>(index),
+                         m_rhs.template packet<RhsMode, PacketType>(index));
+  }
+
+  template <typename PacketType, int LhsMode = LhsAlignment, int RhsMode = RhsAlignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(const PacketType& value, Index index) const {  // packet counterpart of coeff(value, index)
+    return m_func.packet(value, m_lhs.template packet<LhsMode, PacketType>(index),
+                         m_rhs.template packet<RhsMode, PacketType>(index));
+  }
+
+  const Func m_func;
+  const evaluator<Lhs> m_lhs;
+  const evaluator<Rhs> m_rhs;
+  const variable_if_dynamic<Index, SizeAtCompileTime> m_size;
+};
+
+template <typename Evaluator, bool Vectorize = Evaluator::Vectorize>
+struct inner_product_impl;
+
+// scalar loop
+template <typename Evaluator>
+struct inner_product_impl<Evaluator, false> {
+  using Scalar = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval) {
+    const Index size = eval.size();
+    if (size == 0) return Scalar(0);  // empty operands yield Scalar(0)
+
+    Scalar result = eval.coeff(0);  // seed with the first term instead of starting from zero
+    for (Index k = 1; k < size; k++) {
+      result = eval.coeff(result, k);  // fold term k into the running result
+    }
+
+    return result;
+  }
+};
+
+// vector loop
+template <typename Evaluator>
+struct inner_product_impl<Evaluator, true> {
+  using UnsignedIndex = std::make_unsigned_t<Index>;
+  using Scalar = typename Evaluator::Scalar;
+  using Packet = typename Evaluator::Packet;
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval) {
+    const UnsignedIndex size = static_cast<UnsignedIndex>(eval.size());
+    if (size < PacketSize) return inner_product_impl<Evaluator, false>::run(eval);  // too short for one packet: use the scalar loop
+
+    const UnsignedIndex packetEnd = numext::round_down(size, PacketSize);  // presumably size rounded down to a multiple of PacketSize
+    const UnsignedIndex quadEnd = numext::round_down(size, 4 * PacketSize);  // presumably size rounded down to a multiple of 4 packets
+    const UnsignedIndex numPackets = size / PacketSize;
+    const UnsignedIndex numRemPackets = (packetEnd - quadEnd) / PacketSize;  // whole packets left after the 4-packet groups
+
+    Packet presult0, presult1, presult2, presult3;  // four accumulators; presult1..3 are set only when enough packets exist
+
+    presult0 = eval.template packet<Packet>(0 * PacketSize);
+    if (numPackets >= 2) presult1 = eval.template packet<Packet>(1 * PacketSize);
+    if (numPackets >= 3) presult2 = eval.template packet<Packet>(2 * PacketSize);
+    if (numPackets >= 4) {
+      presult3 = eval.template packet<Packet>(3 * PacketSize);
+
+      for (UnsignedIndex k = 4 * PacketSize; k < quadEnd; k += 4 * PacketSize) {  // main loop: four packets per iteration
+        presult0 = eval.packet(presult0, k + 0 * PacketSize);
+        presult1 = eval.packet(presult1, k + 1 * PacketSize);
+        presult2 = eval.packet(presult2, k + 2 * PacketSize);
+        presult3 = eval.packet(presult3, k + 3 * PacketSize);
+      }
+
+      if (numRemPackets >= 1) presult0 = eval.packet(presult0, quadEnd + 0 * PacketSize);  // leftover whole packets past quadEnd
+      if (numRemPackets >= 2) presult1 = eval.packet(presult1, quadEnd + 1 * PacketSize);
+      if (numRemPackets == 3) presult2 = eval.packet(presult2, quadEnd + 2 * PacketSize);
+
+      presult2 = padd(presult2, presult3);
+    }
+
+    if (numPackets >= 3) presult1 = padd(presult1, presult2);  // guards mirror the ones that initialized each accumulator
+    if (numPackets >= 2) presult0 = padd(presult0, presult1);
+
+    Scalar result = predux(presult0);  // reduce the combined accumulator to a scalar
+    for (UnsignedIndex k = packetEnd; k < size; k++) {  // scalar tail for elements past the last whole packet
+      result = eval.coeff(result, k);
+    }
+
+    return result;
+  }
+};
+
+template <typename Scalar, bool Conj>
+struct conditional_conj;  // only the Conj == true / Conj == false specialisations are defined
+
+template <typename Scalar>
+struct conditional_conj<Scalar, true> {  // Conj == true: the argument is conjugated
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& a) { return numext::conj(a); }
+  template <typename Packet>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& a) {
+    return pconj(a);  // packet counterpart of the numext::conj call in coeff()
+  }
+};
+
+template <typename Scalar>
+struct conditional_conj<Scalar, false> {  // Conj == false: the argument is returned unchanged
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& a) { return a; }
+  template <typename Packet>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& a) {
+    return a;  // identity on packets as well
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool Conj>
+struct scalar_inner_product_op {  // primary template: LhsScalar and RhsScalar may be different types
+  using result_type = typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType;
+  using conj_helper = conditional_conj<LhsScalar, Conj>;  // applied to the left operand only
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type coeff(const LhsScalar& a, const RhsScalar& b) const {
+    return (conj_helper::coeff(a) * b);  // single term, no accumulator
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type coeff(const result_type& accum, const LhsScalar& a,
+                                                          const RhsScalar& b) const {
+    return (conj_helper::coeff(a) * b) + accum;  // term added to the running value
+  }
+  static constexpr bool PacketAccess = false;  // this primary template defines no packet() members
+};
+
+template <typename Scalar, bool Conj>
+struct scalar_inner_product_op<Scalar, Scalar, Conj> {  // both operands share one Scalar type
+  using result_type = Scalar;
+  using conj_helper = conditional_conj<Scalar, Conj>;  // applied to the left operand only
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& a, const Scalar& b) const {
+    return pmul(conj_helper::coeff(a), b);  // single term, no accumulator
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& accum, const Scalar& a, const Scalar& b) const {
+    return pmadd(conj_helper::coeff(a), b, accum);  // term combined with the running value via pmadd
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& a, const Packet& b) const {
+    return pmul(conj_helper::packet(a), b);  // packet form of the two-argument coeff()
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& accum, const Packet& a, const Packet& b) const {
+    return pmadd(conj_helper::packet(a), b, accum);  // packet form of the three-argument coeff()
+  }
+  static constexpr bool PacketAccess = packet_traits<Scalar>::HasMul && packet_traits<Scalar>::HasAdd;
+};
+
+template <typename Lhs, typename Rhs, bool Conj>
+struct default_inner_product_impl {  // wires operand types, functor and evaluator together for one inner product
+  using LhsScalar = typename traits<Lhs>::Scalar;
+  using RhsScalar = typename traits<Rhs>::Scalar;
+  using Op = scalar_inner_product_op<LhsScalar, RhsScalar, Conj>;
+  using Evaluator = inner_product_evaluator<Op, Lhs, Rhs>;
+  using result_type = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type run(const MatrixBase<Lhs>& a, const MatrixBase<Rhs>& b) {
+    Evaluator eval(a.derived(), b.derived(), Op());  // pairs both operands with a default-constructed Op
+    return inner_product_impl<Evaluator>::run(eval);  // scalar or packet loop, chosen by Evaluator::Vectorize
+  }
+};
+
+template <typename Lhs, typename Rhs>
+struct dot_impl : default_inner_product_impl<Lhs, Rhs, true> {};  // Conj == true: Lhs coefficients are conjugated
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_INNER_PRODUCT_EVAL_H
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index cfb3b20..79fc3ab 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -51,8 +51,8 @@ class Inverse : public InverseImpl<XprType, typename internal::traits<XprType>::
 
   explicit EIGEN_DEVICE_FUNC Inverse(const XprType& xpr) : m_xpr(xpr) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.cols(); }  // taken from the nested cols()
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.rows(); }  // taken from the nested rows()
 
   EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
 
@@ -92,7 +92,7 @@ struct unary_evaluator<Inverse<ArgType> > : public evaluator<typename Inverse<Ar
 
   enum { Flags = Base::Flags | EvalBeforeNestingBit };
 
-  unary_evaluator(const InverseType& inv_xpr) : m_result(inv_xpr.rows(), inv_xpr.cols()) {
+  EIGEN_DEVICE_FUNC unary_evaluator(const InverseType& inv_xpr) : m_result(inv_xpr.rows(), inv_xpr.cols()) {
     internal::construct_at<Base>(this, m_result);
     internal::call_assignment_no_alias(m_result, inv_xpr);
   }
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index df7b7ca..c740da7 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -102,11 +102,11 @@ class Map : public MapBase<Map<PlainObjectType, MapOptions, StrideType> > {
   typedef PointerType PointerArgType;
   EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {  // m_stride.inner(), or 1 if the compile-time value is 0
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
            : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic
                ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index da95b5c..5e3d746 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -84,9 +84,9 @@ class MapBase<Derived, ReadOnlyAccessors> : public internal::dense_xpr_base<Deri
   typedef typename Base::CoeffReturnType CoeffReturnType;
 
   /** \copydoc DenseBase::rows() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_rows.value(); }
   /** \copydoc DenseBase::cols() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_cols.value(); }
 
   /** Returns a pointer to the first coefficient of the matrix or vector.
    *
@@ -94,7 +94,7 @@ class MapBase<Derived, ReadOnlyAccessors> : public internal::dense_xpr_base<Deri
    *
    * \sa innerStride(), outerStride()
    */
-  EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_data; }
 
   /** \copydoc PlainObjectBase::coeff(Index,Index) const */
   EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index rowId, Index colId) const {
@@ -233,8 +233,8 @@ class MapBase<Derived, WriteAccessors> : public MapBase<Derived, ReadOnlyAccesso
 
   typedef std::conditional_t<internal::is_lvalue<Derived>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
 
-  EIGEN_DEVICE_FUNC inline const Scalar* data() const { return this->m_data; }
-  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() {
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return this->m_data; }
+  EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() {
     return this->m_data;
   }  // no const-cast here so non-const-correct code will give a compile error
 
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 3f28068..155fdad 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -170,8 +170,8 @@ struct imag_ref_default_impl {
 
 template <typename Scalar>
 struct imag_ref_default_impl<Scalar, false> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Scalar run(Scalar&) { return Scalar(0); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline const Scalar run(const Scalar&) { return Scalar(0); }
+  EIGEN_DEVICE_FUNC constexpr static Scalar run(Scalar&) { return Scalar(0); }
+  EIGEN_DEVICE_FUNC constexpr static const Scalar run(const Scalar&) { return Scalar(0); }
 };
 
 template <typename Scalar>
@@ -182,6 +182,40 @@ struct imag_ref_retval {
   typedef typename NumTraits<Scalar>::Real& type;
 };
 
+}  // namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);  // forwards to the run() named by EIGEN_MATHFUNC_IMPL(real, ...)
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline internal::add_const_on_value_type_t<EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)> real_ref(
+    const Scalar& x) {
+  return internal::real_ref_impl<Scalar>::run(x);  // const overload: return type goes through add_const_on_value_type_t
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);  // non-const overload: takes and forwards a mutable x
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);  // forwards to the run() named by EIGEN_MATHFUNC_IMPL(imag, ...)
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Scalar& mask, const Scalar& a, const Scalar& b) {
+  return numext::is_exactly_zero(mask) ? b : a;  // b when mask is exactly zero, a otherwise
+}
+
+}  // namespace numext
+
+namespace internal {
+
 /****************************************************************************
  * Implementation of conj                                                 *
  ****************************************************************************/
@@ -221,7 +255,9 @@ template <typename Scalar>
 struct abs2_impl_default<Scalar, true>  // IsComplex
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { return x.real() * x.real() + x.imag() * x.imag(); }
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    return numext::real(x) * numext::real(x) + numext::imag(x) * numext::imag(x);  // real^2 + imag^2
+  }
 };
 
 template <typename Scalar>
@@ -250,16 +286,14 @@ struct sqrt_impl {
 };
 
 // Complex sqrt defined in MathFunctionsImpl.h.
-template <typename T>
-EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& a_x);
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_sqrt(const ComplexT& a_x);
 
 // Custom implementation is faster than `std::sqrt`, works on
 // GPU, and correctly handles special cases (unlike MSVC).
 template <typename T>
 struct sqrt_impl<std::complex<T>> {
-  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x) {
-    return complex_sqrt<T>(x);
-  }
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x) { return complex_sqrt(x); }
 };
 
 template <typename Scalar>
@@ -272,13 +306,13 @@ template <typename T>
 struct rsqrt_impl;
 
 // Complex rsqrt defined in MathFunctionsImpl.h.
-template <typename T>
-EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& a_x);
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_rsqrt(const ComplexT& a_x);
 
 template <typename T>
 struct rsqrt_impl<std::complex<T>> {
   EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x) {
-    return complex_rsqrt<T>(x);
+    return complex_rsqrt(x);
   }
 };
 
@@ -299,7 +333,7 @@ struct norm1_default_impl<Scalar, true> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
   EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
     EIGEN_USING_STD(abs);
-    return abs(x.real()) + abs(x.imag());
+    return abs(numext::real(x)) + abs(numext::imag(x));
   }
 };
 
@@ -469,8 +503,8 @@ struct expm1_retval {
  ****************************************************************************/
 
 // Complex log defined in MathFunctionsImpl.h.
-template <typename T>
-EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z);
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z);
 
 template <typename Scalar>
 struct log_impl {
@@ -604,7 +638,6 @@ template <typename BitsType, typename EnableIf = void>
 struct count_bits_impl {
   static_assert(std::is_integral<BitsType>::value && std::is_unsigned<BitsType>::value,
                 "BitsType must be an unsigned integer");
-
   static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
     int n = CHAR_BIT * sizeof(BitsType);
     int shift = n / 2;
@@ -655,9 +688,9 @@ EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
 #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
 
 template <typename BitsType>
-struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned int)>> {
+struct count_bits_impl<
+    BitsType, std::enable_if_t<std::is_integral<BitsType>::value && sizeof(BitsType) <= sizeof(unsigned int)>> {
   static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
-  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
   static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
     static constexpr int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT;
     return bits == 0 ? kNumBits : __builtin_clz(static_cast<unsigned int>(bits)) - kLeadingBitsOffset;
@@ -669,10 +702,10 @@ struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(uns
 };
 
 template <typename BitsType>
-struct count_bits_impl<
-    BitsType, std::enable_if_t<sizeof(unsigned int) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(unsigned long)>> {
+struct count_bits_impl<BitsType,
+                       std::enable_if_t<std::is_integral<BitsType>::value && sizeof(unsigned int) < sizeof(BitsType) &&
+                                        sizeof(BitsType) <= sizeof(unsigned long)>> {
   static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
-  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
   static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
     static constexpr int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT;
     return bits == 0 ? kNumBits : __builtin_clzl(static_cast<unsigned long>(bits)) - kLeadingBitsOffset;
@@ -684,10 +717,10 @@ struct count_bits_impl<
 };
 
 template <typename BitsType>
-struct count_bits_impl<BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) &&
-                                                  sizeof(BitsType) <= sizeof(unsigned long long)>> {
+struct count_bits_impl<BitsType,
+                       std::enable_if_t<std::is_integral<BitsType>::value && sizeof(unsigned long) < sizeof(BitsType) &&
+                                        sizeof(BitsType) <= sizeof(unsigned long long)>> {
   static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
-  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
   static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
     static constexpr int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT;
     return bits == 0 ? kNumBits : __builtin_clzll(static_cast<unsigned long long>(bits)) - kLeadingBitsOffset;
@@ -701,9 +734,9 @@ struct count_bits_impl<BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof
 #elif EIGEN_COMP_MSVC
 
 template <typename BitsType>
-struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(unsigned long)>> {
+struct count_bits_impl<
+    BitsType, std::enable_if_t<std::is_integral<BitsType>::value && sizeof(BitsType) <= sizeof(unsigned long)>> {
   static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
-  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
   static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
     unsigned long out;
     _BitScanReverse(&out, static_cast<unsigned long>(bits));
@@ -720,10 +753,10 @@ struct count_bits_impl<BitsType, std::enable_if_t<sizeof(BitsType) <= sizeof(uns
 #ifdef _WIN64
 
 template <typename BitsType>
-struct count_bits_impl<
-    BitsType, std::enable_if_t<sizeof(unsigned long) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(__int64)>> {
+struct count_bits_impl<BitsType,
+                       std::enable_if_t<std::is_integral<BitsType>::value && sizeof(unsigned long) < sizeof(BitsType) &&
+                                        sizeof(BitsType) <= sizeof(__int64)>> {
   static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
-  static_assert(std::is_integral<BitsType>::value, "BitsType must be a built-in integer");
   static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
     unsigned long out;
     _BitScanReverse64(&out, static_cast<unsigned __int64>(bits));
@@ -742,192 +775,27 @@ struct count_bits_impl<
 #endif  // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
 
 template <typename BitsType>
-int log2_ceil(BitsType x) {
-  int n = CHAR_BIT * sizeof(BitsType) - clz(x);
-  bool powerOfTwo = (x & (x - 1)) == 0;
-  return x == 0 ? 0 : powerOfTwo ? n - 1 : n;
-}
-
-template <typename BitsType>
-int log2_floor(BitsType x) {
-  int n = CHAR_BIT * sizeof(BitsType) - clz(x);
-  return x == 0 ? 0 : n - 1;
-}
-
-/****************************************************************************
- * Implementation of random                                               *
- ****************************************************************************/
-
-// return a Scalar filled with numRandomBits beginning from the least significant bit
-template <typename Scalar>
-Scalar getRandomBits(int numRandomBits) {
-  using BitsType = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
-  enum : int {
-    StdRandBits = meta_floor_log2<(unsigned int)(RAND_MAX) + 1>::value,
-    ScalarBits = sizeof(Scalar) * CHAR_BIT
-  };
-  eigen_assert((numRandomBits >= 0) && (numRandomBits <= ScalarBits));
-  const BitsType mask = BitsType(-1) >> ((ScalarBits - numRandomBits) & (ScalarBits - 1));
-  BitsType randomBits = BitsType(0);
-  for (int shift = 0; shift < numRandomBits; shift += StdRandBits) {
-    int r = std::rand();
-    randomBits |= static_cast<BitsType>(r) << shift;
+struct log_2_impl {
+  static constexpr int kTotalBits = sizeof(BitsType) * CHAR_BIT;
+  static EIGEN_DEVICE_FUNC inline int run_ceil(const BitsType& x) {
+    const int n = kTotalBits - clz(x);  // total bit count minus clz(x)
+    bool power_of_two = (x & (x - 1)) == 0;  // true when x has at most one bit set
+    return x == 0 ? 0 : power_of_two ? (n - 1) : n;  // 0 for x == 0; n - 1 for a single set bit; n otherwise
   }
-  // clear the excess bits
-  randomBits &= mask;
-  return numext::bit_cast<Scalar, BitsType>(randomBits);
-}
-
-template <typename Scalar, bool IsComplex, bool IsInteger>
-struct random_default_impl {};
-
-template <typename Scalar>
-struct random_impl : random_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
-
-template <typename Scalar>
-struct random_retval {
-  typedef Scalar type;
-};
-
-template <typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y);
-template <typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
-
-template <typename Scalar>
-struct random_default_impl<Scalar, false, false> {
-  using BitsType = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y, int numRandomBits) {
-    Scalar half_x = Scalar(0.5) * x;
-    Scalar half_y = Scalar(0.5) * y;
-    Scalar result = (half_x + half_y) + (half_y - half_x) * run(numRandomBits);
-    // result is in the half-open interval [x, y) -- provided that x < y
-    return result;
-  }
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
-    const int mantissa_bits = NumTraits<Scalar>::digits() - 1;
-    return run(x, y, mantissa_bits);
-  }
-  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
-    const int mantissa_bits = NumTraits<Scalar>::digits() - 1;
-    eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissa_bits);
-    BitsType randomBits = getRandomBits<BitsType>(numRandomBits);
-    // if fewer than MantissaBits is requested, shift them to the left
-    randomBits <<= (mantissa_bits - numRandomBits);
-    // randomBits is in the half-open interval [2,4)
-    randomBits |= numext::bit_cast<BitsType>(Scalar(2));
-    // result is in the half-open interval [-1,1)
-    Scalar result = numext::bit_cast<Scalar>(randomBits) - Scalar(3);
-    return result;
-  }
-  static EIGEN_DEVICE_FUNC inline Scalar run() {
-    const int mantissa_bits = NumTraits<Scalar>::digits() - 1;
-    return run(mantissa_bits);
-  }
-};
-
-// TODO: fix this for PPC
-template <bool Specialize = sizeof(long double) == 2 * sizeof(uint64_t) && !EIGEN_ARCH_PPC>
-struct random_longdouble_impl {
-  enum : int {
-    Size = sizeof(long double),
-    MantissaBits = NumTraits<long double>::digits() - 1,
-    LowBits = MantissaBits > 64 ? 64 : MantissaBits,
-    HighBits = MantissaBits > 64 ? MantissaBits - 64 : 0
-  };
-  static EIGEN_DEVICE_FUNC inline long double run() {
-    EIGEN_USING_STD(memcpy)
-    uint64_t randomBits[2];
-    long double result = 2.0L;
-    memcpy(&randomBits, &result, Size);
-    randomBits[0] |= getRandomBits<uint64_t>(LowBits);
-    randomBits[1] |= getRandomBits<uint64_t>(HighBits);
-    memcpy(&result, &randomBits, Size);
-    result -= 3.0L;
-    return result;
-  }
-};
-
-// GPUs treat long double as double.
-#ifndef EIGEN_GPU_COMPILE_PHASE
-template <>
-struct random_longdouble_impl<false> {
-  using Impl = random_impl<double>;
-  static EIGEN_DEVICE_FUNC inline long double run() { return static_cast<long double>(Impl::run()); }
-};
-
-template <>
-struct random_impl<long double> {
-  static EIGEN_DEVICE_FUNC inline long double run(const long double& x, const long double& y) {
-    long double half_x = 0.5L * x;
-    long double half_y = 0.5L * y;
-    long double result = (half_x + half_y) + (half_y - half_x) * run();
-    return result;
-  }
-  static EIGEN_DEVICE_FUNC inline long double run() { return random_longdouble_impl<>::run(); }
-};
-#endif
-
-template <typename Scalar>
-struct random_default_impl<Scalar, false, true> {
-  using BitsType = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
-  enum : int { ScalarBits = sizeof(Scalar) * CHAR_BIT };
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
-    if (y <= x) return x;
-    const BitsType range = static_cast<BitsType>(y) - static_cast<BitsType>(x) + 1;
-    // handle edge case where [x,y] spans the entire range of Scalar
-    if (range == 0) return getRandomBits<Scalar>(ScalarBits);
-    // calculate the number of random bits needed to fill range
-    const int numRandomBits = log2_ceil(range);
-    BitsType randomBits;
-    do {
-      randomBits = getRandomBits<BitsType>(numRandomBits);
-      // if the random draw is outside [0, range), try again (rejection sampling)
-      // in the worst-case scenario, the probability of rejection is: 1/2 - 1/2^numRandomBits < 50%
-    } while (randomBits >= range);
-    // Avoid overflow in the case where `x` is negative and there is a large range so
-    // `randomBits` would also be negative if cast to `Scalar` first.
-    Scalar result = static_cast<Scalar>(static_cast<BitsType>(x) + randomBits);
-    return result;
-  }
-
-  static EIGEN_DEVICE_FUNC inline Scalar run() {
-#ifdef EIGEN_MAKING_DOCS
-    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
-#else
-    return getRandomBits<Scalar>(ScalarBits);
-#endif
+  static EIGEN_DEVICE_FUNC inline int run_floor(const BitsType& x) {
+    const int n = kTotalBits - clz(x);  // total bit count minus clz(x)
+    return x == 0 ? 0 : n - 1;  // 0 for x == 0, otherwise n - 1
   }
 };
 
-template <>
-struct random_impl<bool> {
-  static EIGEN_DEVICE_FUNC inline bool run(const bool& x, const bool& y) {
-    if (y <= x) return x;
-    return run();
-  }
-  static EIGEN_DEVICE_FUNC inline bool run() { return getRandomBits<int>(1) ? true : false; }
-};
-
-template <typename Scalar>
-struct random_default_impl<Scalar, true, false> {
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
-    return Scalar(random(x.real(), y.real()), random(x.imag(), y.imag()));
-  }
-  static EIGEN_DEVICE_FUNC inline Scalar run() {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    return Scalar(random<RealScalar>(), random<RealScalar>());
-  }
-};
-
-template <typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y) {
-  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
+template <typename BitsType>
+int log2_ceil(const BitsType& x) {
+  return log_2_impl<BitsType>::run_ceil(x);  // thin wrapper over log_2_impl<BitsType>::run_ceil
 }
 
-template <typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() {
-  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
+template <typename BitsType>
+int log2_floor(const BitsType& x) {
+  return log_2_impl<BitsType>::run_floor(x);  // thin wrapper over log_2_impl<BitsType>::run_floor
 }
 
 // Implementation of is* functions
@@ -964,8 +832,8 @@ EIGEN_DEVICE_FUNC std::enable_if_t<(std::numeric_limits<T>::has_infinity && !Num
 
 template <typename T>
 EIGEN_DEVICE_FUNC
-    std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
-    isnan_impl(const T&) {
+std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
+isnan_impl(const T&) {
   return false;
 }
 
@@ -1012,7 +880,7 @@ struct sign_impl<Scalar, true, IsInteger> {
     real_type aa = abs(a);
     if (aa == real_type(0)) return Scalar(0);
     aa = real_type(1) / aa;
-    return Scalar(a.real() * aa, a.imag() * aa);
+    return Scalar(numext::real(a) * aa, numext::imag(a) * aa);
   }
 };
 
@@ -1027,6 +895,25 @@ struct sign_retval {
   typedef Scalar type;
 };
 
+// suppress "unary minus operator applied to unsigned type, result still unsigned" warnings on MSVC
+// note: `0 - a` is distinct from `-a` when Scalar is a floating point type and `a` is zero
+
+template <typename Scalar, bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct negate_impl {  // primary template; the IsInteger == true case has its own specialisation
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar run(const Scalar& a) { return -a; }
+};
+
+template <typename Scalar>
+struct negate_impl<Scalar, true> {  // IsInteger == true: computed as Scalar(0) - a instead of unary minus
+  EIGEN_STATIC_ASSERT((!is_same<Scalar, bool>::value), NEGATE IS NOT DEFINED FOR BOOLEAN TYPES)
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar run(const Scalar& a) { return Scalar(0) - a; }
+};
+
+template <typename Scalar>
+struct negate_retval {  // exposes Scalar itself as the nested `type`
+  typedef Scalar type;
+};
+
 template <typename Scalar, bool IsInteger = NumTraits<typename unpacket_traits<Scalar>::type>::IsInteger>
 struct nearest_integer_impl {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_floor(const Scalar& x) {
@@ -1041,6 +928,9 @@ struct nearest_integer_impl {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_round(const Scalar& x) {
     EIGEN_USING_STD(round) return round(x);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) {
+    EIGEN_USING_STD(trunc) return trunc(x);  // same call shape as the run_round member above, using trunc
+  }
 };
 template <typename Scalar>
 struct nearest_integer_impl<Scalar, true> {
@@ -1048,8 +938,82 @@ struct nearest_integer_impl<Scalar, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_ceil(const Scalar& x) { return x; }
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_rint(const Scalar& x) { return x; }
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_round(const Scalar& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) { return x; }  // x is returned as is
 };
 
+// Extra namespace to prevent leaking std::fma into Eigen::internal.
+namespace has_fma_detail {
+
+template <typename T, typename EnableIf = void>
+struct has_fma_impl : public std::false_type {};  // fallback when the specialisation below does not apply
+
+using std::fma;  // lets the unqualified fma(...) in the decltype below also consider std::fma
+
+template <typename T>
+struct has_fma_impl<
+    T, std::enable_if_t<std::is_same<T, decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()))>::value>>
+    : public std::true_type {};  // chosen when fma(T, T, T) is well-formed and its result type is exactly T
+
+}  // namespace has_fma_detail
+
+template <typename T>
+struct has_fma : public has_fma_detail::has_fma_impl<T> {};  // inherits true_type/false_type from has_fma_impl<T>
+
+// Default implementation: declares no run(); it only carries a static_assert on has_fma<T>::value.
+template <typename T, typename Enable = void>
+struct fma_impl {
+  static_assert(has_fma<T>::value, "No function fma(...) for type.  Please provide an implementation.");
+};
+
+// STD or ADL version if it exists (enabled when has_fma<T>::value is true).
+template <typename T>
+struct fma_impl<T, std::enable_if_t<has_fma<T>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T run(const T& a, const T& b, const T& c) {
+    using std::fma;  // the unqualified call below may resolve to std::fma or to an overload found by ADL
+    return fma(a, b, c);
+  }
+};
+
+#if defined(EIGEN_GPUCC)  // explicit float/double specialisations, compiled only when EIGEN_GPUCC is defined
+template <>
+struct has_fma<float> : public true_type {};  // forces has_fma<float> to true in this configuration
+
+template <>
+struct fma_impl<float, void> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float run(const float& a, const float& b, const float& c) {
+    return ::fmaf(a, b, c);  // global-namespace fmaf for float
+  }
+};
+
+template <>
+struct has_fma<double> : public true_type {};  // forces has_fma<double> to true in this configuration
+
+template <>
+struct fma_impl<double, void> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double run(const double& a, const double& b, const double& c) {
+    return ::fma(a, b, c);  // global-namespace fma for double
+  }
+};
+#endif
+
+// Basic multiply-add: evaluates x * y + z with the type's ordinary operators.
+template <typename Scalar, typename EnableIf = void>
+struct madd_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& x, const Scalar& y, const Scalar& z) {
+    return x * y + z;
+  }
+};
+
+// Use FMA if there is a single CPU instruction (this specialisation exists only under EIGEN_VECTORIZE_FMA).
+#ifdef EIGEN_VECTORIZE_FMA
+template <typename Scalar>
+struct madd_impl<Scalar, std::enable_if_t<has_fma<Scalar>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& x, const Scalar& y, const Scalar& z) {
+    return fma_impl<Scalar>::run(x, y, z);  // same operands, routed through fma_impl instead of x * y + z
+  }
+};
+#endif
+
 }  // end namespace internal
 
 /****************************************************************************
@@ -1185,27 +1149,6 @@ SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
 
 #endif
 
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline internal::add_const_on_value_type_t<EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)> real_ref(
-    const Scalar& x) {
-  return internal::real_ref_impl<Scalar>::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
-}
-
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
@@ -1232,6 +1175,11 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(sign, Scalar) sign(const Scalar&
   return EIGEN_MATHFUNC_IMPL(sign, Scalar)::run(x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(negate, Scalar) negate(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(negate, Scalar)::run(x);
+}
+
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
@@ -1334,17 +1282,26 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar round(const Scalar& x) {
   return internal::nearest_integer_impl<Scalar>::run_round(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
-#endif
-
 template <typename Scalar>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar(floor)(const Scalar& x) {
   return internal::nearest_integer_impl<Scalar>::run_floor(x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar(ceil)(const Scalar& x) {
+  return internal::nearest_integer_impl<Scalar>::run_ceil(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar(trunc)(const Scalar& x) {
+  return internal::nearest_integer_impl<Scalar>::run_trunc(x);
+}
+
 #if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
 SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(trunc, trunc)
 #endif
 
 #if defined(EIGEN_GPUCC)
@@ -1352,52 +1309,61 @@ template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float& x) {
   return ::floorf(x);
 }
-
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double floor(const double& x) {
   return ::floor(x);
 }
-#endif
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar(ceil)(const Scalar& x) {
-  return internal::nearest_integer_impl<Scalar>::run_ceil(x);
-}
-
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
-#endif
-
-#if defined(EIGEN_GPUCC)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float& x) {
   return ::ceilf(x);
 }
-
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double ceil(const double& x) {
   return ::ceil(x);
 }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float trunc(const float& x) {
+  return ::truncf(x);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double trunc(const double& x) {
+  return ::trunc(x);
+}
 #endif
 
 // Integer division with rounding up.
 // T is assumed to be an integer type with a>=0, and b>0
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T div_ceil(T a, T b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
   EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
-  eigen_assert(a >= 0);
-  eigen_assert(b > 0);
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedT ub = UnsignedT(b);
   // Note: This form is used because it cannot overflow.
-  return a == 0 ? 0 : (a - 1) / b + 1;
+  return ua == 0 ? 0 : (ua - 1) / ub + 1;
+}
+
+// Integer round down to nearest multiple of b
+// T and U are assumed to be integer types with a>=0, and b>0
+template <typename T, typename U>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T round_down(T a, U b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
+  using UnsignedU = typename internal::make_unsigned<U>::type;
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  EIGEN_STATIC_ASSERT((NumTraits<U>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedU ub = UnsignedU(b);
+  return ub * (ua / ub);
 }
 
 /** Log base 2 for 32 bits positive integers.
  * Conveniently returns 0 for x==0. */
-inline int log2(int x) {
-  eigen_assert(x >= 0);
+constexpr int log2(int x) {
   unsigned int v(x);
-  static const int table[32] = {0, 9,  1,  10, 13, 21, 2,  29, 11, 14, 16, 18, 22, 25, 3, 30,
-                                8, 12, 20, 28, 15, 17, 24, 7,  19, 27, 23, 6,  26, 5,  4, 31};
+  constexpr int table[32] = {0, 9,  1,  10, 13, 21, 2,  29, 11, 14, 16, 18, 22, 25, 3, 30,
+                             8, 12, 20, 28, 15, 17, 24, 7,  19, 27, 23, 6,  26, 5,  4, 31};
   v |= v >> 1;
   v |= v >> 2;
   v |= v >> 4;
@@ -1432,11 +1398,17 @@ SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
 
 /** \returns the cube root of \a x. **/
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cbrt(const T& x) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!NumTraits<T>::IsComplex, T> cbrt(const T& x) {
   EIGEN_USING_STD(cbrt);
   return static_cast<T>(cbrt(x));
 }
 
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsComplex, T> cbrt(const T& x) {
+  EIGEN_USING_STD(pow);
+  return pow(x, typename NumTraits<T>::Real(1) / typename NumTraits<T>::Real(3));  // 1/3 evaluated in Real precision
+}
+
 /** \returns the reciprocal square root of \a x. **/
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T rsqrt(const T& x) {
@@ -1465,17 +1437,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double& x) {
 #endif
 
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-    std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
-    abs(const T& x) {
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
+abs(const T& x) {
   EIGEN_USING_STD(abs);
   return abs(x);
 }
 
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-    std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
-    abs(const T& x) {
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
+abs(const T& x) {
   return x;
 }
 
@@ -1592,6 +1564,63 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(const std::comple
 }
 #endif
 
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp2(const T& x) {
+  EIGEN_USING_STD(exp2);
+  return exp2(x);
+}
+
+// MSVC screws up some edge-cases for std::exp2(complex).
+#ifdef EIGEN_COMP_MSVC
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<RealScalar> exp2(const std::complex<RealScalar>& x) {
+  EIGEN_USING_STD(exp);
+  // If z is (x,±∞) (for any finite x), the result is (NaN,NaN) and FE_INVALID is raised.
+  // If z is (x,NaN) (for any finite x), the result is (NaN,NaN) and FE_INVALID may be raised.
+  if ((isfinite)(real_ref(x)) && !(isfinite)(imag_ref(x))) {
+    return std::complex<RealScalar>(NumTraits<RealScalar>::quiet_NaN(), NumTraits<RealScalar>::quiet_NaN());
+  }
+  // If z is (+∞,±∞), the result is (±∞,NaN) and FE_INVALID is raised (the sign of the real part is unspecified)
+  // If z is (+∞,NaN), the result is (±∞,NaN) (the sign of the real part is unspecified)
+  if ((real_ref(x) == NumTraits<RealScalar>::infinity() && !(isfinite)(imag_ref(x)))) {
+    return std::complex<RealScalar>(NumTraits<RealScalar>::infinity(), NumTraits<RealScalar>::quiet_NaN());
+  }
+  return exp(x * static_cast<RealScalar>(EIGEN_LN2));  // 2^z == e^(z*ln2); exp2(x) here would call this overload again
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp2, exp2)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp2(const float& x) {
+  return ::exp2f(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp2(const double& x) {
+  return ::exp2(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp2(const std::complex<float>& x) {
+  float com = ::exp2f(x.real());
+  float res_real = com * ::cosf(static_cast<float>(EIGEN_LN2) * x.imag());
+  float res_imag = com * ::sinf(static_cast<float>(EIGEN_LN2) * x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp2(const std::complex<double>& x) {
+  double com = ::exp2(x.real());
+  double res_real = com * ::cos(static_cast<double>(EIGEN_LN2) * x.imag());
+  double res_imag = com * ::sin(static_cast<double>(EIGEN_LN2) * x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
@@ -1881,6 +1910,35 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double fmod(const double& a, const double&
 #undef SYCL_SPECIALIZE_BINARY_FUNC
 #endif
 
+template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar logical_shift_left(const Scalar& a, int n) {
+  return a << n;
+}
+
+template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar logical_shift_right(const Scalar& a, int n) {
+  using UnsignedScalar = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
+  return bit_cast<Scalar, UnsignedScalar>(bit_cast<UnsignedScalar, Scalar>(a) >> n);
+}
+
+template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar arithmetic_shift_right(const Scalar& a, int n) {
+  using SignedScalar = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
+  return bit_cast<Scalar, SignedScalar>(bit_cast<SignedScalar, Scalar>(a) >> n);
+}
+
+// fma(x, y, z): forwards to internal::fma_impl<Scalar>.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar fma(const Scalar& x, const Scalar& y, const Scalar& z) {
+  return internal::fma_impl<Scalar>::run(x, y, z);
+}
+
+// Multiply-add.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar madd(const Scalar& x, const Scalar& y, const Scalar& z) {
+  return internal::madd_impl<Scalar>::run(x, y, z);
+}
+
 }  // end namespace numext
 
 namespace internal {
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 689c6d8..c4b5da3 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -23,12 +23,12 @@ namespace internal {
  Preconditions:
    1. The starting guess provided in approx_a_recip must have at least half
       the leading mantissa bits in the correct result, such that a single
-      Newton-Raphson step is sufficient to get within 1-2 ulps of the currect
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the correct
       result.
    2. If a is zero, approx_a_recip must be infinite with the same sign as a.
    3. If a is infinite, approx_a_recip must be zero with the same sign as a.
 
-   If the preconditions are satisfied, which they are for for the _*_rcp_ps
+   If the preconditions are satisfied, which they are for the _*_rcp_ps
    instructions on x86, the result has a maximum relative error of 2 ulps,
    and correctly handles reciprocals of zero, infinity, and NaN.
 */
@@ -61,12 +61,12 @@ struct generic_reciprocal_newton_step<Packet, 0> {
  Preconditions:
    1. The starting guess provided in approx_a_recip must have at least half
       the leading mantissa bits in the correct result, such that a single
-      Newton-Raphson step is sufficient to get within 1-2 ulps of the currect
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the correct
       result.
    2. If a is zero, approx_a_recip must be infinite with the same sign as a.
    3. If a is infinite, approx_a_recip must be zero with the same sign as a.
 
-   If the preconditions are satisfied, which they are for for the _*_rcp_ps
+   If the preconditions are satisfied, which they are for the _*_rcp_ps
    instructions on x86, the result has a maximum relative error of 2 ulps,
    and correctly handles zero, infinity, and NaN. Positive denormals are
    treated as zero.
@@ -76,7 +76,7 @@ struct generic_rsqrt_newton_step {
   static_assert(Steps > 0, "Steps must be at least 1.");
   using Scalar = typename unpacket_traits<Packet>::type;
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_rsqrt) {
-    constexpr Scalar kMinusHalf = Scalar(-1) / Scalar(2);
+    const Scalar kMinusHalf = Scalar(-1) / Scalar(2);
     const Packet cst_minus_half = pset1<Packet>(kMinusHalf);
     const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
 
@@ -112,11 +112,11 @@ struct generic_rsqrt_newton_step<Packet, 0> {
    1. The starting guess for the reciprocal sqrt provided in approx_rsqrt must
       have at least half the leading mantissa bits in the correct result, such
       that a single Newton-Raphson step is sufficient to get within 1-2 ulps of
-      the currect result.
+      the correct result.
    2. If a is zero, approx_rsqrt must be infinite.
    3. If a is infinite, approx_rsqrt must be zero.
 
-   If the preconditions are satisfied, which they are for for the _*_rsqrt_ps
+   If the preconditions are satisfied, which they are for the _*_rsqrt_ps
    instructions on x86, the result has a maximum relative error of 2 ulps,
    and correctly handles zero and infinity, and NaN. Positive denormal inputs
    are treated as zero.
@@ -171,8 +171,8 @@ struct hypot_impl {
 
 // Generic complex sqrt implementation that correctly handles corner cases
 // according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
-template <typename T>
-EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& z) {
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_sqrt(const ComplexT& z) {
   // Computes the principal sqrt of the input.
   //
   // For a complex square root of the number x + i*y. We want to find real
@@ -194,21 +194,21 @@ EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& z) {
   //   if x == 0: u = w, v = sign(y) * w
   //   if x > 0:  u = w, v = y / (2 * w)
   //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
-
+  using T = typename NumTraits<ComplexT>::Real;
   const T x = numext::real(z);
   const T y = numext::imag(z);
   const T zero = T(0);
   const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y)));
 
-  return (numext::isinf)(y)           ? std::complex<T>(NumTraits<T>::infinity(), y)
-         : numext::is_exactly_zero(x) ? std::complex<T>(w, y < zero ? -w : w)
-         : x > zero                   ? std::complex<T>(w, y / (2 * w))
-                                      : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w);
+  return (numext::isinf)(y)           ? ComplexT(NumTraits<T>::infinity(), y)
+         : numext::is_exactly_zero(x) ? ComplexT(w, y < zero ? -w : w)
+         : x > zero                   ? ComplexT(w, y / (2 * w))
+                                      : ComplexT(numext::abs(y) / (2 * w), y < zero ? -w : w);
 }
 
 // Generic complex rsqrt implementation.
-template <typename T>
-EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_rsqrt(const ComplexT& z) {
   // Computes the principal reciprocal sqrt of the input.
   //
   // For a complex reciprocal square root of the number z = x + i*y. We want to
@@ -230,7 +230,7 @@ EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
   //   if x == 0: u = w / |z|, v = -sign(y) * w / |z|
   //   if x > 0:  u = w / |z|, v = -y / (2 * w * |z|)
   //   if x < 0:  u = |y| / (2 * w * |z|), v = -sign(y) * w / |z|
-
+  using T = typename NumTraits<ComplexT>::Real;
   const T x = numext::real(z);
   const T y = numext::imag(z);
   const T zero = T(0);
@@ -239,20 +239,21 @@ EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
   const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z));
   const T woz = w / abs_z;
   // Corner cases consistent with 1/sqrt(z) on gcc/clang.
-  return numext::is_exactly_zero(abs_z) ? std::complex<T>(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
-         : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex<T>(zero, zero)
-         : numext::is_exactly_zero(x)                 ? std::complex<T>(woz, y < zero ? woz : -woz)
-         : x > zero                                   ? std::complex<T>(woz, -y / (2 * w * abs_z))
-                    : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz);
+  return numext::is_exactly_zero(abs_z)               ? ComplexT(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
+         : ((numext::isinf)(x) || (numext::isinf)(y)) ? ComplexT(zero, zero)
+         : numext::is_exactly_zero(x)                 ? ComplexT(woz, y < zero ? woz : -woz)
+         : x > zero                                   ? ComplexT(woz, -y / (2 * w * abs_z))
+                    : ComplexT(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz);
 }
 
-template <typename T>
-EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z) {
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z) {
   // Computes complex log.
+  using T = typename NumTraits<ComplexT>::Real;
   T a = numext::abs(z);
   EIGEN_USING_STD(atan2);
   T b = atan2(z.imag(), z.real());
-  return std::complex<T>(numext::log(a), b);
+  return ComplexT(numext::log(a), b);
 }
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index af6afaf..a2c8eba 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -125,7 +125,7 @@ struct traits<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
  * coefficients.</dd>
  *
  * <dt><b>\anchor fixedsize Fixed-size versus dynamic-size:</b></dt>
- * <dd>Fixed-size means that the numbers of rows and columns are known are compile-time. In this case, Eigen allocates
+ * <dd>Fixed-size means that the numbers of rows and columns are known at compile-time. In this case, Eigen allocates
  * the array of coefficients as a fixed-size array, as a class member. This makes sense for very small matrices,
  * typically up to 4x4, sometimes up to 16x16. Larger matrices should be declared as dynamic-size even if one happens to
  * know their size at compile-time.
@@ -139,7 +139,7 @@ struct traits<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
  * <dt><b>\anchor maxrows MaxRows_ and MaxCols_:</b></dt>
  * <dd>In most cases, one just leaves these parameters to the default values.
  * These parameters mean the maximum size of rows and columns that the matrix may have. They are useful in cases
- * when the exact numbers of rows and columns are not known are compile-time, but it is known at compile-time that they
+ * when the exact numbers of rows and columns are not known at compile-time, but it is known at compile-time that they
  * cannot exceed a certain value. This happens when taking dynamic-size blocks inside fixed-size matrices: in this case
  * MaxRows_ and MaxCols_ are the dimensions of the original matrix, while Rows_ and Cols_ are Dynamic.</dd>
  * </dl>
@@ -224,8 +224,6 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
     return Base::_set(other);
   }
 
-  /* Here, doxygen failed to copy the brief information when using \copydoc */
-
   /**
    * \brief Copies the generic expression \a other into *this.
    * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
@@ -250,24 +248,31 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
    *
    * \sa resize(Index,Index)
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix()
-      : Base(){EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED}
-
-        // FIXME is it still needed
-        EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit Matrix(
-            internal::constructor_without_unaligned_array_assert)
-      : Base(internal::constructor_without_unaligned_array_assert()){EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED}
-
-        EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix(Matrix && other)
-            EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
-      : Base(std::move(other)) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other)
-      EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+#if defined(EIGEN_INITIALIZE_COEFFS)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+#else
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix() = default;
+#endif
+  /** \brief Move constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix(Matrix&&) = default;
+  /** \brief Move assignment operator: moves the contents of \a other into \c *this.
+   *
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other) noexcept(
+      std::is_nothrow_move_assignable<Scalar>::value) {
     Base::operator=(std::move(other));
     return *this;
   }
 
-  /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&... args)
+  /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients.
+   *
+   * \only_for_vectors
+   *
+   * This constructor is for 1D array or vectors with more than 4 coefficients.
+   *
+   * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+   * constructor must match the fixed number of rows (resp. columns) of \c *this.
+   *
    *
    * Example: \include Matrix_variadic_ctor_cxx11.cpp
    * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
@@ -281,6 +286,7 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
 
   /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row.
    * \cpp11
+   * \anchor matrix_initializer_list
    *
    * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
    *
@@ -379,7 +385,7 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
   }
 
   /** \brief Copy constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix(const Matrix&) = default;
 
   /** \brief Copy constructor for generic expressions.
    * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
@@ -387,8 +393,8 @@ class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, Ma
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived>& other) : Base(other.derived()) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
 
   /////////// Geometry module ///////////
 
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 81d5a97..045993d 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -112,7 +112,7 @@ class MatrixBase : public DenseBase<Derived> {
                              ConstTransposeReturnType>
       AdjointReturnType;
   /** \internal Return type of eigenvalues() */
-  typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor>
+  typedef Matrix<internal::make_complex_t<Scalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor>
       EigenvaluesReturnType;
   /** \internal the return type of identity */
   typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>, PlainObject> IdentityReturnType;
@@ -280,7 +280,7 @@ class MatrixBase : public DenseBase<Derived> {
    * \sa isApprox(), operator!= */
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const {
-    return cwiseEqual(other).all();
+    return (this->rows() == other.rows()) && (this->cols() == other.cols()) && cwiseEqual(other).all();
   }
 
   /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@@ -289,7 +289,7 @@ class MatrixBase : public DenseBase<Derived> {
    * \sa isApprox(), operator== */
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const {
-    return cwiseNotEqual(other).any();
+    return !(*this == other);
   }
 
   NoAlias<Derived, Eigen::MatrixBase> EIGEN_DEVICE_FUNC noalias();
@@ -373,12 +373,14 @@ class MatrixBase : public DenseBase<Derived> {
   template <int Options = 0>
   inline JacobiSVD<PlainObject, Options> jacobiSvd() const;
   template <int Options = 0>
-  EIGEN_DEPRECATED inline JacobiSVD<PlainObject, Options> jacobiSvd(unsigned int computationOptions) const;
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using method's template parameter.")
+  inline JacobiSVD<PlainObject, Options> jacobiSvd(unsigned int computationOptions) const;
 
   template <int Options = 0>
   inline BDCSVD<PlainObject, Options> bdcSvd() const;
   template <int Options = 0>
-  EIGEN_DEPRECATED inline BDCSVD<PlainObject, Options> bdcSvd(unsigned int computationOptions) const;
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using method's template parameter.")
+  inline BDCSVD<PlainObject, Options> bdcSvd(unsigned int computationOptions) const;
 
   /////////// Geometry module ///////////
 
@@ -391,7 +393,8 @@ class MatrixBase : public DenseBase<Derived> {
 
   EIGEN_DEVICE_FUNC inline PlainObject unitOrthogonal(void) const;
 
-  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> eulerAngles(Index a0, Index a1, Index a2) const;
+  EIGEN_DEPRECATED_WITH_REASON("Use .canonicalEulerAngles() instead.")
+  EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> eulerAngles(Index a0, Index a1, Index a2) const;
 
   EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> canonicalEulerAngles(Index a0, Index a1, Index a2) const;
 
@@ -468,7 +471,7 @@ class MatrixBase : public DenseBase<Derived> {
   EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
   EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
   EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p)
-  EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
+  EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const internal::make_complex_t<Scalar>& p)
 
  protected:
   EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index ec360eb..2ce83a8 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -45,8 +45,8 @@ class NestByValue : public internal::dense_xpr_base<NestByValue<ExpressionType>
 
   EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
 
   EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 2848b78..bf41c3b 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -22,13 +22,13 @@ namespace internal {
 template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits; }
 };
 
 template <typename T>
 struct default_digits_impl<T, false, false>  // Floating point
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+  EIGEN_DEVICE_FUNC constexpr static int run() {
     using std::ceil;
     using std::log2;
     typedef typename NumTraits<T>::Real Real;
@@ -39,7 +39,7 @@ struct default_digits_impl<T, false, false>  // Floating point
 template <typename T>
 struct default_digits_impl<T, false, true>  // Integer
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
 };
 
 // default implementation of digits10(), based on numeric_limits if specialized,
@@ -47,13 +47,13 @@ struct default_digits_impl<T, false, true>  // Integer
 template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::digits10; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits10; }
 };
 
 template <typename T>
 struct default_digits10_impl<T, false, false>  // Floating point
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+  EIGEN_DEVICE_FUNC constexpr static int run() {
     using std::floor;
     using std::log10;
     typedef typename NumTraits<T>::Real Real;
@@ -64,7 +64,7 @@ struct default_digits10_impl<T, false, false>  // Floating point
 template <typename T>
 struct default_digits10_impl<T, false, true>  // Integer
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
 };
 
 // default implementation of max_digits10(), based on numeric_limits if specialized,
@@ -72,13 +72,13 @@ struct default_digits10_impl<T, false, true>  // Integer
 template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_max_digits10_impl {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits<T>::max_digits10; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::max_digits10; }
 };
 
 template <typename T>
 struct default_max_digits10_impl<T, false, false>  // Floating point
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() {
+  EIGEN_DEVICE_FUNC constexpr static int run() {
     using std::ceil;
     using std::log10;
     typedef typename NumTraits<T>::Real Real;
@@ -89,22 +89,35 @@ struct default_max_digits10_impl<T, false, false>  // Floating point
 template <typename T>
 struct default_max_digits10_impl<T, false, true>  // Integer
 {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; }
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
 };
 
 }  // end namespace internal
 
 namespace numext {
-/** \internal bit-wise cast without changing the underlying bit representation. */
 
-// TODO: Replace by std::bit_cast (available in C++20)
+/** \internal bit-wise cast without changing the underlying bit representation. */
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) {
+  return std::bit_cast<Tgt>(src);
+}
+#elif EIGEN_HAS_BUILTIN(__builtin_bit_cast)
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) {
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value, THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED)
+  return __builtin_bit_cast(Tgt, src);
+}
+#else
 template <typename Tgt, typename Src>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
   // The behaviour of memcpy is not specified for non-trivially copyable types
-  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED)
   EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
-                      THIS_TYPE_IS_NOT_SUPPORTED);
-  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
+                      THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED)
 
   Tgt tgt;
   // Load src into registers first. This allows the memcpy to be elided by CUDA.
@@ -113,8 +126,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
   memcpy(static_cast<void*>(&tgt), static_cast<const void*>(&staged), sizeof(Tgt));
   return tgt;
 }
+#endif
 }  // namespace numext
 
+// clang-format off
 /** \class NumTraits
  * \ingroup Core_Module
  *
@@ -126,48 +141,50 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
  *
  * The provided data consists of:
  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
- *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
+ *     then \c Real is just a typedef to \a T. If \a T is `std::complex<U>` then \c Real
  *     is a typedef to \a U.
  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
  *     only intended as a helper for code that needs to explicitly promote types.
- * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c
- * std::complex<U>, Literal is defined as \c U. Of course, this type must be fully compatible with \a T. In doubt, just
- * use \a T here. \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you
- * don't know what this means, just use \a T here. \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c
- * std::complex type, and to 0 otherwise. \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type
- * such as \c int, and to \c 0 otherwise. \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of
- * the number of CPU cycles needed to by move / add / mul instructions respectively, assuming the data is already stored
- * in CPU registers. Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just
- * use \c Eigen::HugeCost. \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T
- * is unsigned. \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type
- * \a T must be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1
- * otherwise. \li An epsilon() function which, unlike <a
- * href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>, it returns a
- * \a Real instead of a \a T. \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a
- * default value by the fuzzy comparison operators. \li highest() and lowest() functions returning the highest and
- * lowest possible values respectively. \li digits() function returning the number of radix digits (non-sign digits for
- * integers, mantissa for floating-point). This is the analogue of <a
- * href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">std::numeric_limits<T>::digits</a> which is used
- * as the default implementation if specialized. \li digits10() function returning the number of decimal digits that can
- * be represented without change. This is the analogue of <a
- * href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a> which is
- * used as the default implementation if specialized. \li max_digits10() function returning the number of decimal digits
- * required to uniquely represent all distinct values of the type. This is the analogue of <a
- * href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_digits10">std::numeric_limits<T>::max_digits10</a>
+ * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for
+ *     `std::complex<U>`, Literal is defined as \a U. Of course, this type must be fully compatible with \a T. In doubt,
+ *     just use \a T here.
+ * \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+ *     this means, just use \a T here.
+ * \li An enum value \c IsComplex. It is equal to 1 if \a T is a \c std::complex type, and to 0 otherwise.
+ * \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int, and to \c 0 otherwise.
+ * \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed by
+ *     move / add / mul instructions respectively, assuming the data is already stored in CPU registers. Stay vague here.
+ *     No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
+ * \li An enum value \c IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+ * \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must be
+ *     called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
+ * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">
+ *     `std::numeric_limits::epsilon()`</a>, returns a \c Real instead of a \a T.
+ * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default value by the fuzzy
+ *     comparison operators.
+ * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+ * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point).
+ *     This is the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">
+ *     `std::numeric_limits<T>::digits`</a> which is used as the default implementation if specialized.
+ * \li digits10() function returning the number of decimal digits that can be represented without change. This is the
+ *     analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">
+ *     `std::numeric_limits<T>::digits10`</a> which is used as the default implementation if specialized.
+ * \li max_digits10() function returning the number of decimal digits required to uniquely represent all distinct values
+ *     of the type. This is the analogue of <a
+ *     href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_digits10">`std::numeric_limits<T>::max_digits10`</a>
  *     which is used as the default implementation if specialized.
  * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively,
  *     such that the radix raised to the power exponent-1 is a normalized floating-point number.  These are equivalent
- * to <a
- * href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">std::numeric_limits<T>::min_exponent</a>/
- *     <a
- * href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">std::numeric_limits<T>::max_exponent</a>.
+ *     to <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">
+ *     `std::numeric_limits<T>::min_exponent`</a>/<a
+ *     href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">`std::numeric_limits<T>::max_exponent`</a>.
  * \li infinity() function returning a representation of positive infinity, if available.
- * \li quiet_NaN function returning a non-signaling "not-a-number", if available.
+ * \li quiet_NaN() function returning a non-signaling "not-a-number", if available.
  */
-
+// clang-format on
 template <typename T>
 struct GenericNumTraits {
   enum {
@@ -185,32 +202,30 @@ struct GenericNumTraits {
   typedef T Nested;
   typedef T Literal;
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return numext::numeric_limits<T>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return numext::numeric_limits<T>::epsilon(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return internal::default_digits10_impl<T>::run(); }
+  EIGEN_DEVICE_FUNC constexpr static int digits10() { return internal::default_digits10_impl<T>::run(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() {
-    return internal::default_max_digits10_impl<T>::run();
-  }
+  EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return internal::default_max_digits10_impl<T>::run(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits() { return internal::default_digits_impl<T>::run(); }
+  EIGEN_DEVICE_FUNC constexpr static int digits() { return internal::default_digits_impl<T>::run(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int min_exponent() { return numext::numeric_limits<T>::min_exponent; }
+  EIGEN_DEVICE_FUNC constexpr static int min_exponent() { return numext::numeric_limits<T>::min_exponent; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_exponent() { return numext::numeric_limits<T>::max_exponent; }
+  EIGEN_DEVICE_FUNC constexpr static int max_exponent() { return numext::numeric_limits<T>::max_exponent; }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() {
+  EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() {
     // make sure to override this for floating-point types
     return Real(0);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T highest() { return (numext::numeric_limits<T>::max)(); }
+  EIGEN_DEVICE_FUNC constexpr static T highest() { return (numext::numeric_limits<T>::max)(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T lowest() { return (numext::numeric_limits<T>::lowest)(); }
+  EIGEN_DEVICE_FUNC constexpr static T lowest() { return (numext::numeric_limits<T>::lowest)(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T infinity() { return numext::numeric_limits<T>::infinity(); }
+  EIGEN_DEVICE_FUNC constexpr static T infinity() { return numext::numeric_limits<T>::infinity(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); }
+  EIGEN_DEVICE_FUNC constexpr static T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); }
 };
 
 template <typename T>
@@ -218,25 +233,23 @@ struct NumTraits : GenericNumTraits<T> {};
 
 template <>
 struct NumTraits<float> : GenericNumTraits<float> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline float dummy_precision() { return 1e-5f; }
+  EIGEN_DEVICE_FUNC constexpr static float dummy_precision() { return 1e-5f; }
 };
 
 template <>
 struct NumTraits<double> : GenericNumTraits<double> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline double dummy_precision() { return 1e-12; }
+  EIGEN_DEVICE_FUNC constexpr static double dummy_precision() { return 1e-12; }
 };
 
 // GPU devices treat `long double` as `double`.
 #ifndef EIGEN_GPU_COMPILE_PHASE
 template <>
 struct NumTraits<long double> : GenericNumTraits<long double> {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double dummy_precision() {
-    return static_cast<long double>(1e-15l);
-  }
+  EIGEN_DEVICE_FUNC constexpr static long double dummy_precision() { return static_cast<long double>(1e-15l); }
 
 #if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
   // PowerPC double double causes issues with some values
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline long double epsilon() {
+  EIGEN_DEVICE_FUNC constexpr static long double epsilon() {
     // 2^(-(__LDBL_MANT_DIG__)+1)
     return static_cast<long double>(2.4651903288156618919116517665087e-32l);
   }
@@ -250,16 +263,17 @@ struct NumTraits<std::complex<Real_> > : GenericNumTraits<std::complex<Real_> >
   typedef typename NumTraits<Real_>::Literal Literal;
   enum {
     IsComplex = 1,
+    IsSigned = NumTraits<Real_>::IsSigned,
     RequireInitialization = NumTraits<Real_>::RequireInitialization,
     ReadCost = 2 * NumTraits<Real_>::ReadCost,
     AddCost = 2 * NumTraits<Real>::AddCost,
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return NumTraits<Real>::digits10(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int max_digits10() { return NumTraits<Real>::max_digits10(); }
+  EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC constexpr static int digits10() { return NumTraits<Real>::digits10(); }
+  EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return NumTraits<Real>::max_digits10(); }
 };
 
 template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -286,25 +300,19 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > {
                                                       : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar dummy_precision() {
-    return NumTraits<RealScalar>::dummy_precision();
-  }
+  EIGEN_DEVICE_FUNC constexpr static RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 
-  EIGEN_CONSTEXPR
-  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
-  EIGEN_CONSTEXPR
-  static inline int max_digits10() { return NumTraits<Scalar>::max_digits10(); }
+  constexpr static int digits10() { return NumTraits<Scalar>::digits10(); }
+  constexpr static int max_digits10() { return NumTraits<Scalar>::max_digits10(); }
 };
 
 template <>
 struct NumTraits<std::string> : GenericNumTraits<std::string> {
   enum { RequireInitialization = 1, ReadCost = HugeCost, AddCost = HugeCost, MulCost = HugeCost };
 
-  EIGEN_CONSTEXPR
-  static inline int digits10() { return 0; }
-  EIGEN_CONSTEXPR
-  static inline int max_digits10() { return 0; }
+  constexpr static int digits10() { return 0; }
+  constexpr static int max_digits10() { return 0; }
 
  private:
   static inline std::string epsilon();
diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h
index 7b2c8dc..1f638f9 100644
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -103,19 +103,36 @@ struct packetwise_redux_impl<Func, Evaluator, NoUnrolling> {
   EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size) {
     if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
 
-    const Index size4 = (size - 1) & (~3);
+    const Index size4 = 1 + numext::round_down(size - 1, 4);
     PacketType p = eval.template packetByOuterInner<Unaligned, PacketType>(0, 0);
-    Index i = 1;
     // This loop is optimized for instruction pipelining:
     // - each iteration generates two independent instructions
     // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
-    for (; i < size4; i += 4)
+    for (Index i = 1; i < size4; i += 4)
       p = func.packetOp(
           p, func.packetOp(func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 0, 0),
                                          eval.template packetByOuterInner<Unaligned, PacketType>(i + 1, 0)),
                            func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 2, 0),
                                          eval.template packetByOuterInner<Unaligned, PacketType>(i + 3, 0))));
-    for (; i < size; ++i) p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0));
+    for (Index i = size4; i < size; ++i)
+      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0));
+    return p;
+  }
+};
+
+template <typename Func, typename Evaluator>
+struct packetwise_segment_redux_impl {
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index begin,
+                                          Index count) {
+    if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
+
+    PacketType p = eval.template packetSegmentByOuterInner<Unaligned, PacketType>(0, 0, begin, count);
+    for (Index i = 1; i < size; ++i)
+      p = func.packetOp(p, eval.template packetSegmentByOuterInner<Unaligned, PacketType>(i, 0, begin, count));
     return p;
   }
 };
@@ -174,14 +191,13 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
 
   template <int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packet(Index idx) const {
-    enum { PacketSize = internal::unpacket_traits<PacketType>::size };
-    typedef Block<const ArgTypeNestedCleaned, Direction == Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
-                  Direction == Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime), true /* InnerPanel */>
-        PanelType;
-
-    PanelType panel(m_arg, Direction == Vertical ? 0 : idx, Direction == Vertical ? idx : 0,
-                    Direction == Vertical ? m_arg.rows() : Index(PacketSize),
-                    Direction == Vertical ? Index(PacketSize) : m_arg.cols());
+    static constexpr int PacketSize = internal::unpacket_traits<PacketType>::size;
+    static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : PacketSize;
+    static constexpr int PanelCols = Direction == Vertical ? PacketSize : ArgType::ColsAtCompileTime;
+    using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+    using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+    using BinaryOp = typename MemberOp::BinaryOp;
+    using Impl = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>;
 
     // FIXME
     // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of
@@ -189,11 +205,39 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
     // by pass "vectorization" in this case:
     if (PacketSize == 1) return internal::pset1<PacketType>(coeff(idx));
 
-    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
+    Index startRow = Direction == Vertical ? 0 : idx;
+    Index startCol = Direction == Vertical ? idx : 0;
+    Index numRows = Direction == Vertical ? m_arg.rows() : PacketSize;
+    Index numCols = Direction == Vertical ? PacketSize : m_arg.cols();
+
+    PanelType panel(m_arg, startRow, startCol, numRows, numCols);
+    PanelEvaluator panel_eval(panel);
+    PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize());
+    return p;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index i, Index j, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(Direction == Vertical ? j : i, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packetSegment(Index idx, Index begin, Index count) const {
+    static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : Dynamic;
+    static constexpr int PanelCols = Direction == Vertical ? Dynamic : ArgType::ColsAtCompileTime;
+    using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+    using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+    using BinaryOp = typename MemberOp::BinaryOp;
+    using Impl = internal::packetwise_segment_redux_impl<BinaryOp, PanelEvaluator>;
+
+    Index startRow = Direction == Vertical ? 0 : idx;
+    Index startCol = Direction == Vertical ? idx : 0;
+    Index numRows = Direction == Vertical ? m_arg.rows() : begin + count;
+    Index numCols = Direction == Vertical ? begin + count : m_arg.cols();
+
+    PanelType panel(m_arg, startRow, startCol, numRows, numCols);
     PanelEvaluator panel_eval(panel);
-    typedef typename MemberOp::BinaryOp BinaryOp;
-    PacketType p = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>::template run<PacketType>(
-        panel_eval, m_functor.binaryFunc(), m_arg.outerSize());
+    PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize(), begin, count);
     return p;
   }
 
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index 6945964..eb8e797 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -170,7 +170,7 @@ class PermutationBase : public EigenBase<Derived> {
    * \note \blank \note_try_to_help_rvo
    */
   inline InverseReturnType inverse() const { return InverseReturnType(derived()); }
-  /** \returns the tranpose permutation matrix.
+  /** \returns the transpose permutation matrix.
    *
    * \note \blank \note_try_to_help_rvo
    */
@@ -468,17 +468,17 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<IndicesType
 /** \returns the matrix with the permutation applied to the columns.
  */
 template <typename MatrixDerived, typename PermutationDerived>
-EIGEN_DEVICE_FUNC const Product<MatrixDerived, PermutationDerived, AliasFreeProduct> operator*(
+EIGEN_DEVICE_FUNC const Product<MatrixDerived, PermutationDerived, DefaultProduct> operator*(
     const MatrixBase<MatrixDerived>& matrix, const PermutationBase<PermutationDerived>& permutation) {
-  return Product<MatrixDerived, PermutationDerived, AliasFreeProduct>(matrix.derived(), permutation.derived());
+  return Product<MatrixDerived, PermutationDerived, DefaultProduct>(matrix.derived(), permutation.derived());
 }
 
 /** \returns the matrix with the permutation applied to the rows.
  */
 template <typename PermutationDerived, typename MatrixDerived>
-EIGEN_DEVICE_FUNC const Product<PermutationDerived, MatrixDerived, AliasFreeProduct> operator*(
+EIGEN_DEVICE_FUNC const Product<PermutationDerived, MatrixDerived, DefaultProduct> operator*(
     const PermutationBase<PermutationDerived>& permutation, const MatrixBase<MatrixDerived>& matrix) {
-  return Product<PermutationDerived, MatrixDerived, AliasFreeProduct>(permutation.derived(), matrix.derived());
+  return Product<PermutationDerived, MatrixDerived, DefaultProduct>(permutation.derived(), matrix.derived());
 }
 
 template <typename PermutationType>
@@ -520,16 +520,16 @@ class InverseImpl<PermutationType, PermutationStorage> : public EigenBase<Invers
   /** \returns the matrix with the inverse permutation applied to the columns.
    */
   template <typename OtherDerived>
-  friend const Product<OtherDerived, InverseType, AliasFreeProduct> operator*(const MatrixBase<OtherDerived>& matrix,
-                                                                              const InverseType& trPerm) {
-    return Product<OtherDerived, InverseType, AliasFreeProduct>(matrix.derived(), trPerm.derived());
+  friend const Product<OtherDerived, InverseType, DefaultProduct> operator*(const MatrixBase<OtherDerived>& matrix,
+                                                                            const InverseType& trPerm) {
+    return Product<OtherDerived, InverseType, DefaultProduct>(matrix.derived(), trPerm.derived());
   }
 
   /** \returns the matrix with the inverse permutation applied to the rows.
    */
   template <typename OtherDerived>
-  const Product<InverseType, OtherDerived, AliasFreeProduct> operator*(const MatrixBase<OtherDerived>& matrix) const {
-    return Product<InverseType, OtherDerived, AliasFreeProduct>(derived(), matrix.derived());
+  const Product<InverseType, OtherDerived, DefaultProduct> operator*(const MatrixBase<OtherDerived>& matrix) const {
+    return Product<InverseType, OtherDerived, DefaultProduct>(derived(), matrix.derived());
   }
 };
 
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 5f846a0..a78305e 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -80,27 +80,6 @@ struct matrix_swap_impl;
 
 }  // end namespace internal
 
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace doxygen {
-
-// This is a workaround to doxygen not being able to understand the inheritance logic
-// when it is hidden by the dense_xpr_base helper struct.
-// Moreover, doxygen fails to include members that are not documented in the declaration body of
-// MatrixBase if we inherits MatrixBase<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_> >,
-// this is why we simply inherits MatrixBase, though this does not make sense.
-
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template <typename Derived>
-struct dense_xpr_base_dispatcher;
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
-struct dense_xpr_base_dispatcher<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> : public MatrixBase {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
-struct dense_xpr_base_dispatcher<Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> : public ArrayBase {};
-
-}  // namespace doxygen
-
 /** \class PlainObjectBase
  * \ingroup Core_Module
  * \brief %Dense storage base class for matrices and arrays.
@@ -113,12 +92,7 @@ struct dense_xpr_base_dispatcher<Array<Scalar_, Rows_, Cols_, Options_, MaxRows_
  * \sa \ref TopicClassHierarchy
  */
 template <typename Derived>
-class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
-#else
-template <typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
-#endif
-{
+class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
  public:
   enum { Options = internal::traits<Derived>::Options };
   typedef typename internal::dense_xpr_base<Derived>::type Base;
@@ -188,8 +162,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
   EIGEN_DEVICE_FUNC Base& base() { return *static_cast<Base*>(this); }
   EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast<const Base*>(this); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_storage.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_storage.cols(); }
 
   /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
    * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
@@ -270,10 +244,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
   }
 
   /** \returns a const pointer to the data array of this matrix */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_storage.data(); }
 
   /** \returns a pointer to the data array of this matrix */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() { return m_storage.data(); }
 
   /** Resizes \c *this to a \a rows x \a cols matrix.
    *
@@ -324,7 +298,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    *
    * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
    */
-  EIGEN_DEVICE_FUNC inline constexpr void resize(Index size) {
+  EIGEN_DEVICE_FUNC constexpr void resize(Index size) {
     EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
     eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime == Dynamic || size <= MaxSizeAtCompileTime)) ||
                   SizeAtCompileTime == size) &&
@@ -349,7 +323,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    *
    * \sa resize(Index,Index)
    */
-  EIGEN_DEVICE_FUNC inline constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
+  EIGEN_DEVICE_FUNC constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
 
   /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special
    * value \c NoChange as in the example below.
@@ -359,7 +333,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    *
    * \sa resize(Index,Index)
    */
-  EIGEN_DEVICE_FUNC inline constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
+  EIGEN_DEVICE_FUNC constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
 
   /** Resizes \c *this to have the same dimensions as \a other.
    * Takes care of doing all the checking that's needed.
@@ -472,34 +446,19 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
   // Prevent user from trying to instantiate PlainObjectBase objects
   // by making all its constructor protected. See bug 1074.
  protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase() : m_storage() {
-    //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-  }
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-  // FIXME is it still needed ?
-  /** \internal */
-  EIGEN_DEVICE_FUNC constexpr explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert)
-      : m_storage(internal::constructor_without_unaligned_array_assert()) {
-    // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-  }
-#endif
-
-  EIGEN_DEVICE_FUNC constexpr PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT
-      : m_storage(std::move(other.m_storage)) {}
-
-  EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase() = default;
+  /** \brief Move constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(PlainObjectBase&&) = default;
+  /** \brief Move assignment operator */
+  EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) noexcept {
     m_storage = std::move(other.m_storage);
     return *this;
   }
 
   /** Copy constructor */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(const PlainObjectBase& other)
-      : Base(), m_storage(other.m_storage) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(const PlainObjectBase&) = default;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
-      : m_storage(size, rows, cols) {
-    //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-  }
+      : m_storage(size, rows, cols) {}
 
   /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients.
    *
@@ -540,7 +499,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
       resize(list_size, ColsAtCompileTime);
       if (list.begin()->begin() != nullptr) {
-        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+        Index index = 0;
+        for (const Scalar& e : *list.begin()) {
+          coeffRef(index++) = e;
+        }
       }
     } else {
       eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 6bad832..e16c7cc 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -21,7 +21,7 @@ class ProductImpl;
 namespace internal {
 
 template <typename Lhs, typename Rhs, int Option>
-struct traits<Product<Lhs, Rhs, Option> > {
+struct traits<Product<Lhs, Rhs, Option>> {
   typedef remove_all_t<Lhs> LhsCleaned;
   typedef remove_all_t<Rhs> RhsCleaned;
   typedef traits<LhsCleaned> LhsTraits;
@@ -55,6 +55,129 @@ struct traits<Product<Lhs, Rhs, Option> > {
   };
 };
 
+struct TransposeProductEnum {
+  // Kind tags used to pick a product_transpose_helper specialization; a combined tag is (lhs kind << 8) | rhs kind.
+  enum : int {
+    Default = 0x00,      // operand is neither a matrix nor a permutation expression
+    Matrix = 0x01,       // operand for which is_matrix_base_xpr is true
+    Permutation = 0x02,  // operand for which is_permutation_base_xpr is true
+    MatrixMatrix = (Matrix << 8) | Matrix,            // matrix * matrix
+    MatrixPermutation = (Matrix << 8) | Permutation,  // matrix * permutation
+    PermutationMatrix = (Permutation << 8) | Matrix   // permutation * matrix
+  };
+};
+template <typename Xpr>
+struct TransposeKind {  // Classifies Xpr as Matrix, Permutation or Default; the matrix test is evaluated first.
+  static constexpr int Kind = is_matrix_base_xpr<Xpr>::value        ? TransposeProductEnum::Matrix
+                              : is_permutation_base_xpr<Xpr>::value ? TransposeProductEnum::Permutation
+                                                                    : TransposeProductEnum::Default;
+};
+
+template <typename Lhs, typename Rhs>
+struct TransposeProductKind {  // Packs both operand kinds into one int: Lhs kind shifted left by 8, Rhs kind below.
+  static constexpr int Kind = (TransposeKind<Lhs>::Kind << 8) | TransposeKind<Rhs>::Kind;
+};
+
+template <typename Lhs, typename Rhs, int Option, int Kind = TransposeProductKind<Lhs, Rhs>::Kind>
+struct product_transpose_helper {
+  // Primary template (no specialization matches Kind): the product is not expanded, it is only wrapped.
+  using Derived = Product<Lhs, Rhs, Option>;
+  using Scalar = typename Derived::Scalar;
+  using TransposeType = Transpose<const Derived>;
+  using ConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<Scalar>, TransposeType>;
+  using AdjointType = std::conditional_t<NumTraits<Scalar>::IsComplex, ConjugateTransposeType, TransposeType>;
+
+  // Returns (lhs * rhs)^T as a Transpose expression wrapping the whole product.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(derived);
+  }
+  // Returns (lhs * rhs)^H: the transpose, additionally wrapped in a conjugate op only when Scalar is complex.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(TransposeType(derived));
+  }
+};
+
+template <typename Lhs, typename Rhs, int Option>
+struct product_transpose_helper<Lhs, Rhs, Option, TransposeProductEnum::MatrixMatrix> {
+  // Matrix * matrix: (lhs * rhs)^T is rewritten as rhs^T * lhs^T, so the result is again a Product expression.
+  using Derived = Product<Lhs, Rhs, Option>;
+
+  using LhsScalar = typename traits<Lhs>::Scalar;
+  using LhsTransposeType = typename DenseBase<Lhs>::ConstTransposeReturnType;
+  using LhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<LhsScalar>, LhsTransposeType>;
+  using LhsAdjointType =
+      std::conditional_t<NumTraits<LhsScalar>::IsComplex, LhsConjugateTransposeType, LhsTransposeType>;
+
+  using RhsScalar = typename traits<Rhs>::Scalar;
+  using RhsTransposeType = typename DenseBase<Rhs>::ConstTransposeReturnType;
+  using RhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<RhsScalar>, RhsTransposeType>;
+  using RhsAdjointType =
+      std::conditional_t<NumTraits<RhsScalar>::IsComplex, RhsConjugateTransposeType, RhsTransposeType>;
+
+  using TransposeType = Product<RhsTransposeType, LhsTransposeType, Option>;
+  using AdjointType = Product<RhsAdjointType, LhsAdjointType, Option>;
+
+  // Returns rhs^T * lhs^T, built from the operands of the original product (operand order is swapped).
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(RhsTransposeType(derived.rhs()), LhsTransposeType(derived.lhs()));
+  }
+  // Returns rhs^H * lhs^H; each adjoint is the plain transpose unless that operand's scalar type is complex.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(RhsAdjointType(RhsTransposeType(derived.rhs())),
+                       LhsAdjointType(LhsTransposeType(derived.lhs())));
+  }
+};
+template <typename Lhs, typename Rhs, int Option>
+struct product_transpose_helper<Lhs, Rhs, Option, TransposeProductEnum::PermutationMatrix> {
+  // Permutation * matrix: (lhs * rhs)^T is rewritten as rhs^T * lhs^-1, using the permutation's inverse expression.
+  using Derived = Product<Lhs, Rhs, Option>;
+
+  using LhsInverseType = typename PermutationBase<Lhs>::InverseReturnType;
+
+  using RhsScalar = typename traits<Rhs>::Scalar;
+  using RhsTransposeType = typename DenseBase<Rhs>::ConstTransposeReturnType;
+  using RhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<RhsScalar>, RhsTransposeType>;
+  using RhsAdjointType =
+      std::conditional_t<NumTraits<RhsScalar>::IsComplex, RhsConjugateTransposeType, RhsTransposeType>;
+
+  using TransposeType = Product<RhsTransposeType, LhsInverseType, Option>;
+  using AdjointType = Product<RhsAdjointType, LhsInverseType, Option>;
+
+  // Returns rhs^T * lhs^-1 (operand order is swapped; the permutation is inverted rather than transposed).
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(RhsTransposeType(derived.rhs()), LhsInverseType(derived.lhs()));
+  }
+  // Returns rhs^H * lhs^-1; the same permutation inverse type is used for the transpose and the adjoint.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(RhsAdjointType(RhsTransposeType(derived.rhs())), LhsInverseType(derived.lhs()));
+  }
+};
+template <typename Lhs, typename Rhs, int Option>
+struct product_transpose_helper<Lhs, Rhs, Option, TransposeProductEnum::MatrixPermutation> {
+  // Matrix * permutation: (lhs * rhs)^T is rewritten as rhs^-1 * lhs^T, using the permutation's inverse expression.
+  using Derived = Product<Lhs, Rhs, Option>;
+
+  using LhsScalar = typename traits<Lhs>::Scalar;
+  using LhsTransposeType = typename DenseBase<Lhs>::ConstTransposeReturnType;
+  using LhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<LhsScalar>, LhsTransposeType>;
+  using LhsAdjointType =
+      std::conditional_t<NumTraits<LhsScalar>::IsComplex, LhsConjugateTransposeType, LhsTransposeType>;
+
+  using RhsInverseType = typename PermutationBase<Rhs>::InverseReturnType;
+
+  using TransposeType = Product<RhsInverseType, LhsTransposeType, Option>;
+  using AdjointType = Product<RhsInverseType, LhsAdjointType, Option>;
+
+  // Returns rhs^-1 * lhs^T (operand order is swapped; the permutation is inverted rather than transposed).
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(RhsInverseType(derived.rhs()), LhsTransposeType(derived.lhs()));
+  }
+  // Returns rhs^-1 * lhs^H; the same permutation inverse type is used for the transpose and the adjoint.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(RhsInverseType(derived.rhs()), LhsAdjointType(LhsTransposeType(derived.lhs())));
+  }
+};
+
 }  // end namespace internal
 
 /** \class Product
@@ -93,17 +216,27 @@ class Product
   typedef internal::remove_all_t<LhsNested> LhsNestedCleaned;
   typedef internal::remove_all_t<RhsNested> RhsNestedCleaned;
 
+  using TransposeReturnType = typename internal::product_transpose_helper<Lhs, Rhs, Option>::TransposeType;
+  using AdjointReturnType = typename internal::product_transpose_helper<Lhs, Rhs, Option>::AdjointType;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {
     eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" &&
                  "if you wanted a coeff-wise or a dot product use the respective explicit functions");
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNestedCleaned& lhs() const { return m_lhs; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNestedCleaned& rhs() const { return m_rhs; }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeReturnType transpose() const {  // (lhs * rhs)^T
+    return internal::product_transpose_helper<Lhs, Rhs, Option>::run_transpose(*this);  // form chosen by operand kinds
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointReturnType adjoint() const {  // (lhs * rhs)^H
+    return internal::product_transpose_helper<Lhs, Rhs, Option>::run_adjoint(*this);  // form chosen by operand kinds
+  }
+
  protected:
   LhsNested m_lhs;
   RhsNested m_rhs;
@@ -112,12 +245,12 @@ class Product
 namespace internal {
 
 template <typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs, Rhs>::ret>
-class dense_product_base : public internal::dense_xpr_base<Product<Lhs, Rhs, Option> >::type {};
+class dense_product_base : public internal::dense_xpr_base<Product<Lhs, Rhs, Option>>::type {};
 
 /** Conversion to scalar for inner-products */
 template <typename Lhs, typename Rhs, int Option>
 class dense_product_base<Lhs, Rhs, Option, InnerProduct>
-    : public internal::dense_xpr_base<Product<Lhs, Rhs, Option> >::type {
+    : public internal::dense_xpr_base<Product<Lhs, Rhs, Option>>::type {
   typedef Product<Lhs, Rhs, Option> ProductXpr;
   typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
 
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 19c2560..be55be5 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -235,19 +235,20 @@ EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op, scalar_difference_op, add_assig
 
 template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, InnerProduct> {
+  using impl = default_inner_product_impl<Lhs, Rhs, false>;
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    dst.coeffRef(0, 0) = (lhs.transpose().cwiseProduct(rhs)).sum();
+    dst.coeffRef(0, 0) = impl::run(lhs, rhs);
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    dst.coeffRef(0, 0) += (lhs.transpose().cwiseProduct(rhs)).sum();
+    dst.coeffRef(0, 0) += impl::run(lhs, rhs);
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    dst.coeffRef(0, 0) -= (lhs.transpose().cwiseProduct(rhs)).sum();
+    dst.coeffRef(0, 0) -= impl::run(lhs, rhs);
   }
 };
 
@@ -282,7 +283,7 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, cons
 template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
   template <typename T>
-  struct is_row_major : std::conditional_t<(int(T::Flags) & RowMajorBit), internal::true_type, internal::false_type> {};
+  struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
   typedef typename Product<Lhs, Rhs>::Scalar Scalar;
 
   // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
@@ -293,6 +294,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
     }
   };
   struct add {
+    /** Add to dst. */
     template <typename Dst, typename Src>
     EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
       dst.const_cast_derived() += src;
@@ -304,9 +306,12 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
       dst.const_cast_derived() -= src;
     }
   };
+  /** Scaled add. */
   struct adds {
     Scalar m_scale;
+    /** Constructor */
     explicit adds(const Scalar& s) : m_scale(s) {}
+    /** Scaled add to dst. */
     template <typename Dst, typename Src>
     void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
       dst.const_cast_derived() += m_scale * src;
@@ -440,7 +445,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductM
 
     eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
                       blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha,
-                      std::conditional_t<HasScalarFactor, true_type, false_type>());
+                      bool_constant<HasScalarFactor>());
   }
 
  protected:
@@ -630,6 +635,24 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     return packet<LoadMode, PacketType>(row, col);
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
+                                                                       Index count) const {
+    PacketType res;  // written by run_segment below
+    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
+        PacketImpl;  // chosen by storage order; unrolled over InnerSize when Unroll is set, runtime loop otherwise
+    PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);  // begin/count forwarded as-is
+    return res;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;  // single row: row is 0
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;  // otherwise: column is 0
+    return packetSegment<LoadMode, PacketType>(row, col, begin, count);  // delegate to the (row, col) overload
+  }
+
  protected:
   add_const_on_value_type_t<LhsNested> m_lhs;
   add_const_on_value_type_t<RhsNested> m_rhs;
@@ -665,6 +688,13 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
     res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
                 rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);  // recurse first: res receives the preceding inner terms
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),  // lhs scalar, expanded with pset1
+                rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res);
+  }
 };
 
 template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -676,6 +706,13 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
     res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)),
                 pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);  // recurse first: res receives the preceding inner terms
+    res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count),
+                pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);  // rhs scalar, expanded with pset1
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -684,6 +721,12 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> {
                                                         Index /*innerDim*/, Packet& res) {
     res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),  // single inner term: res is overwritten, not accumulated
+               rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count));
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -692,6 +735,12 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> {
                                                         Index /*innerDim*/, Packet& res) {
     res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count),
+               pset1<Packet>(rhs.coeff(Index(0), col)));  // single inner term: res is overwritten, not accumulated
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -700,6 +749,11 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> {
                                                         const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));  // unroll index 0: no terms, res is built from 0
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -708,6 +762,11 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> {
                                                         const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));  // unroll index 0: no terms, res is built from 0
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -718,6 +777,13 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
     for (Index i = 0; i < innerDim; ++i)
       res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));  // start the accumulator from scalar 0
+    for (Index i = 0; i < innerDim; ++i)  // runtime inner dimension: one pmadd per inner index
+      res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count),
+                  res);
+  }
 };
 
 template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
@@ -728,6 +794,13 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
     for (Index i = 0; i < innerDim; ++i)
       res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));  // start the accumulator from scalar 0
+    for (Index i = 0; i < innerDim; ++i)  // runtime inner dimension: one pmadd per inner index
+      res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)),
+                  res);
+  }
 };
 
 /***************************************************************************
@@ -773,7 +846,7 @@ struct generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>
 
   template <typename Dest>
   static EIGEN_DEVICE_FUNC void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
-    selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::IsVectorAtCompileTime>::run(
+    selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::ColsAtCompileTime == 1>::run(
         dst, lhs.nestedExpression(), rhs, alpha);
   }
 };
@@ -785,7 +858,7 @@ struct generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>
 
   template <typename Dest>
   static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
-    selfadjoint_product_impl<Lhs, 0, Lhs::IsVectorAtCompileTime, typename Rhs::MatrixType, Rhs::Mode, false>::run(
+    selfadjoint_product_impl<Lhs, 0, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, Rhs::Mode, false>::run(
         dst, lhs, rhs.nestedExpression(), alpha);
   }
 };
@@ -810,7 +883,7 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
                     : (Derived::MaxColsAtCompileTime == 1 && Derived::MaxRowsAtCompileTime != 1) ? ColMajor
                     : MatrixFlags & RowMajorBit                                                  ? RowMajor
                                                                                                  : ColMajor,
-    SameStorageOrder_ = StorageOrder_ == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
+    SameStorageOrder_ = int(StorageOrder_) == ((MatrixFlags & RowMajorBit) ? RowMajor : ColMajor),
 
     ScalarAccessOnDiag_ = !((int(StorageOrder_) == ColMajor && int(ProductOrder) == OnTheLeft) ||
                             (int(StorageOrder_) == RowMajor && int(ProductOrder) == OnTheRight)),
@@ -866,6 +939,26 @@ struct diagonal_product_evaluator_base : evaluator_base<Derived> {
                           m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
   }
 
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::true_type) const {  // tag overload: scalar diagonal
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));  // one diagonal coefficient, index id
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::false_type) const {  // tag overload: packet diagonal
+    enum {  // the diagonal's load mode is capped: min(LoadMode, Aligned16 or the diagonal evaluator's alignment)
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = plain_enum_min(
+          LoadMode,
+          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
+  }
+
   evaluator<DiagonalType> m_diagImpl;
   evaluator<MatrixType> m_matImpl;
 };
@@ -887,7 +980,8 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
   typedef typename XprType::PlainObject PlainObject;
   typedef typename Lhs::DiagonalVectorType DiagonalType;
 
-  enum { StorageOrder = Base::StorageOrder_ };
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
 
@@ -900,8 +994,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
     // See also similar calls below.
-    return this->template packet_impl<LoadMode, PacketType>(
-        row, col, row, std::conditional_t<int(StorageOrder) == RowMajor, internal::true_type, internal::false_type>());
+    return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t());
   }
 
   template <int LoadMode, typename PacketType>
@@ -909,6 +1002,19 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
     return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
                                         int(StorageOrder) == ColMajor ? 0 : idx);
   }
+
+  template <int LoadMode, typename PacketType>  // (row, col) overload; the diagonal index passed on is `row`
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);  // idx maps to (idx, 0) if column-major, else (0, idx)
+  }
 #endif
 };
 
@@ -928,7 +1034,8 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
 
-  enum { StorageOrder = Base::StorageOrder_ };
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
 
@@ -939,14 +1046,23 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
 #ifndef EIGEN_GPUCC
   template <int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
-    return this->template packet_impl<LoadMode, PacketType>(
-        row, col, col, std::conditional_t<int(StorageOrder) == ColMajor, internal::true_type, internal::false_type>());
+    return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t());
   }
 
   template <int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
-    return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
-                                        int(StorageOrder) == ColMajor ? 0 : idx);
+    return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx);
+  }
+
+  template <int LoadMode, typename PacketType>  // (row, col) overload; the diagonal index passed on is `col`
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);  // idx maps to (idx, 0) if column-major, else (0, idx)
+  }
 #endif
 };
@@ -1148,6 +1264,22 @@ struct generic_product_impl<Lhs, Rhs, SkewSymmetricShape, SkewSymmetricShape, Pr
   }
 };
 
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, HomogeneousShape, ProductTag>  // homogeneous rhs: reuse dense impl
+    : generic_product_impl<Lhs, typename Rhs::PlainObject, MatrixShape, DenseShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, MatrixShape, ProductTag>  // homogeneous lhs: reuse dense impl
+    : generic_product_impl<typename Lhs::PlainObject, Rhs, DenseShape, MatrixShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, HomogeneousShape, ProductTag>  // operand types kept as-is
+    : generic_product_impl<Lhs, Rhs, PermutationShape, DenseShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, PermutationShape, ProductTag>  // operand types kept as-is
+    : generic_product_impl<Lhs, Rhs, DenseShape, PermutationShape, ProductTag> {};
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/RandomImpl.h b/Eigen/src/Core/RandomImpl.h
new file mode 100644
index 0000000..1a82e62
--- /dev/null
+++ b/Eigen/src/Core/RandomImpl.h
@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Charles Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANDOM_IMPL_H
+#define EIGEN_RANDOM_IMPL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/****************************************************************************
+ * Implementation of random                                               *
+ ****************************************************************************/
+
+template <typename Scalar, bool IsComplex, bool IsInteger>
+struct random_default_impl {};  // empty primary template; the supported (IsComplex, IsInteger) cases are specialized
+
+template <typename Scalar>  // picks the random_default_impl specialization from NumTraits<Scalar>::IsComplex/IsInteger
+struct random_impl : random_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+
+template <typename Scalar>
+struct random_retval {
+  typedef Scalar type;  // the result type is the scalar type itself
+};
+
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y) {
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);  // forwards both bounds to the implementation's run(x, y)
+}
+
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() {
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();  // forwards to the implementation's argument-free run()
+}
+
+// TODO: replace or provide alternatives to this, e.g. std::random_device
+struct eigen_random_device {
+  using ReturnType = int;
+  static constexpr int Entropy = meta_floor_log2<(unsigned int)(RAND_MAX) + 1>::value;  // floor(log2(RAND_MAX + 1))
+  static constexpr ReturnType Highest = RAND_MAX;  // largest value std::rand() can return
+  static EIGEN_DEVICE_FUNC inline ReturnType run() { return std::rand(); }  // one draw from the C library generator
+};
+
+// Builds a built-in unsigned integer whose lowest numRandomBits bits are random and whose other bits are zero.
+template <typename Scalar>
+struct random_bits_impl {
+  EIGEN_STATIC_ASSERT(std::is_unsigned<Scalar>::value, SCALAR MUST BE A BUILT - IN UNSIGNED INTEGER)
+  using RandomDevice = eigen_random_device;
+  using RandomReturnType = typename RandomDevice::ReturnType;
+  static constexpr int kEntropy = RandomDevice::Entropy;
+  static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
+  // ORs successive RandomDevice draws into place, advancing kEntropy bits per draw, then clears the excess bits
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    eigen_assert((numRandomBits >= 0) && (numRandomBits <= kTotalBits));
+    Scalar bits = 0;
+    int filled = 0;
+    while (filled < numRandomBits) {
+      const RandomReturnType draw = RandomDevice::run();
+      bits |= static_cast<Scalar>(draw) << filled;
+      filled += kEntropy;
+    }
+    // `& (kTotalBits - 1)` wraps the shift to 0 for numRandomBits == 0 (power-of-two widths), avoiding a full-width shift
+    return bits & (Scalar(-1) >> ((kTotalBits - numRandomBits) & (kTotalBits - 1)));
+  }
+};
+
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline BitsType getRandomBits(int numRandomBits) {
+  return random_bits_impl<BitsType>::run(numRandomBits);  // convenience wrapper around random_bits_impl
+}
+
+// Random generator for built-in floating point types.
+// Random bits are written into the mantissa of 2, and the resulting value in [2,4) is shifted down to [-1,1).
+template <typename Scalar, bool BuiltIn = std::is_floating_point<Scalar>::value>
+struct random_float_impl {
+  using BitsType = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
+  // number of mantissa bits that can be randomised: NumTraits<Scalar>::digits() - 1
+  static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() { return NumTraits<Scalar>::digits() - 1; }
+  // returns a value in the half-open interval [-1,1) that carries numRandomBits random mantissa bits
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
+    // if fewer than mantissaBits() bits are requested, move them to the top of the mantissa
+    const int unusedLowBits = mantissaBits() - numRandomBits;
+    BitsType pattern = getRandomBits<BitsType>(numRandomBits);
+    pattern <<= unusedLowBits;
+    // OR-ing in the representation of 2 gives a value in the half-open interval [2,4)
+    pattern |= numext::bit_cast<BitsType>(Scalar(2));
+    const Scalar inTwoToFour = numext::bit_cast<Scalar>(pattern);
+    return inTwoToFour - Scalar(3);
+  }
+};
+// Random generator for custom (non built-in) floating point types.
+// The draw is made in double precision and converted to Scalar; the usable mantissa width is the smaller of
+// Scalar's and double's.
+template <typename Scalar>
+struct random_float_impl<Scalar, false> {
+  static EIGEN_DEVICE_FUNC inline int mantissaBits() {
+    constexpr int kDoubleDigits = NumTraits<double>::digits();
+    const int scalarDigits = NumTraits<Scalar>::digits();
+    return numext::mini(scalarDigits, kDoubleDigits) - 1;
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
+    // delegate to the double implementation, then convert to the custom type
+    return static_cast<Scalar>(random_float_impl<double>::run(numRandomBits));
+  }
+};
+
+#if !EIGEN_COMP_NVCC
+// random implementation for a 16-byte long double: random bits are OR-ed into the storage of 2.0L
+// this specialization is not compatible with double-double scalars, which the digits test below excludes
+template <bool Specialize = (sizeof(long double) == 2 * sizeof(uint64_t)) &&
+                            ((std::numeric_limits<long double>::digits != (2 * std::numeric_limits<double>::digits)))>
+struct random_longdouble_impl {
+  static constexpr int Size = sizeof(long double);
+  static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<long double>::digits() - 1; }
+  static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
+    eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
+    EIGEN_USING_STD(memcpy);
+    int numLowBits = numext::mini(numRandomBits, 64);  // bits for the least significant 64-bit word
+    int numHighBits = numext::maxi(numRandomBits - 64, 0);  // remaining bits for the most significant word
+    uint64_t randomBits[2];
+    long double result = 2.0L;
+    memcpy(&randomBits, &result, Size);  // start from the object representation of 2.0L
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    randomBits[0] |= getRandomBits<uint64_t>(numLowBits);
+    randomBits[1] |= getRandomBits<uint64_t>(numHighBits);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    randomBits[0] |= getRandomBits<uint64_t>(numHighBits);
+    randomBits[1] |= getRandomBits<uint64_t>(numLowBits);
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
+    memcpy(&result, &randomBits, Size);
+    result -= 3.0L;  // NOTE(review): presumably maps [2,4) onto [-1,1) as in random_float_impl -- confirm
+    return result;
+  }
+};
+template <>
+struct random_longdouble_impl<false> {  // fallback: draw in double precision and convert to long double
+  static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<double>::digits() - 1; }
+  static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
+    return static_cast<long double>(random_float_impl<double>::run(numRandomBits));
+  }
+};
+template <>  // long double: the default Specialize argument picks the 16-byte path or the double fallback
+struct random_float_impl<long double> : random_longdouble_impl<> {};
+#endif
+
+// Real floating point scalars: maps the [-1,1) draw of random_float_impl onto [x, y), provided that x < y.
+template <typename Scalar>
+struct random_default_impl<Scalar, false, false> {
+  using Impl = random_float_impl<Scalar>;
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y, int numRandomBits) {
+    const Scalar half_lo = Scalar(0.5) * x, half_hi = Scalar(0.5) * y;
+    const Scalar midpoint = half_lo + half_hi;
+    const Scalar halfWidth = half_hi - half_lo;
+    return midpoint + halfWidth * run(numRandomBits);
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    return run(x, y, Impl::mantissaBits());
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) { return Impl::run(numRandomBits); }
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return run(Impl::mantissaBits()); }
+};
+
+template <typename Scalar, bool IsSigned = NumTraits<Scalar>::IsSigned, bool BuiltIn = std::is_integral<Scalar>::value>
+struct random_int_impl;  // declared only; specialized below on (IsSigned, BuiltIn)
+
+// Random generator for built-in unsigned integer types.
+template <typename Scalar>
+struct random_int_impl<Scalar, false, true> {
+  static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
+  // returns a value from the closed interval [x, y]; when y <= x the result is x
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    if (y <= x) return x;
+    const Scalar range = y - x;
+    // edge case: [x, y] spans the entire range of Scalar, so every bit pattern is a valid result
+    if (range == NumTraits<Scalar>::highest()) return run();
+    const Scalar count = range + 1;
+    // number of random bits needed to cover the count possible offsets
+    const int numRandomBits = log2_ceil(count);
+    // rejection sampling: a draw outside [0, count) is discarded and repeated; in the worst-case scenario
+    // the probability of rejection is: 1/2 - 1/2^numRandomBits < 50%
+    for (;;) {
+      const Scalar offset = getRandomBits<Scalar>(numRandomBits);
+      if (offset < count) return x + offset;
+    }
+  }
+  // returns a value with all kTotalBits bits random
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return getRandomBits<Scalar>(kTotalBits); }
+};
+
+// Random generator for built-in signed integer types; the arithmetic is carried out on the unsigned counterpart.
+template <typename Scalar>
+struct random_int_impl<Scalar, true, true> {
+  static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
+  using BitsType = typename make_unsigned<Scalar>::type;
+  // returns a value from the closed interval [x, y]; when y <= x the result is x
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    if (y <= x) return x;
+    const BitsType lo = static_cast<BitsType>(x);
+    // y - x may overflow Scalar, so the span and the random offset are kept in the unsigned type
+    const BitsType span = static_cast<BitsType>(y) - lo;
+    const BitsType offset = random_int_impl<BitsType>::run(0, span);
+    // add while still unsigned and cast once at the end: casting the offset to Scalar first could make it negative
+    return static_cast<Scalar>(lo + offset);
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return static_cast<Scalar>(getRandomBits<BitsType>(kTotalBits)); }
+};
+
+// TODO: custom (non built-in) integer types are not supported yet; this placeholder asserts and returns Scalar(0)
+template <typename Scalar, bool IsSigned>
+struct random_int_impl<Scalar, IsSigned, false> {
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar&, const Scalar&) { return run(); }  // bounds are ignored
+  static EIGEN_DEVICE_FUNC inline Scalar run() {
+    eigen_assert(std::false_type::value && "RANDOM FOR CUSTOM INTEGERS NOT YET SUPPORTED");
+    return Scalar(0);
+  }
+};
+
+template <typename Scalar>  // real integer scalars (IsComplex == false, IsInteger == true) use random_int_impl
+struct random_default_impl<Scalar, false, true> : random_int_impl<Scalar> {};
+
+// bool: one random bit decides the outcome.
+template <>
+struct random_impl<bool> {
+  // a degenerate interval (y <= x) yields x; otherwise the bounds play no further role
+  static EIGEN_DEVICE_FUNC inline bool run(const bool& x, const bool& y) { return (y <= x) ? x : run(); }
+  // true exactly when the single random bit drawn is set
+  static EIGEN_DEVICE_FUNC inline bool run() { return getRandomBits<unsigned>(1) != 0u; }
+};
+
+template <typename Scalar>
+struct random_default_impl<Scalar, true, false> {  // complex scalars: each part comes from its own real-valued draw
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  using Impl = random_impl<RealScalar>;
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y, int numRandomBits) {
+    return Scalar(Impl::run(x.real(), y.real(), numRandomBits), Impl::run(x.imag(), y.imag(), numRandomBits));
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    return Scalar(Impl::run(x.real(), y.real()), Impl::run(x.imag(), y.imag()));  // bounds are paired per component
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    return Scalar(Impl::run(numRandomBits), Impl::run(numRandomBits));
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return Scalar(Impl::run(), Impl::run()); }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_RANDOM_IMPL_H
diff --git a/Eigen/src/Core/RealView.h b/Eigen/src/Core/RealView.h
new file mode 100644
index 0000000..7ba42f9
--- /dev/null
+++ b/Eigen/src/Core/RealView.h
@@ -0,0 +1,250 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALVIEW_H
+#define EIGEN_REALVIEW_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Vectorized assignment to RealView requires array-oriented access to the real and imaginary components.
+// From https://en.cppreference.com/w/cpp/numeric/complex.html:
+// For any pointer to an element of an array of std::complex<T> named p and any valid array index i,
+// reinterpret_cast<T*>(p)[2 * i] is the real part of the complex number p[i], and
+// reinterpret_cast<T*>(p)[2 * i + 1] is the imaginary part of the complex number p[i].
+
+template <typename ComplexScalar>
+struct complex_array_access : std::false_type {};  // opt-in trait: false unless specialized for a complex type
+template <>
+struct complex_array_access<std::complex<float>> : std::true_type {};
+template <>
+struct complex_array_access<std::complex<double>> : std::true_type {};
+template <>
+struct complex_array_access<std::complex<long double>> : std::true_type {};
+
+template <typename Xpr>
+struct traits<RealView<Xpr>> : public traits<Xpr> {
+  template <typename T>
+  static constexpr int double_size(T size, bool times_two) {  // 2 * size if times_two, else size; Dynamic is preserved
+    int size_as_int = int(size);
+    if (size_as_int == Dynamic) return Dynamic;
+    return times_two ? (2 * size_as_int) : size_as_int;
+  }
+  using Base = traits<Xpr>;
+  using ComplexScalar = typename Base::Scalar;
+  using Scalar = typename NumTraits<ComplexScalar>::Real;  // the view's scalar is the real type of the complex one
+  static constexpr int ActualDirectAccessBit = complex_array_access<ComplexScalar>::value ? DirectAccessBit : 0;
+  static constexpr int ActualPacketAccessBit = packet_traits<Scalar>::Vectorizable ? PacketAccessBit : 0;
+  static constexpr int FlagMask =
+      ActualDirectAccessBit | ActualPacketAccessBit | HereditaryBits | LinearAccessBit | LvalueBit;
+  static constexpr int BaseFlags = int(evaluator<Xpr>::Flags) | int(Base::Flags);
+  static constexpr int Flags = BaseFlags & FlagMask;  // only the bits listed in FlagMask survive
+  static constexpr bool IsRowMajor = Flags & RowMajorBit;
+  static constexpr int RowsAtCompileTime = double_size(Base::RowsAtCompileTime, !IsRowMajor);  // doubled if col-major
+  static constexpr int ColsAtCompileTime = double_size(Base::ColsAtCompileTime, IsRowMajor);  // doubled if row-major
+  static constexpr int SizeAtCompileTime = size_at_compile_time(RowsAtCompileTime, ColsAtCompileTime);
+  static constexpr int MaxRowsAtCompileTime = double_size(Base::MaxRowsAtCompileTime, !IsRowMajor);
+  static constexpr int MaxColsAtCompileTime = double_size(Base::MaxColsAtCompileTime, IsRowMajor);
+  static constexpr int MaxSizeAtCompileTime = size_at_compile_time(MaxRowsAtCompileTime, MaxColsAtCompileTime);
+  static constexpr int OuterStrideAtCompileTime = double_size(outer_stride_at_compile_time<Xpr>::ret, true);
+  static constexpr int InnerStrideAtCompileTime = inner_stride_at_compile_time<Xpr>::ret;  // carried over unchanged
+};
+
+// Evaluator of RealView<Xpr>: each complex coefficient of Xpr is read as two reals (even inner index: real, odd: imag).
+template <typename Xpr>
+struct evaluator<RealView<Xpr>> : private evaluator<Xpr> {
+  using BaseEvaluator = evaluator<Xpr>;
+  using XprType = RealView<Xpr>;
+  using ExpressionTraits = traits<XprType>;
+  using ComplexScalar = typename ExpressionTraits::ComplexScalar;
+  using ComplexCoeffReturnType = typename BaseEvaluator::CoeffReturnType;
+  using Scalar = typename ExpressionTraits::Scalar;
+  static constexpr bool IsRowMajor = ExpressionTraits::IsRowMajor;
+  static constexpr int Flags = ExpressionTraits::Flags;
+  static constexpr int CoeffReadCost = BaseEvaluator::CoeffReadCost;
+  static constexpr int Alignment = BaseEvaluator::Alignment;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(XprType realView) : BaseEvaluator(realView.m_xpr) {}
+
+  // By-value read (the nested evaluator returns a temporary): even inner index -> real part, odd -> imaginary part.
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<!Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
+    return ((IsRowMajor ? col : row) & 1) ? numext::imag(cscalar) : numext::real(cscalar);
+  }
+
+  // By-reference read: the nested complex is viewed as Scalar[2]; slot 0 is the real part, slot 1 the imaginary part.
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index row, Index col) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
+    return reinterpret_cast<const Scalar(&)[2]>(cscalar)[(IsRowMajor ? col : row) & 1];
+  }
+
+  // Writable access to the real (even inner index) or imaginary (odd inner index) slot of the nested coefficient.
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    ComplexScalar& cscalar = BaseEvaluator::coeffRef(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
+    return reinterpret_cast<Scalar(&)[2]>(cscalar)[(IsRowMajor ? col : row) & 1];
+  }
+
+  // Linear by-value read: real index `index` maps to complex index `index / 2`; even -> real part, odd -> imaginary.
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<!Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index index) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(index / 2);
+    return (index & 1) ? numext::imag(cscalar) : numext::real(cscalar);
+  }
+
+  // Linear by-reference read: slot `index & 1` of the nested complex coefficient `index / 2` viewed as Scalar[2].
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(index / 2);
+    return reinterpret_cast<const Scalar(&)[2]>(cscalar)[index & 1];
+  }
+
+  // Linear writable access: slot `index & 1` of the nested complex coefficient `index / 2`.
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    ComplexScalar& cscalar = BaseEvaluator::coeffRef(index / 2);
+    return reinterpret_cast<Scalar(&)[2]>(cscalar)[index & 1];
+  }
+
+  // Loads a real packet by loading the complex packet with half as many elements and reinterpreting it.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert(((IsRowMajor ? col : row) % 2 == 0) && "the inner index must be even");
+    Index crow = IsRowMajor ? row : row / 2;
+    Index ccol = IsRowMajor ? col / 2 : col;
+    ComplexPacket cpacket = BaseEvaluator::template packet<LoadMode, ComplexPacket>(crow, ccol);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+
+  // Linear-index packet load; `index` must be even so that the packet starts on a whole complex coefficient.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert((index % 2 == 0) && "the index must be even");
+    Index cindex = index / 2;
+    ComplexPacket cpacket = BaseEvaluator::template packet<LoadMode, ComplexPacket>(cindex);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+
+  // Segment load: the inner index, begin and count are all halved before forwarding, hence all must be even.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert(((IsRowMajor ? col : row) % 2 == 0) && "the inner index must be even");
+    eigen_assert((begin % 2 == 0) && (count % 2 == 0) && "begin and count must be even");
+    Index crow = IsRowMajor ? row : row / 2;
+    Index ccol = IsRowMajor ? col / 2 : col;
+    Index cbegin = begin / 2;
+    Index ccount = count / 2;
+    ComplexPacket cpacket = BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(crow, ccol, cbegin, ccount);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+
+  // Linear-index segment load: index, begin and count are all halved before forwarding, hence all must be even.
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert((index % 2 == 0) && "the index must be even");
+    eigen_assert((begin % 2 == 0) && (count % 2 == 0) && "begin and count must be even");
+    Index cindex = index / 2;
+    Index cbegin = begin / 2;
+    Index ccount = count / 2;
+    ComplexPacket cpacket = BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(cindex, cbegin, ccount);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+};
+
+}  // namespace internal
+
+template <typename Xpr>
+class RealView : public internal::dense_xpr_base<RealView<Xpr>>::type {  // real-valued view of a complex expression
+  using ExpressionTraits = internal::traits<RealView>;
+  EIGEN_STATIC_ASSERT(NumTraits<typename Xpr::Scalar>::IsComplex, SCALAR MUST BE COMPLEX)
+ public:
+  using Scalar = typename ExpressionTraits::Scalar;
+  using Nested = RealView;
+  // The view keeps a reference to xpr; rows are doubled for column-major Xpr and columns for row-major Xpr.
+  EIGEN_DEVICE_FUNC explicit RealView(Xpr& xpr) : m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return Xpr::IsRowMajor ? m_xpr.rows() : 2 * m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return Xpr::IsRowMajor ? 2 * m_xpr.cols() : m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return 2 * m_xpr.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_xpr.innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 2 * m_xpr.outerStride(); }
+  EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {
+    m_xpr.resize(Xpr::IsRowMajor ? rows : rows / 2, Xpr::IsRowMajor ? cols / 2 : cols);  // halves the doubled extent
+  }
+  EIGEN_DEVICE_FUNC void resize(Index size) { m_xpr.resize(size / 2); }  // size counts reals: two per complex entry
+  EIGEN_DEVICE_FUNC Scalar* data() { return reinterpret_cast<Scalar*>(m_xpr.data()); }  // complex buffer seen as reals
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return reinterpret_cast<const Scalar*>(m_xpr.data()); }
+
+  EIGEN_DEVICE_FUNC RealView(const RealView&) = default;
+
+  EIGEN_DEVICE_FUNC RealView& operator=(const RealView& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC RealView& operator=(const RealView<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC RealView& operator=(const DenseBase<OtherDerived>& other);
+
+ protected:
+  friend struct internal::evaluator<RealView<Xpr>>;  // the evaluator's constructor reads m_xpr
+  Xpr& m_xpr;  // the viewed complex expression, held by reference
+};
+
+template <typename Xpr>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const RealView& other) {
+  internal::call_assignment(*this, other);  // copy-assignment from a view of the same type, via call_assignment
+  return *this;
+}
+
+template <typename Xpr>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const RealView<OtherDerived>& other) {
+  internal::call_assignment(*this, other);  // assignment from a view over a different expression type
+  return *this;
+}
+
+template <typename Xpr>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const DenseBase<OtherDerived>& other) {
+  internal::call_assignment(*this, other.derived());  // assignment from any dense expression, passed as its derived type
+  return *this;
+}
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC typename DenseBase<Derived>::RealViewReturnType DenseBase<Derived>::realView() {
+  return RealViewReturnType(derived());  // wraps this expression; RealViewReturnType is declared outside this file
+}
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC typename DenseBase<Derived>::ConstRealViewReturnType DenseBase<Derived>::realView() const {
+  return ConstRealViewReturnType(derived());  // const overload; ConstRealViewReturnType is declared outside this file
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_REALVIEW_H
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 0c5f2d9..4e9ab0e 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -414,6 +414,13 @@ class redux_evaluator : public internal::evaluator<XprType_> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const {
     return Base::template packet<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer);
   }
+
+  template <int LoadMode, typename PacketType>  // (outer, inner) is mapped to (row, col) according to IsRowMajor
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentByOuterInner(Index outer, Index inner, Index begin,
+                                                                             Index count) const {
+    return Base::template packetSegment<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer,
+                                                              begin, count);
+  }
 };
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 129bc85..30ec277 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -73,11 +73,11 @@ class RefBase : public MapBase<Derived> {
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
            : IsVectorAtCompileTime                   ? this->size()
            : int(Flags) & RowMajorBit                ? this->cols()
@@ -97,11 +97,11 @@ class RefBase : public MapBase<Derived> {
   typedef Stride<StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime> StrideBase;
 
   // Resolves inner stride if default 0.
-  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; }
+  static EIGEN_DEVICE_FUNC constexpr Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; }
 
   // Resolves outer stride if default 0.
-  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols,
-                                                                    bool isVectorAtCompileTime, bool isRowMajor) {
+  static EIGEN_DEVICE_FUNC constexpr Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols,
+                                                              bool isVectorAtCompileTime, bool isRowMajor) {
     return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
   }
 
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index c01c627..3415045 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -80,16 +80,13 @@ class Replicate : public internal::dense_xpr_base<Replicate<MatrixType, RowFacto
 
   template <typename OriginalMatrixType>
   EIGEN_DEVICE_FUNC inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
-      : m_matrix(matrix),
-        m_rowFactor(rowFactor),
-        m_colFactor(colFactor){
-            EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<MatrixType>, OriginalMatrixType>::value),
-                                THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)}
-
-        EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const {
-    return m_matrix.rows() * m_rowFactor.value();
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor) {
+    EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<MatrixType>, OriginalMatrixType>::value),
+                        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
   EIGEN_DEVICE_FUNC const MatrixTypeNested_& nestedExpression() const { return m_matrix; }
 
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index b881dd6..22acdc0 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -173,7 +173,7 @@ class ReshapedImpl_dense<XprType, Rows, Cols, Order, false>
 
 #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \sa MapBase::data() */
-  EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const;
   EIGEN_DEVICE_FUNC inline Index innerStride() const;
   EIGEN_DEVICE_FUNC inline Index outerStride() const;
 #endif
@@ -215,10 +215,10 @@ class ReshapedImpl_dense<XprType, Rows, Cols, Order, true> : public MapBase<Resh
   EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; }
 
   /** \sa MapBase::innerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return m_xpr.innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return m_xpr.innerStride(); }
 
   /** \sa MapBase::outerStride() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return (((Flags & RowMajorBit) == RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
   }
 
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 3b5e470..892c193 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -58,12 +58,8 @@ class ReturnByValue : public internal::dense_xpr_base<ReturnByValue<Derived> >::
   EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const {
     static_cast<const Derived*>(this)->evalTo(dst);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT {
-    return static_cast<const Derived*>(this)->rows();
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT {
-    return static_cast<const Derived*>(this)->cols();
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return static_cast<const Derived*>(this)->rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable \
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 66116aa..d11ba16 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -87,8 +87,8 @@ class Reverse : public internal::dense_xpr_base<Reverse<MatrixType, Direction> >
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   EIGEN_DEVICE_FUNC inline Index innerStride() const { return -m_matrix.innerStride(); }
 
@@ -127,19 +127,25 @@ EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType DenseBas
  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template <typename Derived>
 EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace() {
+  constexpr int HalfRowsAtCompileTime = RowsAtCompileTime == Dynamic ? Dynamic : RowsAtCompileTime / 2;
+  constexpr int HalfColsAtCompileTime = ColsAtCompileTime == Dynamic ? Dynamic : ColsAtCompileTime / 2;
   if (cols() > rows()) {
     Index half = cols() / 2;
-    leftCols(half).swap(rightCols(half).reverse());
+    this->template leftCols<HalfColsAtCompileTime>(half).swap(
+        this->template rightCols<HalfColsAtCompileTime>(half).reverse());
     if ((cols() % 2) == 1) {
       Index half2 = rows() / 2;
-      col(half).head(half2).swap(col(half).tail(half2).reverse());
+      col(half).template head<HalfRowsAtCompileTime>(half2).swap(
+          col(half).template tail<HalfRowsAtCompileTime>(half2).reverse());
     }
   } else {
     Index half = rows() / 2;
-    topRows(half).swap(bottomRows(half).reverse());
+    this->template topRows<HalfRowsAtCompileTime>(half).swap(
+        this->template bottomRows<HalfRowsAtCompileTime>(half).reverse());
     if ((rows() % 2) == 1) {
       Index half2 = cols() / 2;
-      row(half).head(half2).swap(row(half).tail(half2).reverse());
+      row(half).template head<HalfColsAtCompileTime>(half2).swap(
+          row(half).template tail<HalfColsAtCompileTime>(half2).reverse());
     }
   }
 }
diff --git a/Eigen/src/Core/Select.h b/Eigen/src/Core/Select.h
index 9f46120..61a67c2 100644
--- a/Eigen/src/Core/Select.h
+++ b/Eigen/src/Core/Select.h
@@ -15,7 +15,7 @@
 
 namespace Eigen {
 
-/** \class Select
+/** \typedef Select
  * \ingroup Core_Module
  *
  * \brief Expression of a coefficient wise version of the C++ ternary operator ?:
@@ -24,73 +24,16 @@ namespace Eigen {
  * \tparam ThenMatrixType the type of the \em then expression
  * \tparam ElseMatrixType the type of the \em else expression
  *
- * This class represents an expression of a coefficient wise version of the C++ ternary operator ?:.
+ * This type represents an expression of a coefficient wise version of the C++ ternary operator ?:.
  * It is the return type of DenseBase::select() and most of the time this is the only way it is used.
  *
  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const
  */
-
-namespace internal {
-template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > : traits<ThenMatrixType> {
-  typedef typename traits<ThenMatrixType>::Scalar Scalar;
-  typedef Dense StorageKind;
-  typedef typename traits<ThenMatrixType>::XprKind XprKind;
-  typedef typename ConditionMatrixType::Nested ConditionMatrixNested;
-  typedef typename ThenMatrixType::Nested ThenMatrixNested;
-  typedef typename ElseMatrixType::Nested ElseMatrixNested;
-  enum {
-    RowsAtCompileTime = ConditionMatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
-  };
-};
-}  // namespace internal
-
 template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select : public internal::dense_xpr_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
-               internal::no_assignment_operator {
- public:
-  typedef typename internal::dense_xpr_base<Select>::type Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(Select)
-
-  inline EIGEN_DEVICE_FUNC Select(const ConditionMatrixType& a_conditionMatrix, const ThenMatrixType& a_thenMatrix,
-                                  const ElseMatrixType& a_elseMatrix)
-      : m_condition(a_conditionMatrix), m_then(a_thenMatrix), m_else(a_elseMatrix) {
-    eigen_assert(m_condition.rows() == m_then.rows() && m_condition.rows() == m_else.rows());
-    eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
-  }
-
-  inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
-  inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
-
-  inline EIGEN_DEVICE_FUNC const Scalar coeff(Index i, Index j) const {
-    if (m_condition.coeff(i, j))
-      return m_then.coeff(i, j);
-    else
-      return m_else.coeff(i, j);
-  }
-
-  inline EIGEN_DEVICE_FUNC const Scalar coeff(Index i) const {
-    if (m_condition.coeff(i))
-      return m_then.coeff(i);
-    else
-      return m_else.coeff(i);
-  }
-
-  inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const { return m_condition; }
-
-  inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const { return m_then; }
-
-  inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const { return m_else; }
-
- protected:
-  typename ConditionMatrixType::Nested m_condition;
-  typename ThenMatrixType::Nested m_then;
-  typename ElseMatrixType::Nested m_else;
-};
+using Select = CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ThenMatrixType>::Scalar,
+                                                                 typename DenseBase<ElseMatrixType>::Scalar,
+                                                                 typename DenseBase<ConditionMatrixType>::Scalar>,
+                              ThenMatrixType, ElseMatrixType, ConditionMatrixType>;
 
 /** \returns a matrix where each coefficient (i,j) is equal to \a thenMatrix(i,j)
  * if \c *this(i,j) != Scalar(0), and \a elseMatrix(i,j) otherwise.
@@ -98,7 +41,7 @@ class Select : public internal::dense_xpr_base<Select<ConditionMatrixType, ThenM
  * Example: \include MatrixBase_select.cpp
  * Output: \verbinclude MatrixBase_select.out
  *
- * \sa DenseBase::bitwiseSelect(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&)
+ * \sa typedef Select
  */
 template <typename Derived>
 template <typename ThenDerived, typename ElseDerived>
@@ -107,15 +50,12 @@ inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
                                        typename DenseBase<Derived>::Scalar>,
     ThenDerived, ElseDerived, Derived>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix, const DenseBase<ElseDerived>& elseMatrix) const {
-  using Op = internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
-                                                typename DenseBase<ElseDerived>::Scalar, Scalar>;
-  return CwiseTernaryOp<Op, ThenDerived, ElseDerived, Derived>(thenMatrix.derived(), elseMatrix.derived(), derived(),
-                                                               Op());
+  return Select<Derived, ThenDerived, ElseDerived>(thenMatrix.derived(), elseMatrix.derived(), derived());
 }
 /** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
  * the \em else expression being a scalar value.
  *
- * \sa DenseBase::booleanSelect(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+ * \sa typedef Select
  */
 template <typename Derived>
 template <typename ThenDerived>
@@ -126,15 +66,13 @@ inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                            const typename DenseBase<ThenDerived>::Scalar& elseScalar) const {
   using ElseConstantType = typename DenseBase<ThenDerived>::ConstantReturnType;
-  using Op = internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
-                                                typename DenseBase<ThenDerived>::Scalar, Scalar>;
-  return CwiseTernaryOp<Op, ThenDerived, ElseConstantType, Derived>(
-      thenMatrix.derived(), ElseConstantType(rows(), cols(), elseScalar), derived(), Op());
+  return Select<Derived, ThenDerived, ElseConstantType>(thenMatrix.derived(),
+                                                        ElseConstantType(rows(), cols(), elseScalar), derived());
 }
 /** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
  * the \em then expression being a scalar value.
  *
- * \sa DenseBase::booleanSelect(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+ * \sa typedef Select
  */
 template <typename Derived>
 template <typename ElseDerived>
@@ -145,10 +83,8 @@ inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
 DenseBase<Derived>::select(const typename DenseBase<ElseDerived>::Scalar& thenScalar,
                            const DenseBase<ElseDerived>& elseMatrix) const {
   using ThenConstantType = typename DenseBase<ElseDerived>::ConstantReturnType;
-  using Op = internal::scalar_boolean_select_op<typename DenseBase<ElseDerived>::Scalar,
-                                                typename DenseBase<ElseDerived>::Scalar, Scalar>;
-  return CwiseTernaryOp<Op, ThenConstantType, ElseDerived, Derived>(ThenConstantType(rows(), cols(), thenScalar),
-                                                                    elseMatrix.derived(), derived(), Op());
+  return Select<Derived, ThenConstantType, ElseDerived>(ThenConstantType(rows(), cols(), thenScalar),
+                                                        elseMatrix.derived(), derived());
 }
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 4e9a923..16f0e75 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -73,10 +73,10 @@ class SelfAdjointView : public TriangularBase<SelfAdjointView<MatrixType_, UpLo>
 
   EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_matrix.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.innerStride(); }
 
   /** \sa MatrixBase::coeff()
    * \warning the coordinates must fit into the referenced triangular part
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index 4dc92f1..1bc0373 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -15,33 +15,33 @@
 
 namespace Eigen {
 
-// TODO generalize the scalar type of 'other'
-
 template <typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other) {
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
-                            internal::mul_assign_op<Scalar, Scalar>());
+  using ConstantExpr = typename internal::plain_constant_type<Derived, Scalar>::type;
+  using Op = internal::mul_assign_op<Scalar>;
+  internal::call_assignment(derived(), ConstantExpr(rows(), cols(), other), Op());
   return derived();
 }
 
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other) {
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
-                            internal::add_assign_op<Scalar, Scalar>());
+template <bool Enable, typename>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const RealScalar& other) {
+  realView() *= other;
   return derived();
 }
 
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other) {
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
-                            internal::sub_assign_op<Scalar, Scalar>());
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) {
+  using ConstantExpr = typename internal::plain_constant_type<Derived, Scalar>::type;
+  using Op = internal::div_assign_op<Scalar>;
+  internal::call_assignment(derived(), ConstantExpr(rows(), cols(), other), Op());
   return derived();
 }
 
 template <typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) {
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
-                            internal::div_assign_op<Scalar, Scalar>());
+template <bool Enable, typename>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const RealScalar& other) {
+  realView() /= other;
   return derived();
 }
 
diff --git a/Eigen/src/Core/SkewSymmetricMatrix3.h b/Eigen/src/Core/SkewSymmetricMatrix3.h
index b3fcc3a..3545afc 100644
--- a/Eigen/src/Core/SkewSymmetricMatrix3.h
+++ b/Eigen/src/Core/SkewSymmetricMatrix3.h
@@ -66,7 +66,7 @@ class SkewSymmetricBase : public EigenBase<Derived> {
   EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); }
 
   /** Determinant vanishes */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Scalar determinant() const { return 0; }
+  EIGEN_DEVICE_FUNC constexpr Scalar determinant() const { return 0; }
 
   /** A.transpose() = -A */
   EIGEN_DEVICE_FUNC PlainObject transpose() const { return (-vector()).asSkewSymmetric(); }
@@ -91,9 +91,9 @@ class SkewSymmetricBase : public EigenBase<Derived> {
   EIGEN_DEVICE_FUNC inline SkewSymmetricVectorType& vector() { return derived().vector(); }
 
   /** \returns the number of rows. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return 3; }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return 3; }
   /** \returns the number of columns. */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return 3; }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return 3; }
 
   /** \returns the matrix product of \c *this by the dense matrix, \a matrix */
   template <typename MatrixDerived>
@@ -321,7 +321,7 @@ bool MatrixBase<Derived>::isSkewSymmetric(const RealScalar& prec) const {
   return (this->transpose() + *this).isZero(prec);
 }
 
-/** \returns the matrix product of \c *this by the skew symmetric matrix \skew.
+/** \returns the matrix product of \c *this by the skew symmetric matrix \a skew.
  */
 template <typename Derived>
 template <typename SkewDerived>
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index dfea9c6..aa51410 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -66,8 +66,8 @@ class Solve : public SolveImpl<Decomposition, RhsType, typename internal::traits
 
   Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 26d62ff..9d31874 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -216,8 +216,8 @@ struct triangular_solve_retval : public ReturnByValue<triangular_solve_retval<Si
 
   triangular_solve_retval(const TriangularType& tri, const Rhs& rhs) : m_triangularMatrix(tri), m_rhs(rhs) {}
 
-  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
-  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  constexpr Index rows() const noexcept { return m_rhs.rows(); }
+  constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   template <typename Dest>
   inline void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index df2ac83..5a6dfd4 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -78,6 +78,14 @@ class SolverBase : public EigenBase<Derived> {
   template <typename Derived_>
   friend struct internal::solve_assertion;
 
+  ComputationInfo info() const {
+    // CRTP static dispatch: Calls the 'info()' method on the derived class.
+    // Derived must implement 'ComputationInfo info() const'.
+    // If not implemented, name lookup falls back to this base method, causing
+    // infinite recursion (detectable by -Winfinite-recursion).
+    return derived().info();
+  }
+
   enum {
     RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
     ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 6513120..711ee3f 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -48,34 +48,16 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
 
 template <typename VectorType, typename RealScalar>
 void stable_norm_impl_inner_step(const VectorType& vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) {
-  typedef typename VectorType::Scalar Scalar;
   const Index blockSize = 4096;
 
-  typedef typename internal::nested_eval<VectorType, 2>::type VectorTypeCopy;
-  typedef internal::remove_all_t<VectorTypeCopy> VectorTypeCopyClean;
-  const VectorTypeCopy copy(vec);
-
-  enum {
-    CanAlign =
-        ((int(VectorTypeCopyClean::Flags) & DirectAccessBit) ||
-         (int(internal::evaluator<VectorTypeCopyClean>::Alignment) > 0)  // FIXME Alignment)>0 might not be enough
-         ) &&
-        (blockSize * sizeof(Scalar) * 2 < EIGEN_STACK_ALLOCATION_LIMIT) &&
-        (EIGEN_MAX_STATIC_ALIGN_BYTES >
-         0)  // if we cannot allocate on the stack, then let's not bother about this optimization
-  };
-  typedef std::conditional_t<
-      CanAlign,
-      Ref<const Matrix<Scalar, Dynamic, 1, 0, blockSize, 1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
-      typename VectorTypeCopyClean::ConstSegmentReturnType>
-      SegmentWrapper;
   Index n = vec.size();
-
-  Index bi = internal::first_default_aligned(copy);
-  if (bi > 0) internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
-  for (; bi < n; bi += blockSize)
-    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi, numext::mini(blockSize, n - bi))), ssq, scale,
-                                 invScale);
+  Index blockEnd = numext::round_down(n, blockSize);
+  for (Index i = 0; i < blockEnd; i += blockSize) {
+    internal::stable_norm_kernel(vec.template segment<blockSize>(i), ssq, scale, invScale);
+  }
+  if (n > blockEnd) {
+    internal::stable_norm_kernel(vec.tail(n - blockEnd), ssq, scale, invScale);
+  }
 }
 
 template <typename VectorType>
@@ -85,8 +67,7 @@ typename VectorType::RealScalar stable_norm_impl(const VectorType& vec,
   using std::sqrt;
 
   Index n = vec.size();
-
-  if (n == 1) return abs(vec.coeff(0));
+  if (EIGEN_PREDICT_FALSE(n == 1)) return abs(vec.coeff(0));
 
   typedef typename VectorType::RealScalar RealScalar;
   RealScalar scale(0);
@@ -218,7 +199,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
   return internal::blueNorm_impl(*this);
 }
 
-/** \returns the \em l2 norm of \c *this avoiding undeflow and overflow.
+/** \returns the \em l2 norm of \c *this avoiding underflow and overflow.
  * This version use a concatenation of hypot() calls, and it is very slow.
  *
  * \sa norm(), stableNorm()
diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h
index 3ab7d21..a24d4c2 100644
--- a/Eigen/src/Core/StlIterators.h
+++ b/Eigen/src/Core/StlIterators.h
@@ -36,11 +36,11 @@ class indexed_based_stl_iterator_base {
   typedef Index difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 
-  indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
-  indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
+  indexed_based_stl_iterator_base() noexcept : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_iterator_base(XprType& xpr, Index index) noexcept : mp_xpr(&xpr), m_index(index) {}
 
-  indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW : mp_xpr(other.mp_xpr),
-                                                                                    m_index(other.m_index) {}
+  indexed_based_stl_iterator_base(const non_const_iterator& other) noexcept
+      : mp_xpr(other.mp_xpr), m_index(other.m_index) {}
 
   indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) {
     mp_xpr = other.mp_xpr;
@@ -325,19 +325,24 @@ class pointer_based_stl_iterator {
  public:
   typedef Index difference_type;
   typedef typename XprType::Scalar value_type;
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L
+  typedef std::conditional_t<XprType::InnerStrideAtCompileTime == 1, std::contiguous_iterator_tag,
+                             std::random_access_iterator_tag>
+      iterator_category;
+#else
   typedef std::random_access_iterator_tag iterator_category;
+#endif
   typedef std::conditional_t<bool(is_lvalue), value_type*, const value_type*> pointer;
   typedef std::conditional_t<bool(is_lvalue), value_type&, const value_type&> reference;
 
-  pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
-  pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) {
+  pointer_based_stl_iterator() noexcept : m_ptr(0) {}
+  pointer_based_stl_iterator(XprType& xpr, Index index) noexcept : m_incr(xpr.innerStride()) {
     m_ptr = xpr.data() + index * m_incr.value();
   }
 
-  pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW : m_ptr(other.m_ptr),
-                                                                               m_incr(other.m_incr) {}
+  pointer_based_stl_iterator(const non_const_iterator& other) noexcept : m_ptr(other.m_ptr), m_incr(other.m_incr) {}
 
-  pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW {
+  pointer_based_stl_iterator& operator=(const non_const_iterator& other) noexcept {
     m_ptr = other.m_ptr;
     m_incr.setValue(other.m_incr);
     return *this;
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index a8fdeaf..692f0a1 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -70,10 +70,17 @@ class Stride {
   /** Copy constructor */
   EIGEN_DEVICE_FUNC Stride(const Stride& other) : m_outer(other.outer()), m_inner(other.inner()) {}
 
+  /** Copy assignment operator */
+  EIGEN_DEVICE_FUNC Stride& operator=(const Stride& other) {
+    m_outer.setValue(other.outer());
+    m_inner.setValue(other.inner());
+    return *this;
+  }
+
   /** \returns the outer stride */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outer() const { return m_outer.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index outer() const { return m_outer.value(); }
   /** \returns the inner stride */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index inner() const { return m_inner.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index inner() const { return m_inner.value(); }
 
  protected:
   internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index d417c1a..dd825e9 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -65,6 +65,31 @@ class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
     Index col = Base::colIndexByOuterInner(outer, inner);
     assignPacket<StoreMode, LoadMode, PacketType>(row, col);
   }
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+    PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+        row, col, m_dst.template packetSegment<StoreMode, PacketType>(row, col, begin, count), begin, count);
+    m_dst.template writePacketSegment<StoreMode>(row, col, tmp, begin, count);
+  }
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+    PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(index, begin, count);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+        index, m_dst.template packetSegment<StoreMode, PacketType>(index, begin, count), begin, count);
+    m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
+  }
+
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
+  // mean no CRTP (Gael)
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
+    Index row = Base::rowIndexByOuterInner(outer, inner);
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count);
+  }
 };
 
 }  // namespace internal
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 1cc7a28..0676a25 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -65,8 +65,8 @@ class Transpose : public TransposeImpl<MatrixType, typename internal::traits<Mat
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.rows(); }
 
   /** \returns the nested expression */
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const {
@@ -119,10 +119,12 @@ class TransposeImpl<MatrixType, Dense> : public internal::TransposeImpl_base<Mat
 
   typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarWithConstIfNotLvalue* data() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr ScalarWithConstIfNotLvalue* data() {
+    return derived().nestedExpression().data();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar* data() const {
     return derived().nestedExpression().data();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return derived().nestedExpression().data(); }
 
   // FIXME: shall we keep the const version of coeffRef?
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const {
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index ad136d3..f6dd258 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -91,7 +91,7 @@ class TranspositionsBase {
   /** \returns the inverse transformation */
   inline Transpose<TranspositionsBase> inverse() const { return Transpose<TranspositionsBase>(derived()); }
 
-  /** \returns the tranpose transformation */
+  /** \returns the transpose transformation */
   inline Transpose<TranspositionsBase> transpose() const { return Transpose<TranspositionsBase>(derived()); }
 
  protected:
@@ -293,9 +293,9 @@ class Transpose<TranspositionsBase<TranspositionsDerived> > {
  public:
   explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_transpositions.size(); }
 
   /** \returns the \a matrix with the inverse transpositions applied to the columns.
    */
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 2b1683b..27ad78e 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -58,10 +58,10 @@ class TriangularBase : public EigenBase<Derived> {
     eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag))));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
 
   // dummy resize function
   EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {
@@ -194,9 +194,9 @@ class TriangularView
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
   /** \copydoc EigenBase::rows() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
   /** \copydoc EigenBase::cols() */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** \returns a const reference to the nested expression */
   EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 9887db6..688b49b 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -36,6 +36,7 @@ template <typename MatrixType, typename MemberOp, int Direction>
 class PartialReduxExpr;
 
 namespace internal {
+
 template <typename MatrixType, typename MemberOp, int Direction>
 struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> > : traits<MatrixType> {
   typedef typename MemberOp::result_type Scalar;
@@ -63,12 +64,8 @@ class PartialReduxExpr : public internal::dense_xpr_base<PartialReduxExpr<Matrix
   EIGEN_DEVICE_FUNC explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
-    return (Direction == Vertical ? 1 : m_matrix.rows());
-  }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
-    return (Direction == Horizontal ? 1 : m_matrix.cols());
-  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return (Direction == Vertical ? 1 : m_matrix.rows()); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return (Direction == Horizontal ? 1 : m_matrix.cols()); }
 
   EIGEN_DEVICE_FUNC typename MatrixType::Nested nestedExpression() const { return m_matrix; }
 
@@ -149,6 +146,22 @@ struct member_redux {
   const BinaryOp& binaryFunc() const { return m_functor; }
   const BinaryOp m_functor;
 };
+
+template <typename Scalar>
+struct scalar_replace_zero_with_one_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& x) const {
+    return numext::is_exactly_zero(x) ? Scalar(1) : x;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return pselect(pcmp_eq(x, pzero(x)), pset1<Packet>(Scalar(1)), x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_replace_zero_with_one_op<Scalar>> {
+  enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
 }  // namespace internal
 
 /** \class VectorwiseOp
@@ -193,9 +206,7 @@ class VectorwiseOp {
  public:
   typedef typename ExpressionType::Scalar Scalar;
   typedef typename ExpressionType::RealScalar RealScalar;
-  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
-  typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
-  typedef internal::remove_all_t<ExpressionTypeNested> ExpressionTypeNestedCleaned;
+  typedef internal::remove_all_t<ExpressionType> ExpressionTypeCleaned;
 
   template <template <typename OutScalar, typename InputScalar> class Functor, typename ReturnScalar = Scalar>
   struct ReturnType {
@@ -334,7 +345,7 @@ class VectorwiseOp {
 
   typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
   typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
-  typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeNestedCleaned>,
+  typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeCleaned>,
                            internal::member_sum<RealScalar, RealScalar>, Direction>
       SquaredNormReturnType;
   typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
@@ -585,7 +596,7 @@ class VectorwiseOp {
   /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
   template <typename OtherDerived>
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-      CwiseBinaryOp<internal::scalar_sum_op<Scalar, typename OtherDerived::Scalar>, const ExpressionTypeNestedCleaned,
+      CwiseBinaryOp<internal::scalar_sum_op<Scalar, typename OtherDerived::Scalar>, const ExpressionTypeCleaned,
                     const typename ExtendedType<OtherDerived>::Type>
       operator+(const DenseBase<OtherDerived>& other) const {
     EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -596,7 +607,7 @@ class VectorwiseOp {
   /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
   template <typename OtherDerived>
   EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_difference_op<Scalar, typename OtherDerived::Scalar>,
-                                  const ExpressionTypeNestedCleaned, const typename ExtendedType<OtherDerived>::Type>
+                                  const ExpressionTypeCleaned, const typename ExtendedType<OtherDerived>::Type>
   operator-(const DenseBase<OtherDerived>& other) const {
     EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
     EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
@@ -606,10 +617,9 @@ class VectorwiseOp {
   /** Returns the expression where each subvector is the product of the vector \a other
    * by the corresponding subvector of \c *this */
   template <typename OtherDerived>
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-      CwiseBinaryOp<internal::scalar_product_op<Scalar>, const ExpressionTypeNestedCleaned,
-                    const typename ExtendedType<OtherDerived>::Type> EIGEN_DEVICE_FUNC
-      operator*(const DenseBase<OtherDerived>& other) const {
+  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_product_op<Scalar, typename OtherDerived::Scalar>,
+                                  const ExpressionTypeCleaned, const typename ExtendedType<OtherDerived>::Type>
+  operator*(const DenseBase<OtherDerived>& other) const {
     EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
     EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
     EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
@@ -619,8 +629,8 @@ class VectorwiseOp {
   /** Returns the expression where each subvector is the quotient of the corresponding
    * subvector of \c *this by the vector \a other */
   template <typename OtherDerived>
-  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeNestedCleaned,
-                                  const typename ExtendedType<OtherDerived>::Type>
+  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>,
+                                  const ExpressionTypeCleaned, const typename ExtendedType<OtherDerived>::Type>
   operator/(const DenseBase<OtherDerived>& other) const {
     EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
     EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
@@ -628,18 +638,28 @@ class VectorwiseOp {
     return m_matrix / extendedTo(other.derived());
   }
 
+  using Normalized_NonzeroNormType =
+      CwiseUnaryOp<internal::scalar_replace_zero_with_one_op<Scalar>, const NormReturnType>;
+  using NormalizedReturnType = CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeCleaned,
+                                             const typename OppositeExtendedType<Normalized_NonzeroNormType>::Type>;
+
   /** \returns an expression where each column (or row) of the referenced matrix are normalized.
    * The referenced matrix is \b not modified.
+   *
+   * \warning If the input columns (or rows) are too small (i.e., their norm equals 0), they remain unchanged in the
+   *          resulting expression.
+   *
    * \sa MatrixBase::normalized(), normalize()
    */
-  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeNestedCleaned,
-                                  const typename OppositeExtendedType<NormReturnType>::Type>
-  normalized() const {
-    return m_matrix.cwiseQuotient(extendedToOpposite(this->norm()));
+  EIGEN_DEVICE_FUNC NormalizedReturnType normalized() const {
+    return m_matrix.cwiseQuotient(extendedToOpposite(Normalized_NonzeroNormType(this->norm())));
   }
 
   /** Normalize in-place each row or columns of the referenced matrix.
-   * \sa MatrixBase::normalize(), normalized()
+   *
+   * \warning If the input columns (or rows) are too small (i.e., their norm equals 0), they are left unchanged.
+   *
+   * \sa MatrixBase::normalize(), normalized()
    */
   EIGEN_DEVICE_FUNC void normalize() { m_matrix = this->normalized(); }
 
@@ -683,7 +703,7 @@ class VectorwiseOp {
 
  protected:
   EIGEN_DEVICE_FUNC Index redux_length() const { return Direction == Vertical ? m_matrix.rows() : m_matrix.cols(); }
-  ExpressionTypeNested m_matrix;
+  ExpressionType& m_matrix;
 };
 
 // const colwise moved to DenseBase.h due to CUDA compiler bug
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 037a605..e1d2ca5 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -25,20 +25,18 @@ struct visitor_impl;
 template <typename Visitor, bool ShortCircuitEvaluation = false>
 struct short_circuit_eval_impl {
   // if short circuit evaluation is not used, do nothing
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; }
 };
 template <typename Visitor>
 struct short_circuit_eval_impl<Visitor, true> {
   // if short circuit evaluation is used, check the visitor
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) {
-    return visitor.done();
-  }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) { return visitor.done(); }
 };
 
 // unrolled inner-outer traversal
 template <typename Visitor, typename Derived, int UnrollCount, bool Vectorize, bool ShortCircuitEvaluation>
 struct visitor_impl<Visitor, Derived, UnrollCount, Vectorize, false, ShortCircuitEvaluation> {
-  // don't use short circuit evaulation for unrolled version
+  // don't use short circuit evaluation for unrolled version
   using Scalar = typename Derived::Scalar;
   using Packet = typename packet_traits<Scalar>::type;
   static constexpr bool RowMajor = Derived::IsRowMajor;
@@ -93,7 +91,7 @@ struct visitor_impl<Visitor, Derived, UnrollCount, Vectorize, false, ShortCircui
 // unrolled linear traversal
 template <typename Visitor, typename Derived, int UnrollCount, bool Vectorize, bool ShortCircuitEvaluation>
 struct visitor_impl<Visitor, Derived, UnrollCount, Vectorize, true, ShortCircuitEvaluation> {
-  // don't use short circuit evaulation for unrolled version
+  // don't use short circuit evaluation for unrolled version
   using Scalar = typename Derived::Scalar;
   using Packet = typename packet_traits<Scalar>::type;
   static constexpr int PacketSize = packet_traits<Scalar>::size;
@@ -296,9 +294,9 @@ class visitor_evaluator {
 
   EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType& xpr) : m_evaluator(xpr), m_xpr(xpr) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_xpr.size(); }
   // outer-inner access
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
     return m_evaluator.coeff(row, col);
@@ -386,173 +384,6 @@ EIGEN_DEVICE_FUNC void DenseBase<Derived>::visit(Visitor& visitor) const {
 
 namespace internal {
 
-/** \internal
- * \brief Base class to implement min and max visitors
- */
-template <typename Derived>
-struct coeff_visitor {
-  // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
-  EIGEN_DEVICE_FUNC coeff_visitor() : row(-1), col(-1), res(0) {}
-  typedef typename Derived::Scalar Scalar;
-  Index row, col;
-  Scalar res;
-  EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index i, Index j) {
-    res = value;
-    row = i;
-    col = j;
-  }
-};
-
-template <typename Scalar, int NaNPropagation, bool is_min = true>
-struct minmax_compare {
-  typedef typename packet_traits<Scalar>::type Packet;
-  static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a < b; }
-  static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_min<NaNPropagation>(p); }
-};
-
-template <typename Scalar, int NaNPropagation>
-struct minmax_compare<Scalar, NaNPropagation, false> {
-  typedef typename packet_traits<Scalar>::type Packet;
-  static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a > b; }
-  static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_max<NaNPropagation>(p); }
-};
-
-// Default implementation used by non-floating types, where we do not
-// need special logic for NaN handling.
-template <typename Derived, bool is_min, int NaNPropagation,
-          bool isInt = NumTraits<typename Derived::Scalar>::IsInteger>
-struct minmax_coeff_visitor : coeff_visitor<Derived> {
-  using Scalar = typename Derived::Scalar;
-  using Packet = typename packet_traits<Scalar>::type;
-  using Comparator = minmax_compare<Scalar, NaNPropagation, is_min>;
-  static constexpr Index PacketSize = packet_traits<Scalar>::size;
-
-  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
-    if (Comparator::compare(value, this->res)) {
-      this->res = value;
-      this->row = i;
-      this->col = j;
-    }
-  }
-  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
-    Scalar value = Comparator::predux(p);
-    if (Comparator::compare(value, this->res)) {
-      const Packet range = preverse(plset<Packet>(Scalar(1)));
-      Packet mask = pcmp_eq(pset1<Packet>(value), p);
-      Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
-      this->res = value;
-      this->row = Derived::IsRowMajor ? i : i + max_idx;
-      this->col = Derived::IsRowMajor ? j + max_idx : j;
-    }
-  }
-  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
-    Scalar value = Comparator::predux(p);
-    const Packet range = preverse(plset<Packet>(Scalar(1)));
-    Packet mask = pcmp_eq(pset1<Packet>(value), p);
-    Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
-    this->res = value;
-    this->row = Derived::IsRowMajor ? i : i + max_idx;
-    this->col = Derived::IsRowMajor ? j + max_idx : j;
-  }
-};
-
-// Suppress NaN. The only case in which we return NaN is if the matrix is all NaN,
-// in which case, row=0, col=0 is returned for the location.
-template <typename Derived, bool is_min>
-struct minmax_coeff_visitor<Derived, is_min, PropagateNumbers, false> : coeff_visitor<Derived> {
-  typedef typename Derived::Scalar Scalar;
-  using Packet = typename packet_traits<Scalar>::type;
-  using Comparator = minmax_compare<Scalar, PropagateNumbers, is_min>;
-
-  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
-    if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
-      this->res = value;
-      this->row = i;
-      this->col = j;
-    }
-  }
-  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
-    const Index PacketSize = packet_traits<Scalar>::size;
-    Scalar value = Comparator::predux(p);
-    if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
-      const Packet range = preverse(plset<Packet>(Scalar(1)));
-      /* mask will be zero for NaNs, so they will be ignored. */
-      Packet mask = pcmp_eq(pset1<Packet>(value), p);
-      Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
-      this->res = value;
-      this->row = Derived::IsRowMajor ? i : i + max_idx;
-      this->col = Derived::IsRowMajor ? j + max_idx : j;
-    }
-  }
-  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
-    const Index PacketSize = packet_traits<Scalar>::size;
-    Scalar value = Comparator::predux(p);
-    if ((numext::isnan)(value)) {
-      this->res = value;
-      this->row = 0;
-      this->col = 0;
-      return;
-    }
-    const Packet range = preverse(plset<Packet>(Scalar(1)));
-    /* mask will be zero for NaNs, so they will be ignored. */
-    Packet mask = pcmp_eq(pset1<Packet>(value), p);
-    Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
-    this->res = value;
-    this->row = Derived::IsRowMajor ? i : i + max_idx;
-    this->col = Derived::IsRowMajor ? j + max_idx : j;
-  }
-};
-
-// Propagate NaNs. If the matrix contains NaN, the location of the first NaN
-// will be returned in row and col.
-template <typename Derived, bool is_min, int NaNPropagation>
-struct minmax_coeff_visitor<Derived, is_min, NaNPropagation, false> : coeff_visitor<Derived> {
-  typedef typename Derived::Scalar Scalar;
-  using Packet = typename packet_traits<Scalar>::type;
-  using Comparator = minmax_compare<Scalar, PropagateNaN, is_min>;
-
-  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index i, Index j) {
-    const bool value_is_nan = (numext::isnan)(value);
-    if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
-      this->res = value;
-      this->row = i;
-      this->col = j;
-    }
-  }
-  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index i, Index j) {
-    const Index PacketSize = packet_traits<Scalar>::size;
-    Scalar value = Comparator::predux(p);
-    const bool value_is_nan = (numext::isnan)(value);
-    if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) {
-      const Packet range = preverse(plset<Packet>(Scalar(1)));
-      // If the value is NaN, pick the first position of a NaN, otherwise pick the first extremal value.
-      Packet mask = value_is_nan ? pnot(pcmp_eq(p, p)) : pcmp_eq(pset1<Packet>(value), p);
-      Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
-      this->res = value;
-      this->row = Derived::IsRowMajor ? i : i + max_idx;
-      this->col = Derived::IsRowMajor ? j + max_idx : j;
-    }
-  }
-  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index i, Index j) {
-    const Index PacketSize = packet_traits<Scalar>::size;
-    Scalar value = Comparator::predux(p);
-    const bool value_is_nan = (numext::isnan)(value);
-    const Packet range = preverse(plset<Packet>(Scalar(1)));
-    // If the value is NaN, pick the first position of a NaN, otherwise pick the first extremal value.
-    Packet mask = value_is_nan ? pnot(pcmp_eq(p, p)) : pcmp_eq(pset1<Packet>(value), p);
-    Index max_idx = PacketSize - static_cast<Index>(predux_max(pand(range, mask)));
-    this->res = value;
-    this->row = Derived::IsRowMajor ? i : i + max_idx;
-    this->col = Derived::IsRowMajor ? j + max_idx : j;
-  }
-};
-
-template <typename Derived, bool is_min, int NaNPropagation>
-struct functor_traits<minmax_coeff_visitor<Derived, is_min, NaNPropagation>> {
-  using Scalar = typename Derived::Scalar;
-  enum { Cost = NumTraits<Scalar>::AddCost, LinearAccess = false, PacketAccess = packet_traits<Scalar>::HasCmp };
-};
-
 template <typename Scalar>
 struct all_visitor {
   using result_type = bool;
@@ -632,101 +463,18 @@ struct functor_traits<count_visitor<Scalar>> {
   };
 };
 
-}  // end namespace internal
-
-/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
- * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
- *
- * In case \c *this contains NaN, NaNPropagation determines the behavior:
- *   NaNPropagation == PropagateFast : undefined
- *   NaNPropagation == PropagateNaN : result is NaN
- *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
- * \warning the matrix must be not empty, otherwise an assertion is triggered.
- *
- * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
- */
-template <typename Derived>
-template <int NaNPropagation, typename IndexType>
-EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowId,
-                                                                                          IndexType* colId) const {
-  eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
-
-  internal::minmax_coeff_visitor<Derived, true, NaNPropagation> minVisitor;
-  this->visit(minVisitor);
-  *rowId = minVisitor.row;
-  if (colId) *colId = minVisitor.col;
-  return minVisitor.res;
-}
-
-/** \returns the minimum of all coefficients of *this and puts in *index its location.
- *
- * In case \c *this contains NaN, NaNPropagation determines the behavior:
- *   NaNPropagation == PropagateFast : undefined
- *   NaNPropagation == PropagateNaN : result is NaN
- *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
- * \warning the matrix must be not empty, otherwise an assertion is triggered.
- *
- * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(),
- * DenseBase::minCoeff()
- */
-template <typename Derived>
-template <int NaNPropagation, typename IndexType>
-EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* index) const {
-  eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-
-  internal::minmax_coeff_visitor<Derived, true, NaNPropagation> minVisitor;
-  this->visit(minVisitor);
-  *index = IndexType((RowsAtCompileTime == 1) ? minVisitor.col : minVisitor.row);
-  return minVisitor.res;
-}
-
-/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
- * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
- *
- * In case \c *this contains NaN, NaNPropagation determines the behavior:
- *   NaNPropagation == PropagateFast : undefined
- *   NaNPropagation == PropagateNaN : result is NaN
- *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
- * \warning the matrix must be not empty, otherwise an assertion is triggered.
- *
- * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
- */
+template <typename Derived, bool AlwaysTrue = NumTraits<typename traits<Derived>::Scalar>::IsInteger>
+struct all_finite_impl {
+  static EIGEN_DEVICE_FUNC inline bool run(const Derived& /*derived*/) { return true; }
+};
+#if !defined(__FINITE_MATH_ONLY__) || !(__FINITE_MATH_ONLY__)
 template <typename Derived>
-template <int NaNPropagation, typename IndexType>
-EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr,
-                                                                                          IndexType* colPtr) const {
-  eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
-
-  internal::minmax_coeff_visitor<Derived, false, NaNPropagation> maxVisitor;
-  this->visit(maxVisitor);
-  *rowPtr = maxVisitor.row;
-  if (colPtr) *colPtr = maxVisitor.col;
-  return maxVisitor.res;
-}
+struct all_finite_impl<Derived, false> {
+  static EIGEN_DEVICE_FUNC inline bool run(const Derived& derived) { return derived.array().isFiniteTyped().all(); }
+};
+#endif
 
-/** \returns the maximum of all coefficients of *this and puts in *index its location.
- *
- * In case \c *this contains NaN, NaNPropagation determines the behavior:
- *   NaNPropagation == PropagateFast : undefined
- *   NaNPropagation == PropagateNaN : result is NaN
- *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
- * \warning the matrix must be not empty, otherwise an assertion is triggered.
- *
- * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(),
- * DenseBase::maxCoeff()
- */
-template <typename Derived>
-template <int NaNPropagation, typename IndexType>
-EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* index) const {
-  eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
-
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::minmax_coeff_visitor<Derived, false, NaNPropagation> maxVisitor;
-  this->visit(maxVisitor);
-  *index = (RowsAtCompileTime == 1) ? maxVisitor.col : maxVisitor.row;
-  return maxVisitor.res;
-}
+}  // end namespace internal
 
 /** \returns true if all coefficients are true
  *
@@ -781,7 +529,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::hasNaN() const {
  */
 template <typename Derived>
 EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::allFinite() const {
-  return derived().array().isFinite().all();
+  return internal::all_finite_impl<Derived>::run(derived());
 }
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index bae5714..a4a87c4 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -85,10 +85,14 @@ EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
-  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
-  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
-  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) {
+  __m256 tmp1 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
+  __m256 tmp2 = _mm256_moveldup_ps(a.v);
+#ifdef EIGEN_VECTORIZE_FMA
+  __m256 result = _mm256_fmaddsub_ps(tmp2, b.v, tmp1);
+#else
+  __m256 result = _mm256_addsub_ps(_mm256_mul_ps(tmp2, b.v), tmp1);
+#endif
   return Packet4cf(result);
 }
 
@@ -121,11 +125,11 @@ EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(_mm256_load_ps(&numext::real_ref(*from)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(_mm256_loadu_ps(&numext::real_ref(*from)));
 }
 
 template <>
@@ -145,11 +149,11 @@ EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* fro
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v);
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(&numext::real_ref(*to), from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(&numext::real_ref(*to), from.v);
 }
 
 template <>
@@ -283,13 +287,15 @@ EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
-  __m256d tmp1 = _mm256_shuffle_pd(a.v, a.v, 0x0);
-  __m256d even = _mm256_mul_pd(tmp1, b.v);
-  __m256d tmp2 = _mm256_shuffle_pd(a.v, a.v, 0xF);
-  __m256d tmp3 = _mm256_shuffle_pd(b.v, b.v, 0x5);
-  __m256d odd = _mm256_mul_pd(tmp2, tmp3);
-  return Packet2cd(_mm256_addsub_pd(even, odd));
+EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) {
+  __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0xF), _mm256_permute_pd(b.v, 0x5));
+  __m256d tmp2 = _mm256_movedup_pd(a.v);
+#ifdef EIGEN_VECTORIZE_FMA
+  __m256d result = _mm256_fmaddsub_pd(tmp2, b.v, tmp1);
+#else
+  __m256d result = _mm256_addsub_pd(_mm256_mul_pd(tmp2, b.v), tmp1);
+#endif
+  return Packet2cd(result);
 }
 
 template <>
@@ -321,11 +327,11 @@ EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet2cd pload<Packet2cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(_mm256_load_pd((const double*)from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(_mm256_loadu_pd((const double*)from));
 }
 
 template <>
@@ -342,11 +348,11 @@ EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* fr
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd((double*)to, from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd((double*)to, from.v);
 }
 
 template <>
@@ -449,6 +455,109 @@ EIGEN_STRONG_INLINE Packet4cf pexp<Packet4cf>(const Packet4cf& a) {
   return pexp_complex<Packet4cf>(a);
 }
 
+#ifdef EIGEN_VECTORIZE_FMA
+// std::complex<float>
+template <>
+EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  __m256 a_odd = _mm256_movehdup_ps(a.v);
+  __m256 a_even = _mm256_moveldup_ps(a.v);
+  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmaddsub_ps(a_odd, b_swap, c.v));
+  return Packet4cf(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  __m256 a_odd = _mm256_movehdup_ps(a.v);
+  __m256 a_even = _mm256_moveldup_ps(a.v);
+  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmsubadd_ps(a_odd, b_swap, c.v));
+  return Packet4cf(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pnmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  return pnegate(pmsub(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pnmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  return pnegate(pmadd(a, b, c));
+}
+// std::complex<double>
+template <>
+EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
+  __m256d a_even = _mm256_movedup_pd(a.v);
+  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
+  __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmaddsub_pd(a_odd, b_swap, c.v));
+  return Packet2cd(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
+  __m256d a_even = _mm256_movedup_pd(a.v);
+  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
+  __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmsubadd_pd(a_odd, b_swap, c.v));
+  return Packet2cd(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pnmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  return pnegate(pmsub(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  return pnegate(pmadd(a, b, c));
+}
+#endif
+
+/*---------------- load/store segment support ----------------*/
+
+/*---------------- std::complex<float> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cf> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4cf> : std::true_type {};
+
+template <>
+inline Packet2cf ploaduSegment<Packet2cf>(const std::complex<float>* from, Index begin, Index count) {
+  return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index begin,
+                                                           Index count) {
+  _mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v);
+}
+
+template <>
+inline Packet4cf ploaduSegment<Packet4cf>(const std::complex<float>* from, Index begin, Index count) {
+  return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index begin,
+                                                           Index count) {
+  _mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v);
+}
+
+/*---------------- std::complex<double> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cd> : std::true_type {};
+
+template <>
+inline Packet2cd ploaduSegment<Packet2cd>(const std::complex<double>* from, Index begin, Index count) {
+  return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
+                                                            Index begin, Index count) {
+  _mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v);
+}
+
+/*---------------- end load/store segment support ----------------*/
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index b125d59..5b7285f 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -22,7 +22,19 @@ namespace Eigen {
 namespace internal {
 
 EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet8f)
-EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet4d)
+
+EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d)
+#ifdef EIGEN_VECTORIZE_AVX2
+EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
+#endif
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4d)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4d)
 
 // Notice that for newer processors, it is counterproductive to use Newton
 // iteration for square root. In particular, Skylake and Zen2 processors
@@ -85,6 +97,7 @@ EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& expone
 
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp2)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
@@ -94,8 +107,11 @@ BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp2)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
@@ -105,6 +121,7 @@ F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+#endif
 
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 3b94af5..0cd9e6c 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -118,17 +118,16 @@ struct packet_traits<float> : default_packet_traits {
     HasLog1p = 1,
     HasExpm1 = 1,
     HasExp = 1,
+    HasPow = 1,
     HasNdtri = 1,
     HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasBlend = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
+    HasErfc = EIGEN_FAST_MATH,
+    HasBlend = 1
   };
 };
 template <>
@@ -142,16 +141,22 @@ struct packet_traits<double> : default_packet_traits {
 
     HasCmp = 1,
     HasDiv = 1,
+#ifdef EIGEN_VECTORIZE_AVX2
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+#endif
+    HasTanh = EIGEN_FAST_MATH,
     HasLog = 1,
+    HasErf = 1,
+    HasErfc = 1,
     HasExp = 1,
+    HasPow = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasATan = 1,
-    HasBlend = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
+    HasATanh = 1,
+    HasBlend = 1
   };
 };
 
@@ -188,10 +193,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasBlend = 0,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1,
     HasBessel = 1,
     HasNdtri = 1
   };
@@ -231,10 +232,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasBlend = 0,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1,
     HasBessel = 1,
     HasNdtri = 1
   };
@@ -330,7 +327,9 @@ template <>
 struct unpacket_traits<Packet4d> {
   typedef double type;
   typedef Packet2d half;
+#ifdef EIGEN_VECTORIZE_AVX2
   typedef Packet4l integer_packet;
+#endif
   enum {
     size = 4,
     alignment = Aligned32,
@@ -560,7 +559,7 @@ EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmeti
 }
 template <int N>
 EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
-  return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
+  return _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
 }
 template <int N>
 EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
@@ -657,16 +656,7 @@ template <>
 EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
   return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
 }
-template <>
-EIGEN_STRONG_INLINE int64_t predux<Packet4l>(const Packet4l& a) {
-  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-  return _mm_extract_epi64_0(r) + _mm_extract_epi64_1(r);
-}
-template <>
-EIGEN_STRONG_INLINE uint64_t predux<Packet4ul>(const Packet4ul& a) {
-  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-  return numext::bit_cast<uint64_t>(_mm_extract_epi64_0(r) + _mm_extract_epi64_1(r));
-}
+
 #define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
   __m256d T0 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 15);
@@ -1190,7 +1180,7 @@ EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
 }
 #endif
 
-// Add specializations for min/max with prescribed NaN progation.
+// Add specializations for min/max with prescribed NaN propagation.
 template <>
 EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
@@ -1251,6 +1241,15 @@ EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) {
   return _mm256_floor_pd(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrunc<Packet8f>(const Packet8f& a) {
+  return _mm256_round_ps(a, _MM_FROUND_TRUNC);  // round each float lane toward zero
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d ptrunc<Packet4d>(const Packet4d& a) {
+  return _mm256_round_pd(a, _MM_FROUND_TRUNC);  // round each double lane toward zero
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
 #ifdef EIGEN_VECTORIZE_AVX2
@@ -1802,14 +1801,12 @@ EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
 // pabs should be ok
 template <>
 EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
-  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
-                                                              0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
   return _mm256_and_ps(a, mask);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
-  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF,
-                                                              0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
   return _mm256_and_pd(a, mask);
 }
 template <>
@@ -1827,30 +1824,37 @@ EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) {
   return a;
 }
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
-  return _mm_srai_epi16(a, 15);
+  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
 }
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
 template <>
 EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
-  return _mm_srai_epi16(a, 15);
+  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
-  return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a)));
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(_mm256_setzero_si256(), _mm256_castps_si256(a)));
+#else
+  return _mm256_castsi256_ps(parithmetic_shift_right<31>(Packet8i(_mm256_castps_si256(a))));
+#endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) {
-  return pzero(a);
+EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& /*unused*/) {
+  return _mm256_setzero_si256();
 }
 #ifdef EIGEN_VECTORIZE_AVX2
 template <>
 EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
-  return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a)));
+  return _mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_setzero_si256(), _mm256_castpd_si256(a)));
 }
 template <>
-EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) {
-  return pzero(a);
+EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& /*unused*/) {
+  return _mm256_setzero_si256();
 }
 #endif
 
@@ -1919,20 +1923,19 @@ EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d&
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
-  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
-}
-template <>
-EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) {
-  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1))));
-}
-template <>
-EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a) {
-  return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
-}
-template <>
-EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
-  return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
+EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+  // Clamp exponent to [-1023, 1024] so that the biased value e + 1023 stays within [0, 2047]
+  const Packet4d min_exponent = pset1<Packet4d>(-1023.0);
+  const Packet4d max_exponent = pset1<Packet4d>(1024.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, min_exponent), max_exponent));
+  const Packet4i bias = pset1<Packet4i>(1023);
+
+  // 2^e: place each biased exponent at bit 52 of its own 64-bit lane (two lanes in lo, two in hi)
+  Packet4i hi = vec4i_swizzle1(padd(e, bias), 0, 2, 1, 3);
+  const Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  const Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  return pmul(a, c);  // a * 2^e
+}
 
 template <>
@@ -1948,65 +1951,6 @@ EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a)
   return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
 }
 
-template <>
-EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) {
-  Packet8f tmp;
-  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
-  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) {
-  Packet4d tmp;
-  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
-  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) {
-  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
-  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) {
-  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
-  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) {
-  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
-  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
-  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) {
-  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
-  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
-}
-
-// not needed yet
-// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
-// {
-//   return _mm256_movemask_ps(x)==0xFF;
-// }
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) {
-  return _mm256_movemask_ps(x) != 0;
-}
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) {
-  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
-}
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x) {
-  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
-}
-
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
   __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
   __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
@@ -2126,40 +2070,29 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
   kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
 }
 
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) {  // 64-bit lane i = 0 - select[i]
+  return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1],
+                           0 - ifPacket.select[0]);
+}  // a lane is all-ones (-1) when select[i] == 1 and zero when select[i] == 0
+
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) {  // 32-bit lane i = 0 - select[i]
+  return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5],
+                          0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2],
+                          0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}  // a lane is all-ones (-1) when select[i] == 1 and zero when select[i] == 0
+
 template <>
 EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
                                     const Packet8f& elsePacket) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i select =
-      _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
-                       ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  __m256i false_mask = _mm256_cmpeq_epi32(zero, select);
-  return _mm256_blendv_ps(thenPacket, elsePacket, _mm256_castsi256_ps(false_mask));
-#else
-  const __m256 zero = _mm256_setzero_ps();
-  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
-                                      ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
-  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
-#endif
+  const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
+  return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
                                     const Packet4d& elsePacket) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i select =
-      _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  __m256i false_mask = _mm256_cmpeq_epi64(select, zero);
-  return _mm256_blendv_pd(thenPacket, elsePacket, _mm256_castsi256_pd(false_mask));
-#else
-  const __m256d zero = _mm256_setzero_pd();
-  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
-  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
-#endif
+  const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
+  return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
 }
 
 // Packet math for Eigen::half
@@ -2176,7 +2109,6 @@ struct unpacket_traits<Packet8h> {
   };
   typedef Packet8h half;
 };
-#endif
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
@@ -2314,24 +2246,69 @@ EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
   return float2half(pfloor<Packet8f>(half2float(a)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
+  return float2half(ptrunc<Packet8f>(half2float(a)));
+}
+
+template <>  // lane is all-ones when the 16-bit pattern with bit 15 cleared equals kInf
+EIGEN_STRONG_INLINE Packet8h pisinf<Packet8h>(const Packet8h& a) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;  // 0x7C00: bits 10..14 set
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;     // 0x7FFF: everything except bit 15
+  return _mm_cmpeq_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
+}
+
+template <>  // lane is all-ones when the pattern with bit 15 cleared is above kInf
+EIGEN_STRONG_INLINE Packet8h pisnan<Packet8h>(const Packet8h& a) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;  // 0x7C00: bits 10..14 set
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;     // 0x7FFF; masked values are non-negative as int16
+  return _mm_cmpgt_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
+}
+
+// convert the sign-magnitude representation to two's complement
+EIGEN_STRONG_INLINE __m128i pmaptosigned(const __m128i& a) {
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  // if 'a' has the sign bit set, clear the sign bit and negate the result as if it were an integer
+  return _mm_sign_epi16(_mm_and_si128(a, _mm_set1_epi16(kAbsMask)), a);
+}
+
+// return an all-ones lane wherever both `a` and `b` are not NaN, and a zero lane otherwise
+EIGEN_STRONG_INLINE Packet8h pisordered(const Packet8h& a, const Packet8h& b) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  __m128i abs_a = _mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask));
+  __m128i abs_b = _mm_and_si128(b.m_val, _mm_set1_epi16(kAbsMask));
+  // check if both `abs_a <= kInf` and `abs_b <= kInf` by checking if max(abs_a, abs_b) <= kInf
+  // there is no 16-bit integer `less than or equal` compare here, so test `< kInf + 1`, which is equivalent
+  return _mm_cmplt_epi16(_mm_max_epu16(abs_a, abs_b), _mm_set1_epi16(kInf + 1));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isEqual = _mm_cmpeq_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_and_si128(isOrdered, isEqual);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_le(half2float(a), half2float(b)));
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isGreater = _mm_cmpgt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_andnot_si128(isGreater, isOrdered);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_and_si128(isOrdered, isLess);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
-  return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
+  __m128i isUnordered = por(pisnan(a), pisnan(b));
+  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_or_si128(isUnordered, isLess);
 }
 
 template <>
@@ -2370,6 +2347,26 @@ EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b
   return float2half(rf);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
@@ -2406,36 +2403,6 @@ EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const
   to[stride * 7] = aux[7];
 }
 
-#ifndef EIGEN_VECTORIZE_AVX512FP16
-template <>
-EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-#endif
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_max<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_min<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template <>
-EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_mul<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
 template <>
 EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
   __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
@@ -2513,6 +2480,8 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
   kernel.packet[3] = pload<Packet8h>(out[3]);
 }
 
+#endif
+
 // BFloat16 implementation.
 
 EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
@@ -2689,6 +2658,11 @@ EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
   return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(ptrunc<Packet8f>(Bf16ToF32(a)));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
   return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
@@ -2735,6 +2709,26 @@ EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8b
   return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
@@ -2767,26 +2761,6 @@ EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packe
   to[stride * 7] = aux[7];
 }
 
-template <>
-EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
-  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
-}
-
-template <>
-EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
-  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
-}
-
-template <>
-EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
-  return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
-}
-
-template <>
-EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
-  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
-}
-
 template <>
 EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
   __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
@@ -2848,6 +2822,258 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
 }
 
+/*---------------- load/store segment support ----------------*/
+
+// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x8(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 4);
+  long long mask = 1;
+  mask <<= CHAR_BIT * count;  // mask = 2^(CHAR_BIT * count)
+  mask--;                     // the low `count` bytes are now all 1's
+  mask <<= CHAR_BIT * begin;  // move that run of bytes up so it starts at byte `begin`
+#if defined(_WIN32) && !defined(_WIN64)  // 32-bit Windows: go through memory instead of _mm_cvtsi64_si128
+  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+  return _mm_cvtsi64_si128(mask);  // mask occupies the low 64 bits of the result
+#endif
+}
+
+// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_8x8(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 8);
+  unsigned long long mask = 1;  // unsigned: shifting the set bit out of a signed type is UB before C++20
+  // shift in two halves: a single shift by 64 bits (count == 8) would itself be UB
+  mask <<= (CHAR_BIT / 2) * count;
+  mask <<= (CHAR_BIT / 2) * count;  // for count == 8 the bit falls off the top and mask wraps to 0
+  mask--;                           // low `count` bytes are all 1's (0 - 1 wraps to all 1's for count == 8)
+  mask <<= CHAR_BIT * begin;
+#if defined(_WIN32) && !defined(_WIN64)
+  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+  return _mm_cvtsi64_si128(static_cast<long long>(mask));
+#endif
+}
+
+// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x32(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 4);
+  return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count));  // sign-extend: each 0xFF byte -> all-ones 32-bit lane
+}
+
+// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_2x64(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 2);
+  return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count));  // sign-extend: each 0xFF byte -> all-ones 64-bit lane
+}
+
+// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_8x32(Index begin, Index count) {
+  __m128i mask_epi8 = segment_mask_8x8(begin, count);  // byte mask; the range check happens in segment_mask_8x8
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8);  // sign-extend 8 bytes to 8 x 32-bit lanes at once
+#else
+  __m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8);                      // bytes 0..3 -> lanes 0..3
+  __m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32));  // bytes 4..7 -> lanes 4..7
+  __m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1);
+#endif
+  return mask_epi32;
+}
+
+// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_4x64(Index begin, Index count) {
+  __m128i mask_epi8 = segment_mask_4x8(begin, count);  // byte mask; the range check happens in segment_mask_4x8
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8);  // sign-extend 4 bytes to 4 x 64-bit lanes at once
+#else
+  __m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8);                      // bytes 0..1 -> lanes 0..1
+  __m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16));  // bytes 2..3 -> lanes 2..3
+  __m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1);
+#endif
+  return mask_epi64;
+}
+
+/*---------------- float ----------------*/
+
+template <>
+struct has_packet_segment<Packet4f> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8f> : std::true_type {};
+
+template <>
+inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) {
+  return _mm_maskload_ps(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) {
+  _mm_maskstore_ps(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) {
+  return _mm256_maskload_ps(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) {
+  _mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from);
+}
+
+/*---------------- int32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4i> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8i> : std::true_type {};
+
+#ifdef EIGEN_VECTORIZE_AVX2  // AVX2 branch: use the integer maskload/maskstore intrinsics directly
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+  return _mm_maskload_epi32(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+  _mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+  return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+  _mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from);
+}
+
+#else  // no AVX2: reuse the float specializations and reinterpret the bits (casts do not convert values)
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+  return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+  pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+  return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+  pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count);
+}
+
+#endif
+
+/*---------------- uint32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ui> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8ui> : std::true_type {};
+
+template <>
+inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) {
+  return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) {
+  pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count);
+}
+
+template <>
+inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) {
+  return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) {
+  pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count);
+}
+
+/*---------------- double ----------------*/
+
+template <>
+struct has_packet_segment<Packet2d> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4d> : std::true_type {};
+
+template <>
+inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) {
+  return _mm_maskload_pd(from, segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) {
+  _mm_maskstore_pd(to, segment_mask_2x64(begin, count), from);
+}
+
+template <>
+inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) {
+  return _mm256_maskload_pd(from, segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) {
+  _mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from);
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+/*---------------- int64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet2l> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4l> : std::true_type {};
+
+template <>
+inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) {
+  return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) {
+  _mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from);
+}
+template <>
+inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) {
+  return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) {
+  _mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from);
+}
+
+/*---------------- uint64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ul> : std::true_type {};
+
+template <>
+inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) {
+  return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count));
+}
+template <>
+inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) {
+  pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count);
+}
+#endif
+
+/*---------------- end load/store segment support ----------------*/
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/Reductions.h b/Eigen/src/Core/arch/AVX/Reductions.h
new file mode 100644
index 0000000..237617c
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/Reductions.h
@@ -0,0 +1,353 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUCTIONS_AVX_H
+#define EIGEN_REDUCTIONS_AVX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8i -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int predux(const Packet8i& a) {
+  // Add the upper 128-bit half onto the lower half, then finish with the Packet4i reduction.
+  const Packet4i upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux(padd(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_mul(const Packet8i& a) {
+  // Multiply the two 128-bit halves lane by lane, then finish with the Packet4i product reduction.
+  const Packet4i upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux_mul(pmul(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_min(const Packet8i& a) {
+  // Take the lane-wise minimum of the two 128-bit halves, then reduce the remaining Packet4i.
+  const Packet4i upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux_min(pmin(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_max(const Packet8i& a) {
+  // Take the lane-wise maximum of the two 128-bit halves, then reduce the remaining Packet4i.
+  const Packet4i upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux_max(pmax(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_movemask_epi8(a) != 0x0;  // true if any byte of a has its most significant bit set
+#else
+  return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;  // true if any 32-bit lane has its top bit set
+#endif
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8ui -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux(const Packet8ui& a) {
+  // Add the upper 128-bit half onto the lower half, then finish with the Packet4ui reduction.
+  const Packet4ui upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux(padd(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet8ui& a) {
+  // Multiply the two 128-bit halves lane by lane, then finish with the Packet4ui product reduction.
+  const Packet4ui upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux_mul(pmul(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min(const Packet8ui& a) {
+  // Take the lane-wise minimum of the two 128-bit halves, then reduce the remaining Packet4ui.
+  const Packet4ui upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux_min(pmin(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max(const Packet8ui& a) {
+  // Take the lane-wise maximum of the two 128-bit halves, then reduce the remaining Packet4ui.
+  const Packet4ui upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux_max(pmax(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_movemask_epi8(a) != 0x0;  // true if any byte of a has its most significant bit set
+#else
+  return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;  // true if any 32-bit lane has its top bit set
+#endif
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4l -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux(const Packet4l& a) {
+  // Add the upper 128-bit half onto the lower half, then finish with the Packet2l reduction.
+  const Packet2l upper = _mm256_extractf128_si256(a, 1), lower = _mm256_castsi256_si128(a);
+  return predux(padd(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
+  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;  // true if any 64-bit lane has its top bit set
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ul -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE uint64_t predux(const Packet4ul& a) {
+  return static_cast<uint64_t>(predux(Packet4l(a)));  // reuse the Packet4l reduction, then cast the scalar back
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
+  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;  // true if any 64-bit lane has its top bit set
+}
+
+#endif
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet8f& a) {
+  // Add the upper 128-bit half onto the lower half, then finish with the Packet4f reduction.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux(padd(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) {
+  // Multiply the two 128-bit halves lane by lane, then finish with the Packet4f product reduction.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux_mul(pmul(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
+  // Take the lane-wise minimum of the two 128-bit halves (lower half first), then reduce the Packet4f.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux_min(pmin(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet8f& a) {
+  // Same halving scheme as the default predux_min, with PropagateNumbers applied at both steps.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet8f& a) {
+  // Same halving scheme as the default predux_min, with PropagateNaN applied at both steps.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
+  // Take the lane-wise maximum of the two 128-bit halves (lower half first), then reduce the Packet4f.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux_max(pmax(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet8f& a) {
+  // Same halving scheme as the default predux_max, with PropagateNumbers applied at both steps.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet8f& a) {
+  // Same halving scheme as the default predux_max, with PropagateNaN applied at both steps.
+  const Packet4f upper = _mm256_extractf128_ps(a, 1), lower = _mm256_castps256_ps128(a);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
+  return _mm256_movemask_ps(a) != 0x0;  // true if any float lane has its sign bit set
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4d -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE double predux(const Packet4d& a) {
+  // Add the upper 128-bit half onto the lower half, then finish with the Packet2d reduction.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux(padd(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) {
+  // Multiply the two 128-bit halves lane by lane, then finish with the Packet2d product reduction.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux_mul(pmul(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) {
+  // Take the lane-wise minimum of the two 128-bit halves (lower half first), then reduce the Packet2d.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux_min(pmin(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet4d& a) {
+  // Same halving scheme as the default predux_min, with PropagateNumbers applied at both steps.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet4d& a) {
+  // Same halving scheme as the default predux_min, with PropagateNaN applied at both steps.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) {
+  // Take the lane-wise maximum of the two 128-bit halves (lower half first), then reduce the Packet2d.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux_max(pmax(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet4d& a) {
+  // Same halving scheme as the default predux_max, with PropagateNumbers applied at both steps.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet4d& a) {
+  // Same halving scheme as the default predux_max, with PropagateNaN applied at both steps.
+  const Packet2d upper = _mm256_extractf128_pd(a, 1), lower = _mm256_castpd256_pd128(a);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lower, upper));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4d& a) {
+  return _mm256_movemask_pd(a) != 0x0;  // true if any double lane has its sign bit set
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8h -- -- -- -- -- -- -- -- -- -- -- -- */
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+
+template <>
+EIGEN_STRONG_INLINE half predux(const Packet8h& a) {
+  return static_cast<half>(predux(half2float(a)));  // sum the half2float(a) packet, then cast the scalar to half
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul(const Packet8h& a) {
+  return static_cast<half>(predux_mul(half2float(a)));  // multiply-reduce half2float(a), then cast the scalar to half
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min(const Packet8h& a) {
+  return static_cast<half>(predux_min(half2float(a)));  // min-reduce half2float(a), then cast the scalar to half
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet8h& a) {
+  return static_cast<half>(predux_min<PropagateNumbers>(half2float(a)));  // PropagateNumbers min of half2float(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet8h& a) {
+  return static_cast<half>(predux_min<PropagateNaN>(half2float(a)));  // PropagateNaN min of half2float(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max(const Packet8h& a) {
+  return static_cast<half>(predux_max(half2float(a)));  // max-reduce half2float(a), then cast the scalar to half
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet8h& a) {
+  return static_cast<half>(predux_max<PropagateNumbers>(half2float(a)));  // PropagateNumbers max of half2float(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet8h& a) {
+  return static_cast<half>(predux_max<PropagateNaN>(half2float(a)));  // PropagateNaN max of half2float(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8h& a) {
+  return _mm_movemask_epi8(a) != 0;  // true if any byte of a has its most significant bit set
+}
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8bf -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));  // sum as Packet8f, then cast the scalar to bfloat16
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));  // multiply-reduce as Packet8f, then cast back
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min(Bf16ToF32(a)));  // min-reduce Bf16ToF32(a), then cast the scalar to bfloat16
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(a)));  // PropagateNumbers min of Bf16ToF32(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(a)));  // PropagateNaN min of Bf16ToF32(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));  // max-reduce as Packet8f, then cast back
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(a)));  // PropagateNumbers max of Bf16ToF32(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(a)));  // PropagateNaN max of Bf16ToF32(a)
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& a) {
+  return _mm_movemask_epi8(a) != 0;  // true if any byte of a has its most significant bit set
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_REDUCTIONS_AVX_H
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 2581eff..5b73ffe 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -200,10 +200,38 @@ EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
 #if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL)
   return _mm256_cvttpd_epi64(a);
 #else
-  EIGEN_ALIGN16 double aux[4];
-  pstore(aux, a);
-  return _mm256_set_epi64x(static_cast<int64_t>(aux[3]), static_cast<int64_t>(aux[2]), static_cast<int64_t>(aux[1]),
-                           static_cast<int64_t>(aux[0]));
+
+  // if 'a' exceeds the numerical limits of int64_t, the behavior is undefined
+
+  // e < 0 corresponds to |a| < 1, which should result in zero (e == 0 covers 1 <= |a| < 2 and yields +/-1).
+  // intel variable-shift intrinsics produce zero for shift counts greater than or equal to 64, and a negative count is
+  // interpreted as a large unsigned count, which also results in zero. therefore, e does not need to be clamped to [0, 64)
+
+  constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
+                kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;
+
+  const __m256i cst_one = _mm256_set1_epi64x(1);
+  const __m256i cst_total_bits = _mm256_set1_epi64x(kTotalBits);
+  const __m256i cst_bias = _mm256_set1_epi64x(kBias);
+
+  __m256i a_bits = _mm256_castpd_si256(a);
+  // shift left by 1 to clear the sign bit, and shift right by kMantissaBits + 1 to recover biased exponent
+  __m256i biased_e = _mm256_srli_epi64(_mm256_slli_epi64(a_bits, 1), kMantissaBits + 1);
+  __m256i e = _mm256_sub_epi64(biased_e, cst_bias);
+
+  // shift to the left by kExponentBits + 1 to clear the sign and exponent bits
+  __m256i shifted_mantissa = _mm256_slli_epi64(a_bits, kExponentBits + 1);
+  // shift to the right by kTotalBits - e to convert the significand to an integer
+  __m256i result_significand = _mm256_srlv_epi64(shifted_mantissa, _mm256_sub_epi64(cst_total_bits, e));
+
+  // add the implied bit
+  __m256i result_exponent = _mm256_sllv_epi64(cst_one, e);
+  // e < 0 is interpreted as a large positive shift (2's complement), which also conveniently results in zero
+  __m256i result = _mm256_add_epi64(result_significand, result_exponent);
+  // handle negative arguments
+  __m256i sign_mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a_bits);
+  result = _mm256_sub_epi64(_mm256_xor_si256(result, sign_mask), sign_mask);
+  return result;
 #endif
 }
 
@@ -251,19 +279,21 @@ EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet4l>(const Packet4l& a)
 }
 #endif
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
   return half2float(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
-  return Bf16ToF32(a);
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
 }
+#endif
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
-  return float2half(a);
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+  return Bf16ToF32(a);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 0677248..04499a0 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,16 +47,16 @@ EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exp
 
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& _x) {
-  return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& x) {
+  return generic_sqrt_newton_step<Packet16f>::run(x, _mm512_rsqrt14_ps(x));
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& x) {
 #ifdef EIGEN_VECTORIZE_AVX512ER
-  return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
 #else
-  return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
 #endif
 }
 #else
@@ -80,19 +80,19 @@ EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
 #elif EIGEN_FAST_MATH
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& _x) {
-  return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(x, _mm512_rsqrt14_ps(x));
 }
 #endif
 
 // prsqrt for double.
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& x) {
 #ifdef EIGEN_VECTORIZE_AVX512ER
-  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
 #else
-  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
 #endif
 }
 
@@ -108,6 +108,7 @@ EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
 
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp2)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
@@ -117,8 +118,11 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp2)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
@@ -128,6 +132,7 @@ F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
 F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+#endif  // EIGEN_VECTORIZE_AVX512FP16
 
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
new file mode 100644
index 0000000..240ade4
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+#define EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+  // a becomes the low 256 bits of the result and b is inserted as the high 256 bits.
+  const __m512i low_only = _mm512_castsi256_si512(_mm256_castph_si256(a));
+  return _mm512_castsi512_ph(_mm512_inserti64x4(low_only, _mm256_castph_si256(b), 1));
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+  a = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_castph_si512(x)));  // a <- low 256 bits of x
+  b = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(_mm512_castph_si512(x), 1));  // b <- high 256 bits of x
+}
+
+#define EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(func)              \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet8h func<Packet8h>(const Packet8h& a) {    \
+    return float2half(func(half2float(a)));                           \
+  }                                                                   \
+                                                                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet16h func<Packet16h>(const Packet16h& a) { \
+    return float2half(func(half2float(a)));                           \
+  }                                                                   \
+                                                                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet32h func<Packet32h>(const Packet32h& a) { \
+    Packet16h low;                                                    \
+    Packet16h high;                                                   \
+    extract2Packet16h(a, low, high);                                  \
+    return combine2Packet16h(func(low), func(high));                  \
+  }
+// Each use defines func for Packet8h/Packet16h (half2float -> func -> float2half) and Packet32h (two Packet16h halves).
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(psin)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(pcos)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(plog)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(plog2)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(plog1p)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(pexp)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(pexpm1)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(pexp2)
+EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION(ptanh)
+#undef EIGEN_INTERNAL_GENERATE_FP16_MATH_FUNCTION
+
+// pfrexp for Packet32h: forwards a and exponent unchanged to pfrexp_generic.
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// pldexp for Packet32h: forwards a and exponent unchanged to pldexp_generic.
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
\ No newline at end of file
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index ed2f189..b76c8a7 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -34,12 +34,16 @@ namespace internal {
 typedef __m512 Packet16f;
 typedef __m512i Packet16i;
 typedef __m512d Packet8d;
-// TODO(rmlarsen): Add support for Packet8l.
+typedef eigen_packet_wrapper<__m512i, 1> Packet8l;
 #ifndef EIGEN_VECTORIZE_AVX512FP16
 typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
 #endif
 typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
 
+typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
+typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
+typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
+
 template <>
 struct is_arithmetic<__m512> {
   enum { value = true };
@@ -52,6 +56,10 @@ template <>
 struct is_arithmetic<__m512d> {
   enum { value = true };
 };
+template <>
+struct is_arithmetic<Packet8l> {
+  enum { value = true };  // same value as the is_arithmetic specializations for __m512 and __m512d nearby
+};
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
@@ -93,11 +101,7 @@ struct packet_traits<half> : default_packet_traits {
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasBlend = 0,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
+    HasBlend = 0
   };
 };
 #endif
@@ -124,21 +128,20 @@ struct packet_traits<float> : default_packet_traits {
     HasATanh = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasLog = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
     HasNdtri = 1,
     HasBessel = 1,
     HasExp = 1,
+    HasPow = 1,
     HasReciprocal = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasCmp = 1,
-    HasDiv = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
+    HasDiv = 1
   };
 };
 template <>
@@ -152,15 +155,19 @@ struct packet_traits<double> : default_packet_traits {
     HasBlend = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
     HasLog = 1,
     HasExp = 1,
+    HasPow = 1,
     HasATan = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasATanh = 1,
     HasCmp = 1,
-    HasDiv = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
+    HasDiv = 1
   };
 };
 
@@ -171,6 +178,13 @@ struct packet_traits<int> : default_packet_traits {
   enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
 };
 
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
+  typedef Packet8l type;  // full packet for int64_t; matches size = 8 below
+  typedef Packet4l half;  // half-size packet type
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 8 };  // other flags come from default_packet_traits
+};
+
 template <>
 struct unpacket_traits<Packet16f> {
   typedef float type;
@@ -190,6 +204,7 @@ template <>
 struct unpacket_traits<Packet8d> {
   typedef double type;
   typedef Packet4d half;
+  typedef Packet8l integer_packet;
   typedef uint8_t mask_t;
   enum {
     size = 8,
@@ -213,6 +228,19 @@ struct unpacket_traits<Packet16i> {
   };
 };
 
+template <>
+struct unpacket_traits<Packet8l> {
+  typedef int64_t type;   // scalar type held in each lane
+  typedef Packet4l half;  // half-size packet type
+  enum {
+    size = 8,  // number of int64_t lanes
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,  // no masked load is advertised for Packet8l
+    masked_store_available = false  // no masked store is advertised for Packet8l
+  };
+};
+
 #ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 struct unpacket_traits<Packet16h> {
@@ -228,6 +256,39 @@ struct unpacket_traits<Packet16h> {
 };
 #endif
 
+template <>
+struct unpacket_traits<Packet32s> {
+  typedef numext::int16_t type;  // scalar type held in each lane
+  typedef Packet16s half;        // half-size packet type
+  enum {
+    size = 32,  // number of int16_t lanes
+    alignment = Aligned64,
+    vectorizable = false,  // NOTE(review): presumably a helper packet only, not used for int16_t vectorization; confirm
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16s> {
+  typedef numext::int16_t type;  // scalar type held in each lane
+  typedef Packet8s half;         // half-size packet type
+  enum {
+    size = 16,  // number of int16_t lanes
+    alignment = Aligned32,
+    vectorizable = false,  // NOTE(review): presumably a helper packet only, not used for int16_t vectorization; confirm
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8s> {
+  typedef numext::int16_t type;  // scalar type held in each lane
+  typedef Packet8s half;         // half maps to Packet8s itself; no narrower int16_t packet is typedef'd alongside it
+  enum {
+    size = 8,  // number of int16_t lanes
+    alignment = Aligned16,
+    vectorizable = false,  // NOTE(review): presumably a helper packet only, not used for int16_t vectorization; confirm
+  };
+};
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
   return _mm512_set1_ps(from);
@@ -240,6 +301,10 @@ template <>
 EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
   return _mm512_set1_epi32(from);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l pset1<Packet8l>(const int64_t& from) {
+  return _mm512_set1_epi64(from);  // broadcast from to all eight 64-bit lanes
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
@@ -264,6 +329,11 @@ EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) {
   return _mm512_setzero_si512();
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8l pzero(const Packet8l& /*a*/) {
+  return _mm512_setzero_si512();  // the argument only selects the overload; its value is unused
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
   return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
@@ -276,6 +346,10 @@ template <>
 EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
   return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1));
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l peven_mask(const Packet8l& /*a*/) {
+  return _mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1);  // all-ones in 64-bit lanes 0, 2, 4, 6
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
@@ -313,6 +387,10 @@ template <>
 EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int& a) {
   return _mm512_add_epi32(_mm512_set1_epi32(a), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(const int64_t& a) {
+  return _mm512_add_epi64(_mm512_set1_epi64(a), _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0));  // lane i holds a + i
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
@@ -326,6 +404,10 @@ template <>
 EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_add_epi32(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l padd<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_add_epi64(a, b);  // lane-wise 64-bit integer addition
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b, uint16_t umask) {
@@ -350,6 +432,10 @@ template <>
 EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_sub_epi32(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l psub<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_sub_epi64(a, b);  // lane-wise 64-bit integer subtraction (a - b)
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
@@ -372,6 +458,10 @@ template <>
 EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) {
   return _mm512_sub_epi32(_mm512_setzero_si512(), a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l pnegate(const Packet8l& a) {
+  return _mm512_sub_epi64(_mm512_setzero_si512(), a);  // negation computed as 0 - a in each 64-bit lane
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
@@ -385,6 +475,10 @@ template <>
 EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
   return a;
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l pconj(const Packet8l& a) {
+  return a;  // returned unchanged, as in the Packet16i overload
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
@@ -398,6 +492,14 @@ template <>
 EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_mullo_epi32(a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l pmul<Packet8l>(const Packet8l& a, const Packet8l& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_mullo_epi64(a, b);  // low 64 bits of each 64-bit product; used when AVX512DQ is enabled
+#else
+  return _mm512_mullox_epi64(a, b);  // fallback intrinsic for the same low-64-bit product without AVX512DQ
+#endif
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, const Packet16f& b) {
@@ -466,6 +568,12 @@ EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask, const Packet16
   return _mm512_mask_blend_epi32(mask16, a, b);
 }
 
+template <>
+EIGEN_DEVICE_FUNC inline Packet8l pselect(const Packet8l& mask, const Packet8l& a, const Packet8l& b) {
+  __mmask8 mask8 = _mm512_cmpeq_epi64_mask(mask, _mm512_setzero_si512());  // bit i is set where mask lane i is zero
+  return _mm512_mask_blend_epi64(mask8, a, b);  // set bit -> take b, clear bit -> take a
+}
+
 template <>
 EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, const Packet8d& a, const Packet8d& b) {
   __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
@@ -486,6 +594,10 @@ template <>
 EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_min_epi32(b, a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l pmin<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_min_epi64(b, a);  // lane-wise signed 64-bit minimum
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, const Packet16f& b) {
@@ -501,8 +613,12 @@ template <>
 EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_max_epi32(b, a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8l pmax<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_max_epi64(b, a);  // lane-wise signed 64-bit maximum
+}
 
-// Add specializations for min/max with prescribed NaN progation.
+// Add specializations for min/max with prescribed NaN propagation.
 template <>
 EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
@@ -593,46 +709,62 @@ EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
 template <>
 EIGEN_STRONG_INLINE Packet16f pisnan(const Packet16f& a) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_UNORD_Q);
-  return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(mask, int32_t(-1)));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
   __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
-  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
+  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
   __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
-  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
+  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
   __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
-  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
+  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l pcmp_eq(const Packet8l& a, const Packet8l& b) {
+  const __mmask8 equal_lanes = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ);
+  return _mm512_maskz_set1_epi64(equal_lanes, int64_t(-1));  // all-ones where a == b, zero elsewhere
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pcmp_le(const Packet8l& a, const Packet8l& b) {
+  const __mmask8 le_lanes = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LE);
+  return _mm512_maskz_set1_epi64(le_lanes, int64_t(-1));  // all-ones where a <= b (signed), zero elsewhere
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pcmp_lt(const Packet8l& a, const Packet8l& b) {
+  const __mmask8 lt_lanes = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT);
+  return _mm512_maskz_set1_epi64(lt_lanes, int64_t(-1));  // all-ones where a < b (signed), zero elsewhere
 }
 
 template <>
@@ -683,9 +815,23 @@ EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) {
   return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
 }
 
+template <>  // Rounds every float lane toward zero.
+EIGEN_STRONG_INLINE Packet16f ptrunc<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);  // same intrinsic as pfloor, different rounding mode
+}
+template <>  // Rounds every double lane toward zero.
+EIGEN_STRONG_INLINE Packet8d ptrunc<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO);  // same intrinsic as pfloor, different rounding mode
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
-  return _mm512_set1_epi32(0xffffffffu);
+  return _mm512_set1_epi32(int32_t(-1));
+}
+
+template <>  // All-bits-set packet; the argument is ignored.
+EIGEN_STRONG_INLINE Packet8l ptrue<Packet8l>(const Packet8l& /*a*/) {
+  return _mm512_set1_epi64(int64_t(-1));  // -1 in two's complement sets all 64 bits of each lane
 }
 
 template <>
@@ -703,6 +849,11 @@ EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, const Packet16
   return _mm512_and_si512(a, b);
 }
 
+template <>  // Bitwise AND of two Packet8l values.
+EIGEN_STRONG_INLINE Packet8l pand<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_and_si512(a, b);  // whole-register operation, independent of lane width
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -732,6 +883,11 @@ EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i
   return _mm512_or_si512(a, b);
 }
 
+template <>  // Bitwise OR of two Packet8l values.
+EIGEN_STRONG_INLINE Packet8l por<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_or_si512(a, b);  // whole-register operation, independent of lane width
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -755,6 +911,11 @@ EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16
   return _mm512_xor_si512(a, b);
 }
 
+template <>  // Bitwise XOR of two Packet8l values.
+EIGEN_STRONG_INLINE Packet8l pxor<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_xor_si512(a, b);  // whole-register operation, independent of lane width
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -778,6 +939,11 @@ EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packe
   return _mm512_andnot_si512(b, a);
 }
 
+template <>  // Computes a & ~b on the full 512-bit register.
+EIGEN_STRONG_INLINE Packet8l pandnot<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_andnot_si512(b, a);  // the intrinsic complements its FIRST operand, hence (b, a)
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -825,6 +991,21 @@ EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
   return _mm512_slli_epi32(a, N);
 }
 
+template <int N>  // Shifts each 64-bit lane right by the compile-time count N.
+EIGEN_STRONG_INLINE Packet8l parithmetic_shift_right(Packet8l a) {
+  return _mm512_srai_epi64(a, N);  // arithmetic shift: vacated high bits copy the sign bit
+}
+
+template <int N>  // Shifts each 64-bit lane right by the compile-time count N.
+EIGEN_STRONG_INLINE Packet8l plogical_shift_right(Packet8l a) {
+  return _mm512_srli_epi64(a, N);  // logical shift: vacated high bits are zero-filled
+}
+
+template <int N>  // Shifts each 64-bit lane left by the compile-time count N.
+EIGEN_STRONG_INLINE Packet8l plogical_shift_left(Packet8l a) {
+  return _mm512_slli_epi64(a, N);  // vacated low bits are zero-filled
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@@ -835,7 +1016,11 @@ EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(reinterpret_cast<const __m512i*>(from));
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi32(from);  // epi32 form, matching ploadu/pstore for Packet16i
+}
+template <>  // Aligned load of eight int64_t values.
+EIGEN_STRONG_INLINE Packet8l pload<Packet8l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi64(from);  // intrinsic requires a 64-byte aligned address
 }
 
 template <>
@@ -848,7 +1033,11 @@ EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(from));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi32(from);
+}
+template <>  // Unaligned load of eight int64_t values.
+EIGEN_STRONG_INLINE Packet8l ploadu<Packet8l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi64(from);  // no alignment requirement on 'from'
 }
 
 template <>
@@ -868,42 +1057,35 @@ template <>
 EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
   // an unaligned load is required here as there is no requirement
   // on the alignment of input pointer 'from'
-  __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+  __m256i low_half = _mm256_castps_si256(_mm256_loadu_ps(from));
   __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
   __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
   return pairs;
 }
 
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-// FIXME: this does not look optimal, better load a Packet4d and shuffle...
-// Loads 4 doubles from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3,
+// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3,
 // a3}
 template <>
 EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
-  __m512d x = _mm512_setzero_pd();
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
-  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
-  return x;
+  Packet8d tmp = _mm512_castpd256_pd512(ploadu<Packet4d>(from));  // a0..a3 in the low half; upper half unspecified
+  const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);  // result lane i reads tmp[i / 2]
+  return _mm512_permutexvar_pd(scatter_mask, tmp);  // only indices 0..3 are used, so the upper half is never read
 }
-#else
+
+// Loads 4 int64_t from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3,
+// a3}
 template <>
-EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
-  __m512d x = _mm512_setzero_pd();
-  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 0, _mm_load_sd(from + 0));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 2, _mm_load_sd(from + 1));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 4, _mm_load_sd(from + 2));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 6, _mm_load_sd(from + 3));
-  return x;
+EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(const int64_t* from) {
+  Packet8l tmp = _mm512_castsi256_si512(ploadu<Packet4l>(from));  // a0..a3 in the low half; upper half unspecified
+  const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);  // result lane i reads tmp[i / 2]
+  return _mm512_permutexvar_epi64(scatter_mask, tmp);  // only indices 0..3 are used, so the upper half is never read
 }
-#endif
 
 // Loads 8 integers from memory and returns the packet
 // {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
 template <>
 EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int* from) {
-  __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+  __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));  // 'from' may be unaligned
   __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
   __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
   return _mm512_castps_si512(pairs);
@@ -929,6 +1111,17 @@ EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
   return _mm512_insertf64x4(tmp, lane1, 1);
 }
 
+// Loads 2 int64_t from memory and returns the packet
+// {a0, a0, a0, a0, a1, a1, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(const int64_t* from) {
+  __m256i lane0 = _mm256_set1_epi64x(*from);        // a0 broadcast to four 64-bit lanes
+  __m256i lane1 = _mm256_set1_epi64x(*(from + 1));  // a1 broadcast to four 64-bit lanes
+  __m512i tmp = _mm512_undefined_epi32();           // initial contents irrelevant: both halves are overwritten
+  tmp = _mm512_inserti64x4(tmp, lane0, 0);
+  return _mm512_inserti64x4(tmp, lane1, 1);
+}
+
 // Loads 4 integers from memory and returns the packet
 // {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
 template <>
@@ -948,7 +1141,11 @@ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), from);
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi32(to, from);
+}
+template <>  // Aligned store of a Packet8l to eight int64_t slots.
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet8l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi64(to, from);  // intrinsic requires a 64-byte aligned address
 }
 
 template <>
@@ -961,7 +1158,11 @@ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), from);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi32(to, from);
+}
+template <>  // Unaligned store of a Packet8l to eight int64_t slots.
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet8l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi64(to, from);  // no alignment requirement on 'to'
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
@@ -1015,6 +1216,14 @@ EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
   return _mm512_i32gather_pd(indices, from, 8);
 }
 template <>
+EIGEN_DEVICE_FUNC inline Packet8l pgather<int64_t, Packet8l>(const int64_t* from, Index stride) {
+  const Packet8i lane_ids = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  const Packet8i step = _mm256_set1_epi32(convert_index<int>(stride));
+  const Packet8i offsets = _mm256_mullo_epi32(lane_ids, step);  // element offsets 0, stride, ..., 7 * stride
+
+  return _mm512_i32gather_epi64(offsets, from, 8);  // scale 8: offsets count 8-byte elements
+}
+template <>
 EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from, Index stride) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -1043,7 +1252,6 @@ EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packe
   __mmask8 mask = static_cast<__mmask8>(umask);
   _mm512_mask_i32scatter_pd(to, mask, indices, from, 8);
 }
-
 template <>
 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
@@ -1059,6 +1267,13 @@ EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packe
   _mm512_i32scatter_pd(to, indices, from, 8);
 }
 template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet8l>(int64_t* to, const Packet8l& from, Index stride) {
+  const Packet8i lane_ids = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  const Packet8i step = _mm256_set1_epi32(convert_index<int>(stride));
+  const Packet8i offsets = _mm256_mullo_epi32(lane_ids, step);  // element offsets 0, stride, ..., 7 * stride
+  _mm512_i32scatter_epi64(to, offsets, from, 8);                // scale 8: offsets count 8-byte elements
+}
+template <>
 EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to, const Packet16i& from, Index stride) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -1081,6 +1296,11 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
   Packet16i pa = pset1<Packet16i>(a);
   pstore(to, pa);
 }
+template <>  // Fills one packet's worth of memory at 'to' with copies of 'a'.
+EIGEN_STRONG_INLINE void pstore1<Packet8l>(int64_t* to, const int64_t& a) {
+  Packet8l pa = pset1<Packet8l>(a);
+  pstore(to, pa);  // aligned store, same as the Packet16i overload above
+}
 
 template <>
 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
@@ -1097,15 +1317,24 @@ EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
 
 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
-  return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
+  return _mm512_cvtss_f32(a);
 }
 template <>
 EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
-  return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
+  return _mm512_cvtsd_f64(a);
+}
+template <>  // Returns lane 0 of the packet as a scalar.
+EIGEN_STRONG_INLINE int64_t pfirst<Packet8l>(const Packet8l& a) {
+  int64_t x = _mm_extract_epi64_0(_mm512_extracti32x4_epi32(a, 0));  // lowest 128-bit chunk; helper defined elsewhere
+  return x;
 }
 template <>
 EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
-  return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
+#if EIGEN_GNUC_STRICT_LESS_THAN(11, 0, 0)  // NOTE(review): presumably older GCC lacks _mm512_cvtsi512_si32 - confirm
+  return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));  // view the low 128 bits, then take their first int
+#else
+  return _mm512_cvtsi512_si32(a);
+#endif
 }
 
 template <>
@@ -1123,6 +1352,11 @@ EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a) {
   return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
 }
 
+template <>  // Reverses the order of the eight 64-bit lanes.
+EIGEN_STRONG_INLINE Packet8l preverse(const Packet8l& a) {
+  return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);  // result lane i = a[7 - i]
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
   // _mm512_abs_ps intrinsic not found, so hack around it
@@ -1137,11 +1371,18 @@ template <>
 EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a) {
   return _mm512_abs_epi32(a);
 }
+template <>  // Lane-wise absolute value of signed 64-bit integers.
+EIGEN_STRONG_INLINE Packet8l pabs(const Packet8l& a) {
+  return _mm512_abs_epi64(a);  // 64-bit counterpart of the Packet16i overload above
+}
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
   return _mm256_srai_epi16(a, 15);
 }
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
 template <>
 EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
   return _mm256_srai_epi16(a, 15);
@@ -1255,51 +1496,6 @@ EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d&
   OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
 #endif
 
-template <>
-EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
-  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
-  Packet8f x = _mm256_add_ps(lane0, lane1);
-  return predux<Packet8f>(x);
-#else
-  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
-  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
-  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
-  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
-  __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
-  sum = _mm_hadd_ps(sum, sum);
-  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
-  return _mm_cvtss_f32(sum);
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
-  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
-  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
-  __m256d sum = _mm256_add_pd(lane0, lane1);
-  __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
-  return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
-}
-template <>
-EIGEN_STRONG_INLINE int predux<Packet16i>(const Packet16i& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
-  __m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
-  Packet8i x = _mm256_add_epi32(lane0, lane1);
-  return predux<Packet8i>(x);
-#else
-  __m128i lane0 = _mm512_extracti32x4_epi32(a, 0);
-  __m128i lane1 = _mm512_extracti32x4_epi32(a, 1);
-  __m128i lane2 = _mm512_extracti32x4_epi32(a, 2);
-  __m128i lane3 = _mm512_extracti32x4_epi32(a, 3);
-  __m128i sum = _mm_add_epi32(_mm_add_epi32(lane0, lane1), _mm_add_epi32(lane2, lane3));
-  sum = _mm_hadd_epi32(sum, sum);
-  sum = _mm_hadd_epi32(sum, _mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(sum), 1)));
-  return _mm_cvtsi128_si32(sum);
-#endif
-}
-
 template <>
 EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -1340,84 +1536,10 @@ EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
-// #ifdef EIGEN_VECTORIZE_AVX512DQ
-#if 0
-  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
-  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
-  Packet8f res = pmul(lane0, lane1);
-  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
-  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-#else
-  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
-  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
-  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
-  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
-  __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
-  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-#endif
-}
-template <>
-EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
-  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
-  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
-  __m256d res = pmul(lane0, lane1);
-  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
-  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
-  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
-  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
-  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
-  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
-  __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
-  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
-  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
-  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
-  __m256d res = _mm256_min_pd(lane0, lane1);
-  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
-  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
-  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
-  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
-  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
-  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
-  __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
-  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
-}
-
-template <>
-EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
-  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
-  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
-  __m256d res = _mm256_max_pd(lane0, lane1);
-  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
-  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) {
-  Packet16i xi = _mm512_castps_si512(x);
-  __mmask16 tmp = _mm512_test_epi32_mask(xi, xi);
-  return !_mm512_kortestz(tmp, tmp);
-}
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x) {
-  __mmask16 tmp = _mm512_test_epi32_mask(x, x);
-  return !_mm512_kortestz(tmp, tmp);
+EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
+  const __m256i upper = _mm512_extracti64x4_epi64(a, 1);
+  const __m256i lower = _mm512_extracti64x4_epi64(a, 0);
+  return _mm256_add_epi64(lower, upper);  // result lane i = a[i] + a[i + 4]
 }
 
 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@@ -1617,6 +1739,10 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
   OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
   OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
 
+#define PACK_OUTPUT_L(OUTPUT, INPUT, INDEX, STRIDE) /* int64 packet form */ \
+  OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
+  OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
+
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
   __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
   __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
@@ -1695,6 +1821,88 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
   kernel.packet[7] = T7;
 }
 
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
+  __m512i T0 = _mm512_castpd_si512(  // even-indexed elements of rows 0 and 1, interleaved
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0));
+  __m512i T1 = _mm512_castpd_si512(  // odd-indexed elements of rows 0 and 1, interleaved
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0xff));
+  __m512i T2 = _mm512_castpd_si512(  // even-indexed elements of rows 2 and 3, interleaved
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0));
+  __m512i T3 = _mm512_castpd_si512(  // odd-indexed elements of rows 2 and 3, interleaved
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0xff));
+
+  PacketBlock<Packet4l, 8> tmp;  // 0x20 joins the low 128-bit halves of its two inputs, 0x31 the high halves
+
+  tmp.packet[0] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x31);
+
+  tmp.packet[4] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x31);
+
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 0, 1);  // kernel.packet[i] = {tmp.packet[2 * i], tmp.packet[2 * i + 1]}
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 3, 1);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
+  __m512i T0 = _mm512_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);  // stage 1: interleave adjacent row pairs
+  __m512i T1 = _mm512_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
+  __m512i T2 = _mm512_unpacklo_epi64(kernel.packet[2], kernel.packet[3]);
+  __m512i T3 = _mm512_unpackhi_epi64(kernel.packet[2], kernel.packet[3]);
+  __m512i T4 = _mm512_unpacklo_epi64(kernel.packet[4], kernel.packet[5]);
+  __m512i T5 = _mm512_unpackhi_epi64(kernel.packet[4], kernel.packet[5]);
+  __m512i T6 = _mm512_unpacklo_epi64(kernel.packet[6], kernel.packet[7]);
+  __m512i T7 = _mm512_unpackhi_epi64(kernel.packet[6], kernel.packet[7]);
+
+  kernel.packet[0] = _mm512_permutex_epi64(T2, 0x4E);  // stage 2: 0x4E swaps the 128-bit halves of each 256-bit half
+  kernel.packet[0] = _mm512_mask_blend_epi64(0xCC, T0, kernel.packet[0]);  // 0xCC: lanes 2,3,6,7 from 2nd source
+  kernel.packet[2] = _mm512_permutex_epi64(T0, 0x4E);
+  kernel.packet[2] = _mm512_mask_blend_epi64(0xCC, kernel.packet[2], T2);
+  kernel.packet[1] = _mm512_permutex_epi64(T3, 0x4E);
+  kernel.packet[1] = _mm512_mask_blend_epi64(0xCC, T1, kernel.packet[1]);
+  kernel.packet[3] = _mm512_permutex_epi64(T1, 0x4E);
+  kernel.packet[3] = _mm512_mask_blend_epi64(0xCC, kernel.packet[3], T3);
+  kernel.packet[4] = _mm512_permutex_epi64(T6, 0x4E);
+  kernel.packet[4] = _mm512_mask_blend_epi64(0xCC, T4, kernel.packet[4]);
+  kernel.packet[6] = _mm512_permutex_epi64(T4, 0x4E);
+  kernel.packet[6] = _mm512_mask_blend_epi64(0xCC, kernel.packet[6], T6);
+  kernel.packet[5] = _mm512_permutex_epi64(T7, 0x4E);
+  kernel.packet[5] = _mm512_mask_blend_epi64(0xCC, T5, kernel.packet[5]);
+  kernel.packet[7] = _mm512_permutex_epi64(T5, 0x4E);
+  kernel.packet[7] = _mm512_mask_blend_epi64(0xCC, kernel.packet[7], T7);
+
+  T0 = _mm512_shuffle_i64x2(kernel.packet[4], kernel.packet[4], 0x4E);  // stage 3: 0x4E swaps the 256-bit halves
+  T0 = _mm512_mask_blend_epi64(0xF0, kernel.packet[0], T0);             // 0xF0: upper four lanes from 2nd source
+  T4 = _mm512_shuffle_i64x2(kernel.packet[0], kernel.packet[0], 0x4E);
+  T4 = _mm512_mask_blend_epi64(0xF0, T4, kernel.packet[4]);
+  T1 = _mm512_shuffle_i64x2(kernel.packet[5], kernel.packet[5], 0x4E);
+  T1 = _mm512_mask_blend_epi64(0xF0, kernel.packet[1], T1);
+  T5 = _mm512_shuffle_i64x2(kernel.packet[1], kernel.packet[1], 0x4E);
+  T5 = _mm512_mask_blend_epi64(0xF0, T5, kernel.packet[5]);
+  T2 = _mm512_shuffle_i64x2(kernel.packet[6], kernel.packet[6], 0x4E);
+  T2 = _mm512_mask_blend_epi64(0xF0, kernel.packet[2], T2);
+  T6 = _mm512_shuffle_i64x2(kernel.packet[2], kernel.packet[2], 0x4E);
+  T6 = _mm512_mask_blend_epi64(0xF0, T6, kernel.packet[6]);
+  T3 = _mm512_shuffle_i64x2(kernel.packet[7], kernel.packet[7], 0x4E);
+  T3 = _mm512_mask_blend_epi64(0xF0, kernel.packet[3], T3);
+  T7 = _mm512_shuffle_i64x2(kernel.packet[3], kernel.packet[3], 0x4E);
+  T7 = _mm512_mask_blend_epi64(0xF0, T7, kernel.packet[7]);
+
+  kernel.packet[0] = T0;  // write the transposed rows back in place
+  kernel.packet[1] = T1;
+  kernel.packet[2] = T2;
+  kernel.packet[3] = T3;
+  kernel.packet[4] = T4;
+  kernel.packet[5] = T5;
+  kernel.packet[6] = T6;
+  kernel.packet[7] = T7;
+}
+
 #define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
   EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
 
@@ -1849,27 +2057,29 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
   PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1);
 }
 
+template <size_t N>  // Packs ifPacket.select[0..N) into the low N bits of the returned int.
+EIGEN_STRONG_INLINE int avx512_blend_mask(const Selector<N>& ifPacket) {
+  alignas(__m128i) uint8_t aux[sizeof(__m128i)] = {};  // zeroed: bytes >= N must not be read uninitialized
+  for (size_t i = 0; i < N; i++) aux[i] = static_cast<uint8_t>(ifPacket.select[i]);
+  __m128i paux = _mm_sub_epi8(_mm_setzero_si128(), _mm_load_si128(reinterpret_cast<const __m128i*>(aux)));
+  return _mm_movemask_epi8(paux);  // 0 - 1 == 0xFF, so the sign bit of each byte mirrors its selector
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket,
                                      const Packet16f& elsePacket) {
-  __mmask16 m = (ifPacket.select[0]) | (ifPacket.select[1] << 1) | (ifPacket.select[2] << 2) |
-                (ifPacket.select[3] << 3) | (ifPacket.select[4] << 4) | (ifPacket.select[5] << 5) |
-                (ifPacket.select[6] << 6) | (ifPacket.select[7] << 7) | (ifPacket.select[8] << 8) |
-                (ifPacket.select[9] << 9) | (ifPacket.select[10] << 10) | (ifPacket.select[11] << 11) |
-                (ifPacket.select[12] << 12) | (ifPacket.select[13] << 13) | (ifPacket.select[14] << 14) |
-                (ifPacket.select[15] << 15);
+  __mmask16 m = avx512_blend_mask(ifPacket);
   return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket,
                                     const Packet8d& elsePacket) {
-  __mmask8 m = (ifPacket.select[0]) | (ifPacket.select[1] << 1) | (ifPacket.select[2] << 2) |
-               (ifPacket.select[3] << 3) | (ifPacket.select[4] << 4) | (ifPacket.select[5] << 5) |
-               (ifPacket.select[6] << 6) | (ifPacket.select[7] << 7);
+  __mmask8 m = avx512_blend_mask(ifPacket);
   return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
 }
 
 // Packet math for Eigen::half
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
   return _mm256_set1_epi16(from.x);
@@ -1894,6 +2104,7 @@ template <>
 EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
   // (void*) -> workaround clang warning:
   // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  EIGEN_DEBUG_ALIGNED_STORE
   _mm256_store_si256((__m256i*)(void*)to, from);
 }
 
@@ -1901,6 +2112,7 @@ template <>
 EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
   // (void*) -> workaround clang warning:
   // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  EIGEN_DEBUG_UNALIGNED_STORE
   _mm256_storeu_si256((__m256i*)(void*)to, from);
 }
 
@@ -2002,6 +2214,11 @@ EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
   return float2half(pfloor<Packet16f>(half2float(a)));
 }
 
+template <>  // Truncation for half packets, done through the float implementation.
+EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
+  return float2half(ptrunc<Packet16f>(half2float(a)));  // widen, truncate as Packet16f, narrow back
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
@@ -2035,7 +2252,6 @@ EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
   return _mm256_xor_si256(a, sign_mask);
 }
 
-#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
@@ -2069,38 +2285,30 @@ EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16
 }
 
 template <>
-EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
-  Packet16f from_float = half2float(from);
-  return half(predux(from_float));
+EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
 }
 
-#endif
-
 template <>
-EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
-  Packet8h lane0 = _mm256_extractf128_si256(a, 0);
-  Packet8h lane1 = _mm256_extractf128_si256(a, 1);
-  return padd<Packet8h>(lane0, lane1);
+EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
-  Packet16f af = half2float(a);
-  float reduced = predux_max<Packet16f>(af);
-  return Eigen::half(reduced);
+EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
-  Packet16f af = half2float(a);
-  float reduced = predux_min<Packet16f>(af);
-  return Eigen::half(reduced);
+EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
-  Packet16f from_float = half2float(from);
-  return half(predux_mul(from_float));
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8h lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8h>(lane0, lane1);
 }
 
 template <>
@@ -2309,6 +2517,8 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
   kernel.packet[3] = pload<Packet16h>(out[3]);
 }
 
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
 template <>
 struct is_arithmetic<Packet16bf> {
   enum { value = true };
@@ -2380,11 +2590,13 @@ EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(const bfloat16* from) {
 
 template <>
 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE
   _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE
   _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
 }
 
@@ -2501,6 +2713,11 @@ EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
   return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16bf ptrunc<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(ptrunc<Packet16f>(Bf16ToF32(a)));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, const Packet16bf& b) {
   return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
@@ -2550,54 +2767,54 @@ EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
-  return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+  return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
-  return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
-  return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
-  return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
-  return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
+EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
-  Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
-  Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
-  return padd<Packet8bf>(lane0, lane1);
+EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE bfloat16 predux<Packet16bf>(const Packet16bf& p) {
-  return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(p)));
+EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet16bf>(const Packet16bf& from) {
-  return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
+EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE bfloat16 predux_min<Packet16bf>(const Packet16bf& from) {
-  return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
+EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
 }
 
 template <>
-EIGEN_STRONG_INLINE bfloat16 predux_max<Packet16bf>(const Packet16bf& from) {
-  return static_cast<bfloat16>(predux_max<Packet16f>(Bf16ToF32(from)));
+EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
+  Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8bf>(lane0, lane1);
 }
 
 template <>
@@ -2756,6 +2973,172 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 4>& kernel) {
   kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
 }
 
+// Minimal implementation of 16-bit int packets for use in pfrexp, pldexp.
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(const numext::int16_t& x) {
+  return _mm512_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(const numext::int16_t& x) {
+  return _mm256_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const numext::int16_t& x) {
+  return _mm_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  _mm512_store_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm256_store_epi32(out, x);
+#else
+  _mm256_store_si256(reinterpret_cast<__m256i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm_store_epi32(out, x);  // 128-bit aligned store: Packet8s is a 128-bit packet, not a 256-bit one
+#else
+  _mm_store_si128(reinterpret_cast<__m128i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm512_storeu_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), x);  // same unaligned store, no AVX512VL required
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), x);  // same unaligned store, no AVX512VL required
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s padd(const Packet32s& a, const Packet32s& b) {
+  return _mm512_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s padd(const Packet16s& a, const Packet16s& b) {
+  return _mm256_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) {
+  return _mm_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s psub(const Packet32s& a, const Packet32s& b) {
+  return _mm512_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s psub(const Packet16s& a, const Packet16s& b) {
+  return _mm256_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) {
+  return _mm_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pmul(const Packet32s& a, const Packet32s& b) {
+  return _mm512_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pmul(const Packet16s& a, const Packet16s& b) {
+  return _mm256_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) {
+  return _mm_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pnegate(const Packet32s& a) {
+  return _mm512_sub_epi16(_mm512_setzero_si512(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pnegate(const Packet16s& a) {
+  return _mm256_sub_epi16(_mm256_setzero_si256(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+  return _mm_sub_epi16(_mm_setzero_si128(), a);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) {
+  return _mm512_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) {
+  return _mm256_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
+  return _mm_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) {
+  return _mm512_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) {
+  return _mm256_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
+  return _mm_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) {
+  return _mm512_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) {
+  return _mm256_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
+  return _mm_srli_epi16(a, N);
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index 131e6f1..a040bbe 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-//
+// Copyright (C) 2025 The Eigen Authors.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,8 +18,8 @@ namespace Eigen {
 namespace internal {
 
 typedef __m512h Packet32h;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+typedef __m256h Packet16h;
+typedef __m128h Packet8h;
 
 template <>
 struct is_arithmetic<Packet8h> {
@@ -60,11 +60,7 @@ struct packet_traits<half> : default_packet_traits {
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = 0,  // EIGEN_FAST_MATH,
-    HasBlend = 0,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
+    HasBlend = 0
   };
 };
 
@@ -72,6 +68,7 @@ template <>
 struct unpacket_traits<Packet32h> {
   typedef Eigen::half type;
   typedef Packet16h half;
+  typedef Packet32s integer_packet;
   enum {
     size = 32,
     alignment = Aligned64,
@@ -85,6 +82,7 @@ template <>
 struct unpacket_traits<Packet16h> {
   typedef Eigen::half type;
   typedef Packet8h half;
+  typedef Packet16s integer_packet;
   enum {
     size = 16,
     alignment = Aligned32,
@@ -98,6 +96,7 @@ template <>
 struct unpacket_traits<Packet8h> {
   typedef Eigen::half type;
   typedef Packet8h half;
+  typedef Packet8s integer_packet;
   enum {
     size = 8,
     alignment = Aligned16,
@@ -107,13 +106,48 @@ struct unpacket_traits<Packet8h> {
   };
 };
 
+// Conversions
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { return _mm256_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { return _mm512_cvtxps_ph(a); }
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { return _mm256_cvtxps_ph(a); }
+
 // Memory functions
 
 // pset1
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
-  return _mm512_set1_ph(static_cast<_Float16>(from));
+  return _mm512_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  return _mm256_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  return _mm_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pzero(const Packet32h& /*a*/) {
+  return _mm512_setzero_ph();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pzero(const Packet16h& /*a*/) {
+  return _mm256_setzero_ph();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pzero(const Packet8h& /*a*/) {
+  return _mm_setzero_ph();
 }
 
 // pset1frombits
@@ -122,18 +156,31 @@ EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
   return _mm512_castsi512_ph(_mm512_set1_epi16(from));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1frombits<Packet16h>(unsigned short from) {
+  return _mm256_castsi256_ph(_mm256_set1_epi16(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1frombits<Packet8h>(unsigned short from) {
+  return _mm_castsi128_ph(_mm_set1_epi16(from));
+}
+
 // pfirst
 
 template <>
 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  return half_impl::raw_uint16_to_half(
-      static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
-#else
-  Eigen::half dest[32];
-  _mm512_storeu_ph(dest, from);
-  return dest[0];
-#endif
+  return Eigen::half(_mm512_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return Eigen::half(_mm256_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return Eigen::half(_mm_cvtsh_h(from));
 }
 
 // pload
@@ -143,6 +190,16 @@ EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ph(from);
+}
+
 // ploadu
 
 template <>
@@ -150,6 +207,16 @@ EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ph(from);
+}
+
 // pstore
 
 template <>
@@ -157,6 +224,16 @@ EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
 }
 
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet8h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ph(to, from);
+}
+
 // pstoreu
 
 template <>
@@ -164,6 +241,16 @@ EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
 }
 
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet8h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ph(to, from);
+}
+
 // ploaddup
 template <>
 EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
@@ -173,6 +260,17 @@ EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
                                a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
+  __m256h a = _mm256_castph128_ph256(_mm_loadu_ph(from));
+  return _mm256_permutexvar_ph(_mm256_set_epi16(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
+  return _mm_set_ph(from[3].x, from[3].x, from[2].x, from[2].x, from[1].x, from[1].x, from[0].x, from[0].x);
+}
+
 // ploadquad
 template <>
 EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
@@ -182,6 +280,17 @@ EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
       a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) {
+  return _mm256_set_ph(from[3].x, from[3].x, from[3].x, from[3].x, from[2].x, from[2].x, from[2].x, from[2].x,
+                       from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
+  return _mm_set_ph(from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
+
 // pabs
 
 template <>
@@ -189,6 +298,16 @@ EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
   return _mm512_abs_ph(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs<Packet16h>(const Packet16h& a) {
+  return _mm256_abs_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs<Packet8h>(const Packet8h& a) {
+  return _mm_abs_ph(a);
+}
+
 // psignbit
 
 template <>
@@ -196,6 +315,16 @@ EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
   return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h psignbit<Packet16h>(const Packet16h& a) {
+  return _mm256_castsi256_ph(_mm256_srai_epi16(_mm256_castph_si256(a), 15));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psignbit<Packet8h>(const Packet8h& a) {
+  return _mm_castsi128_ph(_mm_srai_epi16(_mm_castph_si128(a), 15));
+}
+
 // pmin
 
 template <>
@@ -203,6 +332,16 @@ EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32
   return _mm512_min_ph(a, b);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_min_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_min_ph(a, b);
+}
+
 // pmax
 
 template <>
@@ -210,13 +349,31 @@ EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32
   return _mm512_max_ph(a, b);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_max_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_max_ph(a, b);
+}
+
 // plset
 template <>
 EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
-  return _mm512_add_ph(_mm512_set1_ph(a),
-                       _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f,
-                                     19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
-                                     7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+  return _mm512_add_ph(pset1<Packet32h>(a), _mm512_set_ph(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+                                                          16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+  return _mm256_add_ph(pset1<Packet16h>(a), _mm256_set_ph(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+  return _mm_add_ph(pset1<Packet8h>(a), _mm_set_ph(7, 6, 5, 4, 3, 2, 1, 0));
 }
 
 // por
@@ -226,6 +383,16 @@ EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
   return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_or_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_or_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
 // pxor
 
 template <>
@@ -233,6 +400,16 @@ EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
   return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
 // pand
 
 template <>
@@ -240,6 +417,16 @@ EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
   return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_and_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_and_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
 // pandnot
 
 template <>
@@ -247,6 +434,16 @@ EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
   return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_andnot_si256(_mm256_castph_si256(b), _mm256_castph_si256(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_andnot_si128(_mm_castph_si128(b), _mm_castph_si128(a)));
+}
+
 // pselect
 
 template <>
@@ -255,12 +452,36 @@ EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32
   return _mm512_mask_blend_ph(mask32, a, b);
 }
 
+template <>
+EIGEN_DEVICE_FUNC inline Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask16 = _mm256_cmp_epi16_mask(_mm256_castph_si256(mask), _mm256_setzero_si256(), _MM_CMPINT_EQ);
+  return _mm256_mask_blend_ph(mask16, a, b);  // mask16 flags all-zero lanes of `mask`: those take b, the rest a
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask8 = _mm_cmp_epi16_mask(_mm_castph_si128(mask), _mm_setzero_si128(), _MM_CMPINT_EQ);
+  return _mm_mask_blend_ph(mask8, a, b);  // mask8 flags all-zero lanes of `mask`: those take b, the rest a
+}
+
 // pcmp_eq
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
 }
 
 // pcmp_le
@@ -268,7 +489,19 @@ EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
 template <>
 EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
 }
 
 // pcmp_lt
@@ -276,7 +509,19 @@ EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
 template <>
 EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
 }
 
 // pcmp_lt_or_nan
@@ -284,7 +529,19 @@ EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
 template <>
 EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
   __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 0xffffu));
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
 }
 
 // padd
@@ -296,12 +553,12 @@ EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32
 
 template <>
 EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_add_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  return _mm_add_ph(a, b);
 }
 
 // psub
@@ -313,12 +570,12 @@ EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32
 
 template <>
 EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_sub_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  return _mm_sub_ph(a, b);
 }
 
 // pmul
@@ -330,12 +587,12 @@ EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_mul_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  return _mm_mul_ph(a, b);
 }
 
 // pdiv
@@ -347,12 +604,13 @@ EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
+  return _mm256_div_ph(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
+  // Packet8h is __m128h in this file, so the FP16 divide applies directly without casts.
+  return _mm_div_ph(a, b);
 }
 
 // pround
@@ -361,14 +619,40 @@ template <>
 EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
   // Work-around for default std::round rounding mode.
 
-  // Mask for the sign bit
-  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
-  // The largest half-preicision float less than 0.5
+  // Mask for the sign bit.
+  const Packet32h signMask =
+      pset1frombits<Packet32h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5.
   const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
 
   return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+  // Work-around for default std::round rounding mode: add a magnitude just below 0.5 that carries
+  // the sign of each lane of a, then truncate the sum toward zero.
+  // Mask for the sign bit (bit 15 of each 16-bit lane).
+  const Packet16h signMask =
+      pset1frombits<Packet16h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5 (bit pattern 0x37FF).
+  const Packet16h prev0dot5 = pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm256_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+  // Work-around for default std::round rounding mode: add a magnitude just below 0.5 that carries
+  // the sign of each lane of a, then truncate the sum toward zero.
+  // Mask for the sign bit (bit 15 of each 16-bit lane).
+  const Packet8h signMask = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5 (bit pattern 0x37FF).
+  const Packet8h prev0dot5 = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
 // print
 
 template <>
@@ -376,6 +660,16 @@ EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
   return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
 // pceil
 
 template <>
@@ -383,6 +677,16 @@ EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
   return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
 // pfloor
 
 template <>
@@ -390,47 +694,116 @@ EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
   return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// ptrunc
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ptrunc<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
 // predux
 template <>
 EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
-  return (half)_mm512_reduce_add_ph(a);
+  return half(_mm512_reduce_add_ph(a));
 }
 
 template <>
 EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
-  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
+  return half(_mm256_reduce_add_ph(a));
 }
 
 template <>
 EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
-  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
+  return half(_mm_reduce_add_ph(a));
 }
 
 // predux_half_dowto4
 template <>
 EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
-  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
+  const __m512i bits = _mm512_castph_si512(a);
+  Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits));
+  Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1));
+  return padd(lo, hi);
+}
 
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#else
-  Eigen::half data[32];
-  _mm512_storeu_ph(data, a);
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a)));  // lower 128 bits of a
+  Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1));  // upper 128 bits of a
+  return padd(lo, hi);  // lane k of the result is a[k] + a[k + 8]
+}
 
-  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
-  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
+// predux_max
 
-  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
-#endif
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_max_ph(a));
 }
 
-// predux_max
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_max_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_max_ph(a));
+}
 
 // predux_min
 
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_min_ph(a));
+}
+
 // predux_mul
 
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_mul_ph(a));
+}
+
 #ifdef EIGEN_VECTORIZE_FMA
 
 // pmadd
@@ -442,12 +815,12 @@ EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, cons
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fmadd_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fmadd_ph(a, b, c);
 }
 
 // pmsub
@@ -459,12 +832,12 @@ EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, cons
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fmsub_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fmsub_ph(a, b, c);
 }
 
 // pnmadd
@@ -476,12 +849,12 @@ EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, con
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fnmadd_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fnmadd_ph(a, b, c);
 }
 
 // pnmsub
@@ -493,12 +866,12 @@ EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, con
 
 template <>
 EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
-  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
+  return _mm256_fnmsub_ph(a, b, c);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
-  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
+  return _mm_fnmsub_ph(a, b, c);
 }
 
 #endif
@@ -507,35 +880,74 @@ EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const
 
 template <>
 EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
-  return _mm512_sub_ph(_mm512_set1_ph(0.0), a);
+  return _mm512_castsi512_ph(
+      _mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
 }
 
-// pconj
+template <>
+EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) {
+  return _mm256_castsi256_ph(
+      _mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
 
 template <>
-EIGEN_STRONG_INLINE Packet32h pconj<Packet32h>(const Packet32h& a) {
-  return a;
+EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) {
+  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
 }
 
+// pconj
+
+// Nothing, packets are real.
+
 // psqrt
 
 template <>
 EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_sqrt_ph(a);
+  return generic_sqrt_newton_step<Packet32h>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psqrt<Packet16h>(const Packet16h& a) {
+  return generic_sqrt_newton_step<Packet16h>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psqrt<Packet8h>(const Packet8h& a) {
+  return generic_sqrt_newton_step<Packet8h>::run(a, _mm_rsqrt_ph(a));
 }
 
 // prsqrt
 
 template <>
 EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
-  return _mm512_rsqrt_ph(a);
+  return generic_rsqrt_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h prsqrt<Packet16h>(const Packet16h& a) {
+  return generic_rsqrt_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h prsqrt<Packet8h>(const Packet8h& a) {
+  return generic_rsqrt_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rsqrt_ph(a));
 }
 
 // preciprocal
 
 template <>
 EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
-  return _mm512_rcp_ph(a);
+  return generic_reciprocal_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h preciprocal<Packet16h>(const Packet16h& a) {
+  return generic_reciprocal_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preciprocal<Packet8h>(const Packet8h& a) {
+  return generic_reciprocal_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rcp_ph(a));
 }
 
 // ptranspose
@@ -656,6 +1068,246 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
   a.packet[3] = _mm512_castsi512_ph(a3);
 }
 
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
+  __m256i a = _mm256_castph_si256(kernel.packet[0]);
+  __m256i b = _mm256_castph_si256(kernel.packet[1]);
+  __m256i c = _mm256_castph_si256(kernel.packet[2]);
+  __m256i d = _mm256_castph_si256(kernel.packet[3]);
+  __m256i e = _mm256_castph_si256(kernel.packet[4]);
+  __m256i f = _mm256_castph_si256(kernel.packet[5]);
+  __m256i g = _mm256_castph_si256(kernel.packet[6]);
+  __m256i h = _mm256_castph_si256(kernel.packet[7]);
+  __m256i i = _mm256_castph_si256(kernel.packet[8]);
+  __m256i j = _mm256_castph_si256(kernel.packet[9]);
+  __m256i k = _mm256_castph_si256(kernel.packet[10]);
+  __m256i l = _mm256_castph_si256(kernel.packet[11]);
+  __m256i m = _mm256_castph_si256(kernel.packet[12]);
+  __m256i n = _mm256_castph_si256(kernel.packet[13]);
+  __m256i o = _mm256_castph_si256(kernel.packet[14]);
+  __m256i p = _mm256_castph_si256(kernel.packet[15]);
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0] = _mm256_castsi256_ph(a_p_0);
+  kernel.packet[1] = _mm256_castsi256_ph(a_p_1);
+  kernel.packet[2] = _mm256_castsi256_ph(a_p_2);
+  kernel.packet[3] = _mm256_castsi256_ph(a_p_3);
+  kernel.packet[4] = _mm256_castsi256_ph(a_p_4);
+  kernel.packet[5] = _mm256_castsi256_ph(a_p_5);
+  kernel.packet[6] = _mm256_castsi256_ph(a_p_6);
+  kernel.packet[7] = _mm256_castsi256_ph(a_p_7);
+  kernel.packet[8] = _mm256_castsi256_ph(a_p_8);
+  kernel.packet[9] = _mm256_castsi256_ph(a_p_9);
+  kernel.packet[10] = _mm256_castsi256_ph(a_p_a);
+  kernel.packet[11] = _mm256_castsi256_ph(a_p_b);
+  kernel.packet[12] = _mm256_castsi256_ph(a_p_c);
+  kernel.packet[13] = _mm256_castsi256_ph(a_p_d);
+  kernel.packet[14] = _mm256_castsi256_ph(a_p_e);
+  kernel.packet[15] = _mm256_castsi256_ph(a_p_f);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];  // scalar staging copy of the 8 input packets
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];  // shuffled result, loaded back into kernel below
+
+  for (int i = 0; i < 8; ++i) {  // output packet i holds input columns 2*i and 2*i+1 back to back
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2 * i];  // lanes 0..7: element 2*i of each input packet
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j + 8] = in[j][2 * i + 1];  // lanes 8..15: element 2*i+1 of each input packet
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];  // scalar staging copy of the 4 input packets
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];  // shuffled result, loaded back into kernel below
+
+  for (int i = 0; i < 4; ++i) {  // output packet i holds input columns 4*i .. 4*i+3 back to back
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4 * i];  // lanes 0..3: element 4*i of each input packet
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][4 * i + 1];  // lanes 4..7: element 4*i+1 of each input packet
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 8] = in[j][4 * i + 2];  // lanes 8..11: element 4*i+2 of each input packet
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 12] = in[j][4 * i + 3];  // lanes 12..15: element 4*i+3 of each input packet
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
+  __m128i a = _mm_castph_si128(kernel.packet[0]);
+  __m128i b = _mm_castph_si128(kernel.packet[1]);
+  __m128i c = _mm_castph_si128(kernel.packet[2]);
+  __m128i d = _mm_castph_si128(kernel.packet[3]);
+  __m128i e = _mm_castph_si128(kernel.packet[4]);
+  __m128i f = _mm_castph_si128(kernel.packet[5]);
+  __m128i g = _mm_castph_si128(kernel.packet[6]);
+  __m128i h = _mm_castph_si128(kernel.packet[7]);
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0] = _mm_castsi128_ph(a0b0c0d0e0f0g0h0);
+  kernel.packet[1] = _mm_castsi128_ph(a1b1c1d1e1f1g1h1);
+  kernel.packet[2] = _mm_castsi128_ph(a2b2c2d2e2f2g2h2);
+  kernel.packet[3] = _mm_castsi128_ph(a3b3c3d3e3f3g3h3);
+  kernel.packet[4] = _mm_castsi128_ph(a4b4c4d4e4f4g4h4);
+  kernel.packet[5] = _mm_castsi128_ph(a5b5c5d5e5f5g5h5);
+  kernel.packet[6] = _mm_castsi128_ph(a6b6c6d6e6f6g6h6);
+  kernel.packet[7] = _mm_castsi128_ph(a7b7c7d7e7f7g7h7);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];  // scalar staging copy of the 4 input packets
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];  // shuffled result, loaded back into kernel below
+
+  for (int i = 0; i < 4; ++i) {  // output packet i holds input columns 2*i and 2*i+1 back to back
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2 * i];  // lanes 0..3: element 2*i of each input packet
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][2 * i + 1];  // lanes 4..7: element 2*i+1 of each input packet
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
 // preverse
 
 template <>
@@ -665,6 +1317,20 @@ EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
                                a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);  // reverse 16-bit lanes
+  return _mm256_castsi256_ph(_mm256_insertf128_si256(  // swap the 128-bit halves, each lane-reversed
+      _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 1), m)),
+      _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 0), m), 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);  // reverse 16-bit lanes
+  return _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a), m));  // byte shuffle on the raw bits
+}
+
 // pscatter
 
 template <>
@@ -677,191 +1343,68 @@ EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& fr
     to[stride * i] = aux[i];
   }
 }
-
-// pgather
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
-  return _mm512_castsi512_ph(_mm512_set_epi16(
-      from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
-      from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
-      from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
-      from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
-      from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
-      from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
-      from[1 * stride].x, from[0 * stride].x));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcos<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h psin<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog2<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h plog1p<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexp<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pexpm1<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h ptanh<Packet16h>(const Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pfrexp<Packet16h>(const Packet16h&, Packet16h&);
-template <>
-EIGEN_STRONG_INLINE Packet16h pldexp<Packet16h>(const Packet16h&, const Packet16h&);
-
-EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
-  __m512d result = _mm512_undefined_pd();
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
-  return _mm512_castpd_ph(result);
-}
-
-EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
-  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
-  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
-}
-
-// psin
-template <>
-EIGEN_STRONG_INLINE Packet32h psin<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = psin(low);
-  Packet16h highOut = psin(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pcos
 template <>
-EIGEN_STRONG_INLINE Packet32h pcos<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pcos(low);
-  Packet16h highOut = pcos(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog
-template <>
-EIGEN_STRONG_INLINE Packet32h plog<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog(low);
-  Packet16h highOut = plog(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog2
-template <>
-EIGEN_STRONG_INLINE Packet32h plog2<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog2(low);
-  Packet16h highOut = plog2(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// plog1p
-template <>
-EIGEN_STRONG_INLINE Packet32h plog1p<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = plog1p(low);
-  Packet16h highOut = plog1p(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
-
-// pexp
-template <>
-EIGEN_STRONG_INLINE Packet32h pexp<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexp(low);
-  Packet16h highOut = pexp(high);
-
-  return combine2Packet16h(lowOut, highOut);
+EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+  to[stride * 8] = aux[8];
+  to[stride * 9] = aux[9];
+  to[stride * 10] = aux[10];
+  to[stride * 11] = aux[11];
+  to[stride * 12] = aux[12];
+  to[stride * 13] = aux[13];
+  to[stride * 14] = aux[14];
+  to[stride * 15] = aux[15];
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
 }
 
-// pexpm1
-template <>
-EIGEN_STRONG_INLINE Packet32h pexpm1<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = pexpm1(low);
-  Packet16h highOut = pexpm1(high);
-
-  return combine2Packet16h(lowOut, highOut);
-}
+// pgather
 
-// ptanh
 template <>
-EIGEN_STRONG_INLINE Packet32h ptanh<Packet32h>(const Packet32h& a) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h lowOut = ptanh(low);
-  Packet16h highOut = ptanh(high);
-
-  return combine2Packet16h(lowOut, highOut);
+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
+  return _mm512_set_ph(from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x,
+                       from[27 * stride].x, from[26 * stride].x, from[25 * stride].x, from[24 * stride].x,
+                       from[23 * stride].x, from[22 * stride].x, from[21 * stride].x, from[20 * stride].x,
+                       from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, from[16 * stride].x,
+                       from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                       from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                       from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                       from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
 }
 
-// pfrexp
 template <>
-EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1 = _mm256_undefined_si256();
-  Packet16h exp2 = _mm256_undefined_si256();
-
-  Packet16h lowOut = pfrexp(low, exp1);
-  Packet16h highOut = pfrexp(high, exp2);
-
-  exponent = combine2Packet16h(exp1, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
+EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
+  return _mm256_set_ph(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                       from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                       from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                       from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
 }
 
-// pldexp
 template <>
-EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
-  Packet16h low;
-  Packet16h high;
-  extract2Packet16h(a, low, high);
-
-  Packet16h exp1;
-  Packet16h exp2;
-  extract2Packet16h(exponent, exp1, exp2);
-
-  Packet16h lowOut = pldexp(low, exp1);
-  Packet16h highOut = pldexp(high, exp2);
-
-  return combine2Packet16h(lowOut, highOut);
+EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
+  return _mm_set_ph(from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x,
+                    from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
 }
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX512/Reductions.h b/Eigen/src/Core/arch/AVX512/Reductions.h
new file mode 100644
index 0000000..f7b4c25
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/Reductions.h
@@ -0,0 +1,297 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUCTIONS_AVX512_H
+#define EIGEN_REDUCTIONS_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16i -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int predux(const Packet16i& a) {
+  return _mm512_reduce_add_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_mul(const Packet16i& a) {
+  return _mm512_reduce_mul_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_min(const Packet16i& a) {
+  return _mm512_reduce_min_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_max(const Packet16i& a) {
+  return _mm512_reduce_max_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) {
+  return _mm512_reduce_or_epi32(a) != 0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8l -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux(const Packet8l& a) {
+  return _mm512_reduce_add_epi64(a);
+}
+
+#if EIGEN_COMP_MSVC
+// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
+//    alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
+//    int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
+// produces garbage: 4294967295.  It seems to happen whenever the output is supposed to be negative.
+// Fall back to a manual approach:
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) {
+  Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
+  Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
+  return predux_mul(pmul(lane0, lane1));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
+  return _mm512_reduce_mul_epi64(a);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min(const Packet8l& a) {
+  return _mm512_reduce_min_epi64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max(const Packet8l& a) {
+  return _mm512_reduce_max_epi64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) {
+  return _mm512_reduce_or_epi64(a) != 0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet16f& a) {
+  return _mm512_reduce_add_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) {
+  return _mm512_reduce_mul_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
+  return _mm512_reduce_min_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
+  return _mm512_reduce_max_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
+  return _mm512_reduce_or_epi32(_mm512_castps_si512(a)) != 0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8d -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE double predux(const Packet8d& a) {
+  return _mm512_reduce_add_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) {
+  return _mm512_reduce_mul_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) {
+  return _mm512_reduce_min_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) {
+  return _mm512_reduce_max_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8d& a) {
+  return _mm512_reduce_or_epi64(_mm512_castpd_si512(a)) != 0;
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16h -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE half predux(const Packet16h& from) {
+  return half(predux(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) {
+  return half(predux_mul(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) {
+  return half(predux_min(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet16h& from) {
+  return half(predux_min<PropagateNumbers>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet16h& from) {
+  return half(predux_min<PropagateNaN>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) {
+  return half(predux_max(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet16h& from) {
+  return half(predux_max<PropagateNumbers>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet16h& from) {
+  return half(predux_max<PropagateNaN>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16h& a) {
+  return predux_any<Packet8i>(a.m_val);
+}
+#endif
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16bf -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16bf& a) {
+  return predux_any<Packet8i>(a.m_val);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_REDUCTIONS_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index 903bca5..c763b5f 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -206,7 +206,7 @@ EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_
 /**
  * GEMM like operation for trsm panel updates.
  * Computes: C -= A*B
- * K must be multipe of 4.
+ * K must be multiple of 4.
  *
  * Unrolls used are {1,2,4,8}x{U1,U2,U3};
  * For good performance we want K to be large with M/N relatively small, but also large enough
diff --git a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
index 4c6116c..3a5f68e 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
+++ b/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
@@ -28,7 +28,7 @@ EIGEN_ALWAYS_INLINE int64_t idA(int64_t i, int64_t j, int64_t LDA) {
  *     func(startI,startJ)                                startJ = (startC)%(endJ)
  *                                                        func(...)
  *
- * The 1-D loop can be unrolled recursively by using enable_if and defining an auxillary function
+ * The 1-D loop can be unrolled recursively by using enable_if and defining an auxiliary function
  * with a template parameter used as a counter.
  *
  * template <endI, endJ, counter>
@@ -124,7 +124,7 @@ EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet8d, 8> &kernel) {
 }
 
 /***
- * Unrolls for tranposed C stores
+ * Unrolls for transposed C stores
  */
 template <typename Scalar>
 class trans {
@@ -134,7 +134,7 @@ class trans {
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
-   * Auxillary Functions for:
+   * Auxiliary Functions for:
    *  - storeC
    ***********************************
    */
@@ -213,11 +213,12 @@ class trans {
   }
 
   /**
-   * Transposes LxunrollN row major block of matrices stored EIGEN_AVX_MAX_NUM_ACC zmm registers to
+   * Transposes LxunrollN row major block of matrices stored `EIGEN_AVX_MAX_NUM_ACC` zmm registers to
    * "unrollN"xL ymm registers to be stored col-major into C.
    *
    *  For 8x48, the 8x48 block (row-major) is stored in zmm as follows:
    *
+   *  ```
    *  row0: zmm0 zmm1 zmm2
    *  row1: zmm3 zmm4 zmm5
    *    .
@@ -231,10 +232,10 @@ class trans {
    *    .
    *    .
    *  row7: zmm14 zmm15
-   *
+   * ```
    *
    * In general we will have {1,2,3} groups of avx registers each of size
-   * EIGEN_AVX_MAX_NUM_ROW. packetIndexOffset is used to select which "block" of
+   * `EIGEN_AVX_MAX_NUM_ROW`. packetIndexOffset is used to select which "block" of
    * avx registers are being transposed.
    */
   template <int64_t unrollN, int64_t packetIndexOffset>
@@ -285,7 +286,7 @@ class transB {
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
-   * Auxillary Functions for:
+   * Auxiliary Functions for:
    *  - loadB
    *  - storeB
    *  - loadBBlock
@@ -588,7 +589,7 @@ class trsm {
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
-   * Auxillary Functions for:
+   * Auxiliary Functions for:
    *  - loadRHS
    *  - storeRHS
    *  - divRHSByDiag
@@ -867,7 +868,7 @@ class gemm {
   static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
 
   /***********************************
-   * Auxillary Functions for:
+   * Auxiliary Functions for:
    *  - setzero
    *  - updateC
    *  - storeC
@@ -1101,7 +1102,7 @@ class gemm {
       }
     }
 
-    // We have updated all accumlators, time to load next set of B's
+    // We have updated all accumulators, time to load next set of B's
     EIGEN_IF_CONSTEXPR((startN == endN - 1) && (startM == endM - 1)) {
       gemm::template loadB<endM, endN, startK, endK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
     }
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index ccdb563..fc55fd8 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -37,12 +37,15 @@ struct type_casting_traits<double, int> : vectorized_type_casting_traits<double,
 template <>
 struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
 
-#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
+
 template <>
 struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
 template <>
 struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
-#endif
 
 template <>
 struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
@@ -75,6 +78,39 @@ EIGEN_STRONG_INLINE Packet8d pcast<Packet8f, Packet8d>(const Packet8f& a) {
   return _mm512_cvtps_pd(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8l pcast<Packet8d, Packet8l>(const Packet8d& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
+  return _mm512_cvttpd_epi64(a);
+#else
+  constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
+                kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;
+
+  const __m512i cst_one = _mm512_set1_epi64(1);
+  const __m512i cst_total_bits = _mm512_set1_epi64(kTotalBits);
+  const __m512i cst_bias = _mm512_set1_epi64(kBias);
+
+  __m512i a_bits = _mm512_castpd_si512(a);
+  // shift left by 1 to clear the sign bit, and shift right by kMantissaBits + 1 to recover biased exponent
+  __m512i biased_e = _mm512_srli_epi64(_mm512_slli_epi64(a_bits, 1), kMantissaBits + 1);
+  __m512i e = _mm512_sub_epi64(biased_e, cst_bias);
+
+  // shift to the left by kExponentBits + 1 to clear the sign and exponent bits
+  __m512i shifted_mantissa = _mm512_slli_epi64(a_bits, kExponentBits + 1);
+  // shift to the right by kTotalBits - e to convert the significand to an integer
+  __m512i result_significand = _mm512_srlv_epi64(shifted_mantissa, _mm512_sub_epi64(cst_total_bits, e));
+
+  // add the implied bit
+  __m512i result_exponent = _mm512_sllv_epi64(cst_one, e);
+  // e < 0 is a large shift count when read as unsigned (2's complement), so both terms become zero; e == 0 yields 1
+  __m512i result = _mm512_add_epi64(result_significand, result_exponent);
+  // handle negative arguments
+  __mmask8 sign_mask = _mm512_cmplt_epi64_mask(a_bits, _mm512_setzero_si512());
+  result = _mm512_mask_sub_epi64(result, sign_mask, _mm512_setzero_si512(), result);
+  return result;
+#endif
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
   return _mm512_cvtepi32_ps(a);
@@ -90,6 +126,19 @@ EIGEN_STRONG_INLINE Packet8d pcast<Packet8i, Packet8d>(const Packet8i& a) {
   return _mm512_cvtepi32_pd(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8l, Packet8d>(const Packet8l& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
+  return _mm512_cvtepi64_pd(a);
+#else
+  EIGEN_ALIGN64 int64_t aux[8];
+  pstore(aux, a);
+  return _mm512_set_pd(static_cast<double>(aux[7]), static_cast<double>(aux[6]), static_cast<double>(aux[5]),
+                       static_cast<double>(aux[4]), static_cast<double>(aux[3]), static_cast<double>(aux[2]),
+                       static_cast<double>(aux[1]), static_cast<double>(aux[0]));
+#endif
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pcast<Packet8d, Packet16f>(const Packet8d& a, const Packet8d& b) {
   return cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b));
@@ -124,6 +173,16 @@ EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet16f>(const Packet16f&
   return _mm512_castps_pd(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet8l>(const Packet8l& a) {
+  return _mm512_castsi512_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l preinterpret<Packet8l, Packet8d>(const Packet8d& a) {
+  return _mm512_castpd_si512(a);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8d>(const Packet8d& a) {
   return _mm512_castpd_ps(a);
@@ -178,18 +237,12 @@ EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet16i>(const Packet16i&
   return _mm512_castsi512_si128(a);
 }
 
+#ifndef EIGEN_VECTORIZE_AVX512FP16
 template <>
 EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
   return _mm256_castsi256_si128(a);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
-  return _mm256_castsi256_si128(a);
-}
-
-#ifndef EIGEN_VECTORIZE_AVX512FP16
-
 template <>
 EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
   return half2float(a);
@@ -202,6 +255,11 @@ EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
 
 #endif
 
+template <>
+EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
+  return _mm256_castsi256_si128(a);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
   return Bf16ToF32(a);
@@ -212,68 +270,6 @@ EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a)
   return F32ToBf16(a);
 }
 
-#ifdef EIGEN_VECTORIZE_AVX512FP16
-
-template <>
-EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) {
-  return _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) {
-  return _mm256_castsi256_si128(preinterpret<Packet16h>(a));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) {
-  // Discard second-half of input.
-  Packet16h low = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
-  return _mm512_cvtxph_ps(_mm256_castsi256_ph(low));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
-  __m512d result = _mm512_undefined_pd();
-  result = _mm512_insertf64x4(
-      result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
-  result = _mm512_insertf64x4(
-      result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
-  return _mm512_castpd_ph(result);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) {
-  // Discard second-half of input.
-  Packet8h low = _mm_castps_si128(_mm256_extractf32x4_ps(_mm256_castsi256_ps(a), 0));
-  return _mm256_cvtxph_ps(_mm_castsi128_ph(low));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
-  __m256d result = _mm256_undefined_pd();
-  result = _mm256_insertf64x2(result,
-                              _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
-  result = _mm256_insertf64x2(result,
-                              _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
-  return _mm256_castpd_si256(result);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) {
-  Packet8f full = _mm256_cvtxph_ps(_mm_castsi128_ph(a));
-  // Discard second-half of input.
-  return _mm256_extractf32x4_ps(full, 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
-  __m256 result = _mm256_undefined_ps();
-  result = _mm256_insertf128_ps(result, a, 0);
-  result = _mm256_insertf128_ps(result, b, 1);
-  return _mm256_cvtps_ph(result, _MM_FROUND_TO_NEAREST_INT);
-}
-
-#endif
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
new file mode 100644
index 0000000..f06f13d
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_FP16_AVX512_H
+#define EIGEN_TYPE_CASTING_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE Packet32s preinterpret<Packet32s, Packet32h>(const Packet32h& a) {
+  return _mm512_castph_si512(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s preinterpret<Packet16s, Packet16h>(const Packet16h& a) {
+  return _mm256_castph_si256(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8h>(const Packet8h& a) {
+  return _mm_castph_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preinterpret<Packet32h, Packet32s>(const Packet32s& a) {
+  return _mm512_castsi512_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet16s>(const Packet16s& a) {
+  return _mm256_castsi256_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet8s>(const Packet8s& a) {
+  return _mm_castsi128_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+  return half2float(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+  return float2half(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) {
+  // Discard second-half of input.
+  Packet16h low = _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
+  return _mm512_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) {
+  // Discard second-half of input.
+  Packet8h low = _mm_castps_ph(_mm256_extractf32x4_ps(_mm256_castph_ps(a), 0));
+  return _mm256_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) {
+  Packet8f full = _mm256_cvtxph_ps(a);
+  // Discard second-half of input.
+  return _mm256_extractf32x4_ps(full, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
+  __m512 result = _mm512_castsi512_ps(_mm512_castsi256_si512(_mm256_castph_si256(_mm512_cvtxps_ph(a))));
+  result = _mm512_insertf32x8(result, _mm256_castph_ps(_mm512_cvtxps_ph(b)), 1);
+  return _mm512_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
+  __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castph_si128(_mm256_cvtxps_ph(a))));
+  result = _mm256_insertf32x4(result, _mm_castph_ps(_mm256_cvtxps_ph(b)), 1);
+  return _mm256_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
+  __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castps_si128(a)));
+  result = _mm256_insertf128_ps(result, b, 1);
+  return _mm256_cvtxps_ph(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pcast<Packet32h, Packet32s>(const Packet32h& a) {
+  return _mm512_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s pcast<Packet16h, Packet16s>(const Packet16h& a) {
+  return _mm256_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8h, Packet8s>(const Packet8h& a) {
+  return _mm_cvtph_epi16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet32s, Packet32h>(const Packet32s& a) {
+  return _mm512_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16s, Packet16h>(const Packet16s& a) {
+  return _mm256_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8s, Packet8h>(const Packet8s& a) {
+  return _mm_cvtepi16_ph(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_FP16_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 0252efa..d6df59a 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -18,19 +18,28 @@ namespace Eigen {
 
 namespace internal {
 
-static Packet4ui p4ui_CONJ_XOR =
-    vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);  //{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+inline Packet4ui p4ui_CONJ_XOR() {
+  return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);  //{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+}
 #ifdef EIGEN_VECTORIZE_VSX
 #if defined(_BIG_ENDIAN)
-static Packet2ul p2ul_CONJ_XOR1 =
-    (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 =
-    (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
+inline Packet2ul p2ul_CONJ_XOR1() {
+  return (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+inline Packet2ul p2ul_CONJ_XOR2() {
+  return (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO,
+                            8);  //{ 0x0000000000000000, 0x8000000000000000 };
+}
 #else
-static Packet2ul p2ul_CONJ_XOR1 =
-    (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 =
-    (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
+inline Packet2ul p2ul_CONJ_XOR1() {
+  return (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+inline Packet2ul p2ul_CONJ_XOR2() {
+  return (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO,
+                            8);  //{ 0x0000000000000000, 0x8000000000000000 };
+}
 #endif
 #endif
 
@@ -50,7 +59,7 @@ struct Packet2cf {
     v1 = vec_madd(v1, b.v, p4f_ZERO);
     // multiply a_im * b and get the conjugate result
     v2 = vec_madd(v2, b.v, p4f_ZERO);
-    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
     // permute back to a proper order
     v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
 
@@ -266,7 +275,7 @@ EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
-  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
 }
 
 template <>
@@ -399,7 +408,7 @@ struct Packet1cd {
     // multiply a_im * b and get the conjugate result
     v2 = vec_madd(a_im, b.v, p2d_ZERO);
     v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
-    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1()));
 
     return Packet1cd(padd<Packet2d>(v1, v2));
   }
@@ -543,7 +552,7 @@ EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
-  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2)));
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2())));
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index 94306da..4725526 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -1590,7 +1590,7 @@ EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet, N>& aReal, PacketBlock<Pack
   pger_common<Packet, false, N>(&cImag, bImag, aReal.packet);
 }
 
-// Load a PacketBlock, the N parameters make tunning gemm easier so we can add more accumulators as needed.
+// Load a PacketBlock, the N parameters make tuning gemm easier so we can add more accumulators as needed.
 //
 // full = operate (load) on the entire PacketBlock or only half
 template <typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
@@ -3155,7 +3155,7 @@ void gemmbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16*
 
 #undef MAX_BFLOAT16_ACC_VSX
 
-#include "MatrixVectorProduct.h"
+#include "MatrixVectorProduct.inc"
 
 /************************************
  * ppc64le template specializations *
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
similarity index 100%
rename from Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
rename to Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index a4b134c..eefe326 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -94,9 +94,7 @@ static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1
 
 static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
 static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
-#ifndef _ARCH_PWR9
 static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
-#endif
 
 #ifdef _BIG_ENDIAN
 static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
@@ -187,7 +185,10 @@ struct packet_traits<float> : default_packet_traits {
     HasLog = 1,
     HasExp = 1,
 #ifdef EIGEN_VECTORIZE_VSX
+    HasCmp = 1,
+    HasPow = 1,
     HasSqrt = 1,
+    HasCbrt = 1,
 #if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
 #else
@@ -195,17 +196,13 @@ struct packet_traits<float> : default_packet_traits {
 #endif
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasRint = 1,
+    HasErfc = EIGEN_FAST_MATH,
 #else
     HasSqrt = 0,
     HasRsqrt = 0,
     HasTanh = 0,
     HasErf = 0,
-    HasRint = 0,
 #endif
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
     HasNegate = 1,
     HasBlend = 1
   };
@@ -237,17 +234,12 @@ struct packet_traits<bfloat16> : default_packet_traits {
 #else
     HasRsqrt = 0,
 #endif
-    HasRint = 1,
 #else
     HasSqrt = 0,
     HasRsqrt = 0,
-    HasRint = 0,
 #endif
     HasTanh = 0,
     HasErf = 0,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
     HasNegate = 1,
     HasBlend = 1
   };
@@ -435,55 +427,6 @@ struct unpacket_traits<Packet8bf> {
     masked_store_available = false
   };
 };
-inline std::ostream& operator<<(std::ostream& s, const Packet16c& v) {
-  union {
-    Packet16c v;
-    signed char n[16];
-  } vt;
-  vt.v = v;
-  for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet16uc& v) {
-  union {
-    Packet16uc v;
-    unsigned char n[16];
-  } vt;
-  vt.v = v;
-  for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
-  union {
-    Packet4f v;
-    float n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
-  union {
-    Packet4i v;
-    int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}
-
-inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
-  union {
-    Packet4ui v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}
 
 template <typename Packet>
 EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
@@ -1508,6 +1451,10 @@ template <>
 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
   return vec_floor(a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return vec_trunc(a);
+}
 #ifdef EIGEN_VECTORIZE_VSX
 template <>
 EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
@@ -1521,16 +1468,13 @@ EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
 
 template <typename Packet>
 EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
-  EIGEN_DEBUG_ALIGNED_LOAD
-#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
   EIGEN_DEBUG_UNALIGNED_LOAD
+#if defined(EIGEN_VECTORIZE_VSX)
   return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
 #else
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char*)from);   // most significant quadword
-  LSQ = vec_ld(15, (unsigned char*)from);  // least significant quadword
-  mask = vec_lvsl(0, from);                // create the permute mask
+  Packet16uc MSQ = vec_ld(0, (unsigned char*)from);   // most significant quadword
+  Packet16uc LSQ = vec_ld(15, (unsigned char*)from);  // least significant quadword
+  Packet16uc mask = vec_lvsl(0, from);                // create the permute mask
   // TODO: Add static_cast here
   return (Packet)vec_perm(MSQ, LSQ, mask);  // align the data
 #endif
@@ -1740,7 +1684,7 @@ EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from)
 template <typename Packet>
 EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
   EIGEN_DEBUG_UNALIGNED_STORE
-#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
+#if defined(EIGEN_VECTORIZE_VSX)
   vec_xst(from, 0, to);
 #else
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
@@ -1928,19 +1872,11 @@ EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
-#ifdef _ARCH_PWR9
-  return vec_revb(a);
-#else
   return vec_perm(a, a, p16uc_REVERSE8);
-#endif
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
-#ifdef _ARCH_PWR9
-  return vec_revb(a);
-#else
   return vec_perm(a, a, p16uc_REVERSE8);
-#endif
 }
 template <>
 EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
@@ -2084,7 +2020,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
   input = padd<Packet4ui>(input, rounding_bias);
 
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
-#ifdef _ARCH_PWR9
+#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
   Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
   input = vec_sel(input, p4ui_nan, nan_selector);
 
@@ -2131,7 +2067,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
 /**
  * Pack the high portion of two float Packets into one bfloat16 Packet
  *
- * @param lohi to expect either a low & high OR odd & even order
+ * @tparam lohi to expect either a low & high OR odd & even order
  */
 template <bool lohi>
 EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
@@ -2178,7 +2114,7 @@ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
 /**
  * Convert and pack two float Packets into one bfloat16 Packet
  *
- * @param lohi to expect either a low & high OR odd & even order
+ * @tparam lohi to expect either a low & high OR odd & even order
  */
 template <bool lohi = true>
 EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
@@ -2193,7 +2129,7 @@ EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
   Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
   Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
 
-#ifdef _ARCH_PWR9
+#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
   Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
   Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
   Packet8us nan_selector =
@@ -2325,6 +2261,11 @@ EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
   return pldexp_generic(a, exponent);
@@ -2374,6 +2315,10 @@ template <>
 EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
+}
 #ifdef EIGEN_VECTORIZE_VSX
 template <>
 EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
@@ -2393,6 +2338,44 @@ EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, cons
   return F32ToBf16(pmadd_even, pmadd_odd);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
@@ -2449,13 +2432,11 @@ EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
 
 template <>
 EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
-  Packet4i sum;
-  sum = vec_sums(a, p4i_ZERO);
-#ifdef _BIG_ENDIAN
-  sum = vec_sld(sum, p4i_ZERO, 12);
-#else
-  sum = vec_sld(p4i_ZERO, sum, 4);
-#endif
+  Packet4i b, sum;
+  b = vec_sld(a, a, 8);
+  sum = a + b;
+  b = vec_sld(sum, sum, 4);
+  sum += b;
   return pfirst(sum);
 }
 
@@ -3188,21 +3169,23 @@ struct packet_traits<double> : default_packet_traits {
     HasMin = 1,
     HasMax = 1,
     HasAbs = 1,
-    HasSin = 0,
-    HasCos = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasATanh = 1,
     HasATan = 0,
     HasLog = 0,
+    HasCmp = 1,
     HasExp = 1,
     HasSqrt = 1,
+    HasCbrt = 1,
 #if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
 #else
     HasRsqrt = 0,
 #endif
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1,
     HasNegate = 1,
     HasBlend = 1
   };
@@ -3211,6 +3194,7 @@ struct packet_traits<double> : default_packet_traits {
 template <>
 struct unpacket_traits<Packet2d> {
   typedef double type;
+  typedef Packet2l integer_packet;
   enum {
     size = 2,
     alignment = Aligned16,
@@ -3220,6 +3204,18 @@ struct unpacket_traits<Packet2d> {
   };
   typedef Packet2d half;
 };
+template <>
+struct unpacket_traits<Packet2l> {
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = false,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
 
 inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
   union {
@@ -3269,6 +3265,11 @@ EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
   Packet2d v = {from, from};
   return v;
 }
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+  Packet2l v = {from, from};
+  return v;
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
@@ -3390,6 +3391,18 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
   return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
 }
 template <>
+#ifdef __POWER8_VECTOR__
+EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
+  return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
+}
+#else
+EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
+  Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
+  Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
+  return reinterpret_cast<Packet2l>(pand(halves, flipped));
+}
+#endif
+template <>
 EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
   Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
   return vec_nor(c, c);
@@ -3434,6 +3447,10 @@ EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
   return vec_floor(a);
 }
 template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return vec_trunc(a);
+}
+template <>
 EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
   Packet2d res;
 
diff --git a/Eigen/src/Core/arch/AltiVec/TypeCasting.h b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
index fdabeb9..439339e 100644
--- a/Eigen/src/Core/arch/AltiVec/TypeCasting.h
+++ b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
@@ -129,30 +129,20 @@ EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a)
 }
 
 #ifdef EIGEN_VECTORIZE_VSX
-// VSX support varies between different compilers and even different
-// versions of the same compiler.  For gcc version >= 4.9.3, we can use
-// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
-// a slow version that works with older compilers.
-// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
-// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
 template <>
 inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
-#if EIGEN_GNUC_STRICT_AT_LEAST(7, 1, 0)
-  return vec_cts(x, 0);  // TODO: check clang version.
-#else
-  double tmp[2];
-  memcpy(tmp, &x, sizeof(tmp));
-  Packet2l l = {static_cast<long long>(tmp[0]), static_cast<long long>(tmp[1])};
-  return l;
-#endif
+  EIGEN_ALIGN_MAX double dtmp[2];
+  pstore(dtmp, x);
+  EIGEN_ALIGN_MAX long long itmp[2] = {static_cast<long long>(dtmp[0]), static_cast<long long>(dtmp[1])};
+  return vec_xl(0, itmp);
 }
 
 template <>
 inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
-  unsigned long long tmp[2];
-  memcpy(tmp, &x, sizeof(tmp));
-  Packet2d d = {static_cast<double>(tmp[0]), static_cast<double>(tmp[1])};
-  return d;
+  EIGEN_ALIGN_MAX long long itmp[2];
+  vec_xst(x, 0, itmp);
+  EIGEN_ALIGN_MAX double dtmp[2] = {static_cast<double>(itmp[0]), static_cast<double>(itmp[1])};
+  return pload<Packet2d>(dtmp);
 }
 #endif
 
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index be44909..b93c4bc 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -139,8 +139,11 @@ struct numeric_limits_bfloat16_impl {
   static EIGEN_CONSTEXPR const bool has_infinity = true;
   static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
   static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
+  EIGEN_DIAGNOSTICS(push)
+  EIGEN_DISABLE_DEPRECATED_WARNING
   static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present;
   static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
+  EIGEN_DIAGNOSTICS(pop)
   static EIGEN_CONSTEXPR const std::float_round_style round_style = std::numeric_limits<float>::round_style;
   static EIGEN_CONSTEXPR const bool is_iec559 = true;
   // The C++ standard defines this as "true if the set of values representable
@@ -187,10 +190,13 @@ template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_quiet_NaN;
 template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_signaling_NaN;
+EIGEN_DIAGNOSTICS(push)
+EIGEN_DISABLE_DEPRECATED_WARNING
 template <typename T>
 EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_bfloat16_impl<T>::has_denorm;
 template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_denorm_loss;
+EIGEN_DIAGNOSTICS(pop)
 template <typename T>
 EIGEN_CONSTEXPR const std::float_round_style numeric_limits_bfloat16_impl<T>::round_style;
 template <typename T>
@@ -607,6 +613,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
   return numext::bit_cast<bfloat16>(x);
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { return bfloat16(::expf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp2(const bfloat16& a) { return bfloat16(::exp2f(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { return bfloat16(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) { return bfloat16(::logf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) { return bfloat16(numext::log1p(float(a))); }
@@ -637,6 +644,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { return
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) { return bfloat16(::ceilf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) { return bfloat16(::rintf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) { return bfloat16(::roundf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 trunc(const bfloat16& a) { return bfloat16(::truncf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::fmodf(float(a), float(b)));
 }
@@ -665,6 +673,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfl
   return bfloat16(::fmaxf(f1, f2));
 }
 
+EIGEN_DEVICE_FUNC inline bfloat16 fma(const bfloat16& a, const bfloat16& b, const bfloat16& c) {
+  // Emulate FMA via float.
+  return bfloat16(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
+}
+
 #ifndef EIGEN_NO_IO
 EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) {
   os << static_cast<float>(v);
@@ -755,6 +768,37 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat1
   return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
 }
 
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 nextafter(const bfloat16& from, const bfloat16& to) {
+  if (numext::isnan EIGEN_NOT_A_MACRO(from)) {
+    return from;
+  }
+  if (numext::isnan EIGEN_NOT_A_MACRO(to)) {
+    return to;
+  }
+  if (from == to) {
+    return to;
+  }
+  uint16_t from_bits = numext::bit_cast<uint16_t>(from);
+  bool from_sign = from_bits >> 15;
+  // Whether we are adjusting toward the infinity with the same sign as from.
+  bool toward_inf = (to > from) == !from_sign;
+  if (toward_inf) {
+    ++from_bits;
+  } else if ((from_bits & 0x7fff) == 0) {
+    // Adjusting away from inf, but from is zero, so just toggle the sign.
+    from_bits ^= 0x8000;
+  } else {
+    --from_bits;
+  }
+  return numext::bit_cast<bfloat16>(from_bits);
+}
+
+// Specialize multiply-add to match packet operations and reduce conversions to/from float.
+template<>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 madd<Eigen::bfloat16>(const Eigen::bfloat16& x, const Eigen::bfloat16& y, const Eigen::bfloat16& z) {
+  return Eigen::bfloat16(static_cast<float>(x) * static_cast<float>(y) + static_cast<float>(z));
+}
+
 }  // namespace numext
 }  // namespace Eigen
 
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 78dbf20..a46a8ef 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -42,6 +42,134 @@ struct make_integer<bfloat16> {
   typedef numext::int16_t type;
 };
 
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Packet, int N>
+struct ppolevl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+  }
+};
+
+template <typename Packet>
+struct ppolevl<Packet, 0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_UNUSED_VARIABLE(x);
+    return pset1<Packet>(coeff[0]);
+  }
+};
+
+/* chbevl (modified for Eigen)
+ *
+ *     Evaluate Chebyshev series
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N], chebevl();
+ *
+ * y = chbevl( x, coef, N );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates the series
+ *
+ *        N-1
+ *         - '
+ *  y  =   >   coef[i] T (x/2)
+ *         -            i
+ *        i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero
+ * order term is last in the array.  Note N is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine.  This maps x from (a, b) to (-1, 1),
+ * over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
+ * this becomes x -> 4a/x - 1.
+ *
+ *
+ *
+ * SPEED:
+ *
+ * Taking advantage of the recurrence properties of the
+ * Chebyshev polynomials, the routine requires one more
+ * addition per loop than evaluating a nested polynomial of
+ * the same degree.
+ *
+ */
+
+template <typename Packet, int N>
+struct pchebevl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
+                                                          const typename unpacket_traits<Packet>::type coef[]) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    Packet b0 = pset1<Packet>(coef[0]);
+    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
+    Packet b2;
+
+    for (int i = 1; i < N; i++) {
+      b2 = b1;
+      b1 = b0;
+      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
+    }
+
+    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
+  }
+};
+
 template <typename Packet>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
   typedef typename unpacket_traits<Packet>::type Scalar;
@@ -59,7 +187,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Pac
   static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
                        ExponentBits = TotalBits - MantissaBits - 1;
 
-  EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
+  constexpr ScalarUI scalar_sign_mantissa_mask =
       ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits);  // ~0x7f800000
   const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
   const Packet half = pset1<Packet>(Scalar(0.5));
@@ -68,7 +196,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Pac
 
   // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
   const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
-  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
+  constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
   // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
   const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset));  // 2^24
   const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
@@ -130,7 +258,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, con
   PacketI b = parithmetic_shift_right<2>(e);                                          // floor(e/4);
   Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));  // 2^b
   Packet out = pmul(pmul(pmul(a, c), c), c);                                          // a * 2^(3b)
-  b = psub(psub(psub(e, b), b), b);                                                   // e - 3b
+  b = pnmadd(pset1<PacketI>(3), b, e);                                                // e - 3b
   c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));         // 2^(e-3*b)
   out = pmul(out, c);
   return out;
@@ -146,22 +274,156 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, con
 //
 // Assumes IEEE floating point format
 template <typename Packet>
-struct pldexp_fast_impl {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename unpacket_traits<PacketI>::type ScalarI;
   static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
                        ExponentBits = TotalBits - MantissaBits - 1;
 
-  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet run(const Packet& a, const Packet& exponent) {
-    const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
-    const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
-    // restrict biased exponent between 0 and 255 for float.
-    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
-    // return a * (2^e)
-    return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
-  }
-};
+  const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
+  const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
+  // restrict biased exponent between 0 and 255 for float.
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
+  // return a * (2^e)
+  return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
+}
+
+// This function implements a single step of Halley's iteration for
+// computing x = y^(1/3):
+//   x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
+                                                                                      const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
+  Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
+  Packet num = psub(x_k_cb, y);
+  Packet r = pdiv(num, denom);
+  return pnmadd(x_k, r, x_k);
+}
+
+// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+// interval [0.125,1].
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Extract the significand s in the range [0.5,1) and exponent e, such that
+  // x = 2^e * s.
+  Packet e, s;
+  s = pfrexp(x, e);
+
+  // Split the exponent into a part divisible by 3 and the remainder.
+  // e = 3*e_div3 + e_mod3.
+  constexpr Scalar kOneThird = Scalar(1) / 3;
+  e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
+  Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
+
+  // Replace s by y = (s * 2^e_mod3).
+  return pldexp_fast(s, e_mod3);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
+                                                                                       const Packet& abs_root) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  // Set sign.
+  const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
+  const Packet x_sign = pand(sign_mask, x);
+  Packet root = por(x_sign, abs_root);
+
+  // Pass non-finite and zero values of x straight through.
+  const Packet is_not_finite = por(pisinf(x), pisnan(x));
+  const Packet is_zero = pcmp_eq(pzero(x), x);
+  const Packet use_x = por(is_not_finite, is_zero);
+  return pselect(use_x, x, root);
+}
+
+// Generic implementation of cbrt(x) for float.
+//
+// The algorithm computes the cubic root of the input by first
+// decomposing it into an exponent and significand
+//   x = s * 2^e.
+//
+// We can then write the cube root as
+//
+//   x^(1/3) = 2^(e/3) * s^(1/3)
+//           = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
+//           = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
+//           = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
+//
+// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
+//
+// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
+// approximated using a cubic polynomial and subsequently refined using a
+// single step of Halley's iteration, and finally the two terms are combined
+// using pldexp_fast.
+//
+// Note: Many alternatives exist for implementing cbrt. See, for example,
+// the excellent discussion in Kahan's note:
+//   https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
+// This particular implementation was found to be very fast and accurate
+// among several alternatives tried, but is probably not "optimal" on all
+// platforms.
+//
+// This is accurate to 2 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 5.22e-3.
+  // The polynomial was computed using Rminimax.
+  constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
+                             3.408401906490325927734375e-01f};
+  Packet r = ppolevl<Packet, 3>::run(y, alpha);
+
+  // Take one step of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3)
+  r = pldexp_fast(r, e_div3);
+
+  return cbrt_special_cases_and_sign(x, r);
+}
+
+// Generic implementation of cbrt(x) for double.
+//
+// The algorithm is identical to the one for float except that a different initial
+// approximation is used for y^(1/3) and two Halley iteration steps are performed.
+//
+// This is accurate to 1 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 0.016.
+  // The polynomial was computed using Rminimax.
+  constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
+                              1.072314636518546304699839311069808900356292724609375e+00,
+                              3.81249427609571867048288140722434036433696746826171875e-01};
+  Packet r = ppolevl<Packet, 2>::run(y, alpha);
+
+  // Take two steps of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3).
+  r = pldexp_fast(r, e_div3);
+  return cbrt_special_cases_and_sign(x, r);
+}
 
 // Natural or base 2 logarithm.
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
@@ -193,21 +455,14 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const
   e = psub(e, pand(cst_1, mask));
   x = padd(x, tmp);
 
-  // Polynomial coefficients for rational (3,3) r(x) = p(x)/q(x)
+  // Polynomial coefficients for rational r(x) = p(x)/q(x)
   // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
-  const Packet cst_p1 = pset1<Packet>(1.0000000190281136f);
-  const Packet cst_p2 = pset1<Packet>(1.0000000190281063f);
-  const Packet cst_p3 = pset1<Packet>(0.18256296349849254f);
-  const Packet cst_q1 = pset1<Packet>(1.4999999999999927f);
-  const Packet cst_q2 = pset1<Packet>(0.59923249590823520f);
-  const Packet cst_q3 = pset1<Packet>(0.049616247954120038f);
-
-  Packet p = pmadd(x, cst_p3, cst_p2);
-  p = pmadd(x, p, cst_p1);
+  constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
+  constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
+
+  Packet p = ppolevl<Packet, 2>::run(x, alpha);
   p = pmul(x, p);
-  Packet q = pmadd(x, cst_q3, cst_q2);
-  q = pmadd(x, q, cst_q1);
-  q = pmadd(x, q, cst_1);
+  Packet q = ppolevl<Packet, 3>::run(x, beta);
   x = pdiv(p, q);
 
   // Add the logarithm of the exponent back to the result of the interpolation.
@@ -348,7 +603,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Pa
     See: http://www.plunk.org/~hatch/rightway.php
  */
 template <typename Packet>
-Packet generic_plog1p(const Packet& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x) {
   typedef typename unpacket_traits<Packet>::type ScalarType;
   const Packet one = pset1<Packet>(ScalarType(1));
   Packet xp1 = padd(x, one);
@@ -363,7 +618,7 @@ Packet generic_plog1p(const Packet& x) {
     See: http://www.plunk.org/~hatch/rightway.php
  */
 template <typename Packet>
-Packet generic_expm1(const Packet& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x) {
   typedef typename unpacket_traits<Packet>::type ScalarType;
   const Packet one = pset1<Packet>(ScalarType(1));
   const Packet neg_one = pset1<Packet>(ScalarType(-1));
@@ -393,6 +648,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack
   const Packet cst_half = pset1<Packet>(0.5f);
   const Packet cst_exp_hi = pset1<Packet>(88.723f);
   const Packet cst_exp_lo = pset1<Packet>(-104.f);
+  const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
 
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
   const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
@@ -428,7 +684,12 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack
   y = pmadd(r2, y, p_low);
 
   // Return 2^m * exp(r).
-  // TODO: replace pldexp with faster implementation since y in [-1, 1).
+  const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
+  if (!predux_any(fast_pldexp_unsafe)) {
+    // For |x| <= 87, we know the result is not zero or inf, and we can safely use
+    // the fast version of pldexp.
+    return pmax(pldexp_fast(y, m), _x);
+  }
   return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
 }
 
@@ -441,8 +702,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
   const Packet cst_half = pset1<Packet>(0.5);
 
   const Packet cst_exp_hi = pset1<Packet>(709.784);
-  const Packet cst_exp_lo = pset1<Packet>(-709.784);
-
+  const Packet cst_exp_lo = pset1<Packet>(-745.519);
+  const Packet cst_pldexp_threshold = pset1<Packet>(708.0);
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
   const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
   const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
@@ -495,7 +756,12 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
 
   // Construct the result 2^n * exp(g) = e * x. The max is used to catch
   // non-finite values in the input.
-  // TODO: replace pldexp with faster implementation since x in [-1, 1).
+  const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x));
+  if (!predux_any(fast_pldexp_unsafe)) {
+    // For |x| <= 708, we know the result is not zero or inf, and we can safely use
+    // the fast version of pldexp.
+    return pmax(pldexp_fast(x, fx), _x);
+  }
   return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
 }
 
@@ -696,6 +962,174 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Pack
   return psincos_float<false>(x);
 }
 
+// Trigonometric argument reduction for double for inputs smaller than 15.
+// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
+// count q, the function computes and returns the reduced argument t such that x = q * pi/2 + t.
+template <typename Packet>
+Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
+  // -Pi/2 split into 2 values (a + b ~= -pi/2), so the two pmadds below subtract q * pi/2 from x
+  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
+  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
+
+  Packet t;
+  t = pmadd(cst_pio2_a, q, x);  // leading part first
+  t = pmadd(cst_pio2_b, q, t);  // then the small correction term
+  return t;
+}
+
+// Trigonometric argument reduction for double for inputs smaller than 1e14.
+// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its quadrant count split
+// as n = q_high + q_low, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
+template <typename Packet>
+Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
+  // -Pi/2 split into 4 values (a + b + c + d ~= -pi/2), so the pmadds below subtract n * pi/2 from x
+  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
+  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
+  const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
+  const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
+
+  Packet t;
+  t = pmadd(cst_pio2_a, q_high, x);  // terms are accumulated from the largest constant to the smallest,
+  t = pmadd(cst_pio2_a, q_low, t);   // and for each constant q_high is applied before q_low
+  t = pmadd(cst_pio2_b, q_high, t);
+  t = pmadd(cst_pio2_b, q_low, t);
+  t = pmadd(cst_pio2_c, q_high, t);
+  t = pmadd(cst_pio2_c, q_low, t);
+  t = pmadd(cst_pio2_d, padd(q_low, q_high), t);  // the last (tiniest) term uses the recombined count
+  return t;
+}
+
+template <bool ComputeSine, typename Packet, bool ComputeBoth = false>  // sin or cos; ComputeBoth mixes both per lane
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+#if EIGEN_COMP_GNUC_STRICT
+    __attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+    Packet
+    psincos_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+
+  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u));
+
+  // If the argument is smaller than this value, use a simpler argument reduction
+  const double small_th = 15;
+  // If the argument is bigger than this value, use the non-vectorized std version
+  const double huge_th = 1e14;
+
+  const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006);  // 2/PI
+  // Integer Packet constants
+  const PacketI cst_one = pset1<PacketI>(ScalarI(1));
+  // Constant for splitting
+  const Packet cst_split = pset1<Packet>(1 << 24);
+
+  Packet x_abs = pabs(x);
+
+  // Scale |x| by 2/Pi to get the quadrant count q_int, then reduce to s = |x| - q * Pi/2
+  PacketI q_int;
+  Packet s;
+
+  // TODO Implement huge angle argument reduction
+  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
+    Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
+    Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
+    q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
+    Packet q_low = pcast<PacketI, Packet>(q_int);
+    s = trig_reduce_medium_double(x_abs, q_high, q_low);
+  } else {
+    Packet qval_noround = pmul(x_abs, cst_2oPI);
+    q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
+    Packet q = pcast<PacketI, Packet>(q_int);
+    s = trig_reduce_small_double(x_abs, q);
+  }
+
+  // All the upcoming approximating polynomials have even exponents
+  Packet ss = pmul(s, s);
+
+  // Padé approximant of cos(x)
+  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
+  // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
+  // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
+  // MATLAB code to compute those coefficients:
+  //    syms x;
+  //    cosf = @(x) cos(x);
+  //    pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
+  Packet sc1_num = pmadd(ss, pset1<Packet>(80737373), pset1<Packet>(-13853547000));
+  Packet sc2_num = pmadd(sc1_num, ss, pset1<Packet>(727718024880));
+  Packet sc3_num = pmadd(sc2_num, ss, pset1<Packet>(-11275015752000));
+  Packet sc4_num = pmadd(sc3_num, ss, pset1<Packet>(23594700729600));
+  Packet sc1_denum = pmadd(ss, pset1<Packet>(147173), pset1<Packet>(39328920));
+  Packet sc2_denum = pmadd(sc1_denum, ss, pset1<Packet>(5772800880));
+  Packet sc3_denum = pmadd(sc2_denum, ss, pset1<Packet>(522334612800));
+  Packet sc4_denum = pmadd(sc3_denum, ss, pset1<Packet>(23594700729600));
+  Packet scos = pdiv(sc4_num, sc4_denum);
+
+  // Padé approximant of sin(x)
+  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
+  // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
+  // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
+  // MATLAB code to compute those coefficients:
+  //    syms x;
+  //    sinf = @(x) sin(x);
+  //    pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
+  Packet ss1_num = pmadd(ss, pset1<Packet>(4585922449), pset1<Packet>(-1066023933480));
+  Packet ss2_num = pmadd(ss1_num, ss, pset1<Packet>(83284044283440));
+  Packet ss3_num = pmadd(ss2_num, ss, pset1<Packet>(-2303682236856000));
+  Packet ss4_num = pmadd(ss3_num, ss, pset1<Packet>(15605159573203200));
+  Packet ss1_denum = pmadd(ss, pset1<Packet>(1029037), pset1<Packet>(345207016));
+  Packet ss2_denum = pmadd(ss1_denum, ss, pset1<Packet>(61570292784));
+  Packet ss3_denum = pmadd(ss2_denum, ss, pset1<Packet>(6603948711360));
+  Packet ss4_denum = pmadd(ss3_denum, ss, pset1<Packet>(346781323848960));
+  Packet ssin = pdiv(pmul(s, ss4_num), pmul(pset1<Packet>(45), ss4_denum));
+
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));  // set where q_int is even
+
+  Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
+  Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
+  Packet sign_bit, sFinalRes;
+  if (ComputeBoth) {
+    Packet peven = peven_mask(x);
+    sign_bit = pselect((s), sign_sin, sign_cos);  // NOTE(review): `s` is the reduced argument, not a lane mask; `peven` looks intended -- verify
+    sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
+  } else {
+    sign_bit = ComputeSine ? sign_sin : sign_cos;
+    sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
+  }
+  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
+  sFinalRes = pxor(sFinalRes, sign_bit);
+
+  // If any input value is larger than what the argument reduction can currently handle, recompute those lanes
+  // using std::sin and std::cos
+  // TODO Remove it when huge angle argument reduction is implemented
+  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize];
+    pstoreu(x_cpy, x);
+    pstoreu(sincos_vals, sFinalRes);
+    for (int k = 0; k < PacketSize; ++k) {
+      double val = x_cpy[k];
+      if (std::abs(val) > huge_th && (numext::isfinite)(val)) {  // non-finite lanes keep the vectorized result
+        if (ComputeBoth)
+          sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);  // sin in even lanes, cos in odd lanes
+        else
+          sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
+      }
+    }
+    sFinalRes = ploadu<Packet>(sincos_vals);
+  }
+  return sFinalRes;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
+  return psincos_double<true>(x);  // ComputeSine = true: sine of every lane
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
+  return psincos_double<false>(x);  // ComputeSine = false: cosine of every lane
+}
+
 // Generic implementation of acos(x).
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
@@ -751,13 +1185,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Pac
   const Packet cst_one = pset1<Packet>(1.0f);
   const Packet cst_two = pset1<Packet>(2.0f);
   const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
-  // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
-  // even terms only.
-  const Packet p9 = pset1<Packet>(5.08838854730129241943359375e-2f);
-  const Packet p7 = pset1<Packet>(3.95139865577220916748046875e-2f);
-  const Packet p5 = pset1<Packet>(7.550220191478729248046875e-2f);
-  const Packet p3 = pset1<Packet>(0.16664917767047882080078125f);
-  const Packet p1 = pset1<Packet>(1.00000011920928955078125f);
 
   const Packet abs_x = pabs(x_in);
   const Packet sign_mask = pandnot(x_in, abs_x);
@@ -773,13 +1200,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Pac
   const Packet x = pselect(large_mask, x_large, abs_x);
   const Packet x2 = pmul(x, x);
 
-  // Compute polynomial.
-  // x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9))))
-
-  Packet p = pmadd(p9, x2, p7);
-  p = pmadd(p, x2, p5);
-  p = pmadd(p, x2, p3);
-  p = pmadd(p, x2, p1);
+  // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
+  // even terms only.
+  constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
+                             7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
   p = pmul(p, x);
 
   const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
@@ -790,228 +1215,239 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Pac
   return por(invalid_mask, p);
 }
 
+template <typename Scalar>
+struct patan_reduced {  // run() is only declared here; the definition for double follows as a specialization
+  template <typename Packet>
+  static EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet run(const Packet& x);
+};
+
+template <>
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
+  constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
+                              3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
+                              3.3004361289279920e-01};  // numerator coefficients (degree 6 in x^2)
+
+  constexpr double beta[] = {
+      2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
+      9.3705509168587852e-01, 3.3004361289279920e-01};  // denominator coefficients (degree 6 in x^2)
+
+  Packet x2 = pmul(x, x);  // both polynomials are evaluated in x^2
+  Packet p = ppolevl<Packet, 6>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 6>::run(x2, beta);
+  return pmul(x, pdiv(p, q));  // result is x * P(x^2) / Q(x^2)
+}
+
 // Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
+template <>
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_float(const Packet& x) {
-  const Packet q0 = pset1<Packet>(-0.3333314359188079833984375f);
-  const Packet q2 = pset1<Packet>(0.19993579387664794921875f);
-  const Packet q4 = pset1<Packet>(-0.14209578931331634521484375f);
-  const Packet q6 = pset1<Packet>(0.1066047251224517822265625f);
-  const Packet q8 = pset1<Packet>(-7.5408883392810821533203125e-2f);
-  const Packet q10 = pset1<Packet>(4.3082617223262786865234375e-2f);
-  const Packet q12 = pset1<Packet>(-1.62907354533672332763671875e-2f);
-  const Packet q14 = pset1<Packet>(2.90188402868807315826416015625e-3f);
-
-  // Approximate atan(x) by a polynomial of the form
-  //   P(x) = x + x^3 * Q(x^2),
-  // where Q(x^2) is a 7th order polynomial in x^2.
-  // We evaluate even and odd terms in x^2 in parallel
-  // to take advantage of instruction level parallelism
-  // and hardware with multiple FMA units.
-
-  // note: if x == -0, this returns +0
-  const Packet x2 = pmul(x, x);
-  const Packet x4 = pmul(x2, x2);
-  Packet q_odd = pmadd(q14, x4, q10);
-  Packet q_even = pmadd(q12, x4, q8);
-  q_odd = pmadd(q_odd, x4, q6);
-  q_even = pmadd(q_even, x4, q4);
-  q_odd = pmadd(q_odd, x4, q2);
-  q_even = pmadd(q_even, x4, q0);
-  const Packet q = pmadd(q_odd, x2, q_even);
-  return pmadd(q, pmul(x, x2), x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
+  constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
+
+  constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
+                            8.109951019287109375e-01f};
+
+  Packet x2 = pmul(x, x);
+  Packet p = ppolevl<Packet, 2>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 3>::run(x2, beta);
+  return pmul(x, pdiv(p, q));
 }
 
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet& x_in) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x_in) {
   typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
 
-  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
+  constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
 
-  const Packet cst_signmask = pset1<Packet>(-0.0f);
-  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
   const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
 
   //   "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
   //   "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
-  //            calculated using Sollya.
+  //            calculated using Rminimax.
 
   const Packet abs_x = pabs(x_in);
   const Packet x_signmask = pand(x_in, cst_signmask);
   const Packet large_mask = pcmp_lt(cst_one, abs_x);
   const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
-  const Packet p = patan_reduced_float(x);
+  const Packet p = patan_reduced<Scalar>::run(x);
   // Apply transformations according to the range reduction masks.
   Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
   // Return correct sign
   return pxor(result, x_signmask);
 }
 
-// Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)]
-// with 2 ulp accuracy.
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_double(const Packet& x) {
-  const Packet q0 = pset1<Packet>(-0.33333333333330028569463365784031338989734649658203);
-  const Packet q2 = pset1<Packet>(0.199999999990664090177006073645316064357757568359375);
-  const Packet q4 = pset1<Packet>(-0.142857141937123677255527809393242932856082916259766);
-  const Packet q6 = pset1<Packet>(0.111111065991039953404495577160560060292482376098633);
-  const Packet q8 = pset1<Packet>(-9.0907812986129224452902519715280504897236824035645e-2);
-  const Packet q10 = pset1<Packet>(7.6900542950704739442180368769186316058039665222168e-2);
-  const Packet q12 = pset1<Packet>(-6.6410112986494976294871150912513257935643196105957e-2);
-  const Packet q14 = pset1<Packet>(5.6920144995467943094258345126945641823112964630127e-2);
-  const Packet q16 = pset1<Packet>(-4.3577020814990513608577771265117917209863662719727e-2);
-  const Packet q18 = pset1<Packet>(2.1244050233624342527427586446719942614436149597168e-2);
-
-  // Approximate atan(x) on [0:tan(pi/8)] by a polynomial of the form
-  //   P(x) = x + x^3 * Q(x^2),
-  // where Q(x^2) is a 9th order polynomial in x^2.
-  // We evaluate even and odd terms in x^2 in parallel
-  // to take advantage of instruction level parallelism
-  // and hardware with multiple FMA units.
-  const Packet x2 = pmul(x, x);
-  const Packet x4 = pmul(x2, x2);
-  Packet q_odd = pmadd(q18, x4, q14);
-  Packet q_even = pmadd(q16, x4, q12);
-  q_odd = pmadd(q_odd, x4, q10);
-  q_even = pmadd(q_even, x4, q8);
-  q_odd = pmadd(q_odd, x4, q6);
-  q_even = pmadd(q_even, x4, q4);
-  q_odd = pmadd(q_odd, x4, q2);
-  q_even = pmadd(q_even, x4, q0);
-  const Packet p = pmadd(q_odd, x2, q_even);
-  return pmadd(p, pmul(x, x2), x);
-}
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 9/8-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1.
 
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x_in) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+    This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
+  // Clamp the inputs to the range [-c, c] and set everything
+  // outside that range to 1.0. The value c is chosen as the smallest
+  // floating point argument such that the approximation is exactly 1.
+  // This saves clamping the value at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(8.01773357391357422f);
+  const T minus_clamp = pset1<T>(-8.01773357391357422f);
+#else
+  const T plus_clamp = pset1<T>(7.90738964080810547f);
+  const T minus_clamp = pset1<T>(-7.90738964080810547f);
+#endif
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
 
-  constexpr double kPiOverTwo = static_cast<double>(EIGEN_PI / 2);
-  constexpr double kPiOverFour = static_cast<double>(EIGEN_PI / 4);
-  constexpr double kTanPiOverEight = 0.4142135623730950488016887;
-  constexpr double kTan3PiOverEight = 2.4142135623730950488016887;
+  // The following rational approximation was generated by rminimax
+  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
+  // command:
+  // $ ratapprox --function="tanh(x)" --dom='[-8.67,8.67]' --num="odd"
+  //   --den="even" --type="[9,8]" --numF="[SG]" --denF="[SG]" --log
+  //   --output=tanhf.sollya --dispCoeff="dec"
 
-  const Packet cst_signmask = pset1<Packet>(-0.0);
-  const Packet cst_one = pset1<Packet>(1.0);
-  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
-  const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
-  const Packet cst_large = pset1<Packet>(kTan3PiOverEight);
-  const Packet cst_medium = pset1<Packet>(kTanPiOverEight);
-
-  // Use the same range reduction strategy (to [0:tan(pi/8)]) as the
-  // Cephes library:
-  //   "Large": For x >= tan(3*pi/8), use atan(1/x) = pi/2 - atan(x).
-  //   "Medium": For x in [tan(pi/8) : tan(3*pi/8)),
-  //             use atan(x) = pi/4 + atan((x-1)/(x+1)).
-  //   "Small": For x < tan(pi/8), approximate atan(x) directly by a polynomial
-  //            calculated using Sollya.
+  // The monomial coefficients of the numerator polynomial (odd).
+  constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
 
-  const Packet abs_x = pabs(x_in);
-  const Packet x_signmask = pand(x_in, cst_signmask);
-  const Packet large_mask = pcmp_lt(cst_large, abs_x);
-  const Packet medium_mask = pandnot(pcmp_lt(cst_medium, abs_x), large_mask);
+  // The monomial coefficients of the denominator polynomial (even).
+  constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
 
-  Packet x = abs_x;
-  x = pselect(large_mask, preciprocal(abs_x), x);
-  x = pselect(medium_mask, pdiv(psub(abs_x, cst_one), padd(abs_x, cst_one)), x);
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+  const T x3 = pmul(x2, x);
 
-  // Compute approximation of p ~= atan(x') where x' is the argument reduced to
-  // [0:tan(pi/8)].
-  Packet p = patan_reduced_double(x);
+  T p = ppolevl<T, 3>::run(x2, alpha);
+  T q = ppolevl<T, 4>::run(x2, beta);
+  // Take advantage of the fact that the constant term in p is 1 to compute
+  // x*(x^2*p + 1) = x^3 * p + x.
+  p = pmadd(x3, p, x);
 
-  // Apply transformations according to the range reduction masks.
-  p = pselect(large_mask, psub(cst_pi_over_two, p), p);
-  p = pselect(medium_mask, padd(cst_pi_over_four, p), p);
-  // Return the correct sign
-  return pxor(p, x_signmask);
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
 }
 
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
-    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    This uses a 19/18-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
     outside of which tanh(x) = +/-1 in single precision. The input is clamped
     to the range [-c, c]. The value c is chosen as the smallest value where
-    the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004]
-    the approximation tanh(x) ~= x is used for better accuracy as x tends to zero.
+    the approximation evaluates to exactly 1.
 
     This implementation works on both scalars and packets.
 */
 template <typename T>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
-  // Clamp the inputs to the range [-c, c]
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) {
+  // Clamp the inputs to the range [-c, c] and set everything
+  // outside that range to 1.0. The value c is chosen as the smallest
+  // floating point argument such that the approximation is exactly 1.
+  // This saves clamping the value at the end.
 #ifdef EIGEN_VECTORIZE_FMA
-  const T plus_clamp = pset1<T>(7.99881172180175781f);
-  const T minus_clamp = pset1<T>(-7.99881172180175781f);
+  const T plus_clamp = pset1<T>(17.6610191624600077);
+  const T minus_clamp = pset1<T>(-17.6610191624600077);
 #else
-  const T plus_clamp = pset1<T>(7.90531110763549805f);
-  const T minus_clamp = pset1<T>(-7.90531110763549805f);
+  const T plus_clamp = pset1<T>(17.714196154005176);
+  const T minus_clamp = pset1<T>(-17.714196154005176);
 #endif
-  const T tiny = pset1<T>(0.0004f);
   const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
-  const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
+
+  // The following rational approximation was generated by rminimax
+  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
+  // command:
+  // $ ./ratapprox --function="tanh(x)" --dom='[-18.72,18.72]'
+  //   --num="odd" --den="even" --type="[19,18]" --numF="[D]"
+  //   --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
+
   // The monomial coefficients of the numerator polynomial (odd).
-  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
-  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
-  const T alpha_5 = pset1<T>(1.48572235717979e-05f);
-  const T alpha_7 = pset1<T>(5.12229709037114e-08f);
-  const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
-  const T alpha_11 = pset1<T>(2.00018790482477e-13f);
-  const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
+  constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
+                              4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
+                              9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
 
   // The monomial coefficients of the denominator polynomial (even).
-  const T beta_0 = pset1<T>(4.89352518554385e-03f);
-  const T beta_2 = pset1<T>(2.26843463243900e-03f);
-  const T beta_4 = pset1<T>(1.18534705686654e-04f);
-  const T beta_6 = pset1<T>(1.19825839466702e-06f);
+  constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
+                             1.293019623712687916e-13, 1.123643448069621992e-10,
+                             4.492975677839633985e-08, 8.785185266237658698e-06,
+                             8.295161192716231542e-04, 3.437448108450402717e-02,
+                             4.851805297361760360e-01, 1.0};
 
   // Since the polynomials are odd/even, we need x^2.
   const T x2 = pmul(x, x);
+  const T x3 = pmul(x2, x);
 
-  // Evaluate the numerator polynomial p.
-  T p = pmadd(x2, alpha_13, alpha_11);
-  p = pmadd(x2, p, alpha_9);
-  p = pmadd(x2, p, alpha_7);
-  p = pmadd(x2, p, alpha_5);
-  p = pmadd(x2, p, alpha_3);
-  p = pmadd(x2, p, alpha_1);
-  p = pmul(x, p);
+  // Evaluate the numerator polynomial p and the
+  // denominator polynomial q, both in x^2.
+  T p = ppolevl<T, 8>::run(x2, alpha);
+  T q = ppolevl<T, 9>::run(x2, beta);
+  // Take advantage of the fact that the constant term in p is 1 to compute
+  // x*(x^2*p + 1) = x^3 * p + x.
+  p = pmadd(x3, p, x);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
 
-  // Evaluate the denominator polynomial q.
-  T q = pmadd(x2, beta_6, beta_4);
-  q = pmadd(x2, q, beta_2);
-  q = pmadd(x2, q, beta_0);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  // For |x| in [0:0.5] we use a polynomial approximation of the form
+  // P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
+  constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
+                             0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
+  const Packet x2 = pmul(x, x);
+  const Packet x3 = pmul(x, x2);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  p = pmadd(x3, p, x);
+
+  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet half = pset1<Packet>(0.5f);
+  const Packet one = pset1<Packet>(1.0f);
+  Packet r = pdiv(padd(one, x), psub(one, x));
+  r = pmul(half, plog(r));
 
-  // Divide the numerator by the denominator.
-  return pselect(tiny_mask, x, pdiv(p, q));
+  const Packet x_gt_half = pcmp_le(half, pabs(x));
+  const Packet x_eq_one = pcmp_eq(one, pabs(x));
+  const Packet x_gt_one = pcmp_lt(one, pabs(x));
+  const Packet sign_mask = pset1<Packet>(-0.0f);
+  const Packet x_sign = pand(sign_mask, x);
+  const Packet inf = pset1<Packet>(std::numeric_limits<float>::infinity());
+  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, r, p)));
 }
 
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x) {
   typedef typename unpacket_traits<Packet>::type Scalar;
-  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
-  const Packet half = pset1<Packet>(0.5f);
-  const Packet x_gt_half = pcmp_le(half, pabs(x));
-  // For |x| in [0:0.5] we use a polynomial approximation of the form
-  // P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )).
-  const Packet C3 = pset1<Packet>(0.3333373963832855224609375f);
-  const Packet C5 = pset1<Packet>(0.1997792422771453857421875f);
-  const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
-  const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
-  const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+  // For x in [-0.5:0.5] we use a rational approximation of the form
+  // R(x) = x + x^3*P(x^2)/Q(x^2), where P is of order 4 and Q is of order 5.
+  constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
+                              -2.5949536095445679e-01, 1.2306328729812676e-01};
+
+  constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02,  -4.2828141436397615e-01,
+                             9.8733495886883648e-01,  -1.0000000000000000e+00, 3.6918986189438030e-01};
+
   const Packet x2 = pmul(x, x);
-  Packet p = pmadd(C11, x2, C9);
-  p = pmadd(x2, p, C7);
-  p = pmadd(x2, p, C5);
-  p = pmadd(x2, p, C3);
-  p = pmadd(pmul(x, x2), p, x);
+  const Packet x3 = pmul(x, x2);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 5>::run(x2, beta);
+  Packet y_small = pmadd(x3, pdiv(p, q), x);
 
   // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
-  const Packet one = pset1<Packet>(1.0f);
-  Packet r = pdiv(padd(one, x), psub(one, x));
-  r = pmul(half, plog(r));
-  return pselect(x_gt_half, r, p);
+  const Packet half = pset1<Packet>(0.5);
+  const Packet one = pset1<Packet>(1.0);
+  Packet y_large = pdiv(padd(one, x), psub(one, x));
+  y_large = pmul(half, plog(y_large));
+
+  const Packet x_gt_half = pcmp_le(half, pabs(x));
+  const Packet x_eq_one = pcmp_eq(one, pabs(x));
+  const Packet x_gt_one = pcmp_lt(one, pabs(x));
+  const Packet sign_mask = pset1<Packet>(-0.0);
+  const Packet x_sign = pand(sign_mask, x);
+  const Packet inf = pset1<Packet>(std::numeric_limits<double>::infinity());
+  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small)));
 }
 
 template <typename Packet>
@@ -1253,7 +1689,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const
 }
 
 template <typename Packet>
-struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
                                            !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
     using Scalar = typename unpacket_traits<Packet>::type;
@@ -1269,7 +1706,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
 };
 
 template <typename Packet>
-struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
                                            NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
                                            NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
@@ -1288,7 +1726,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
 };
 
 template <typename Packet>
-struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
                                            !NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
                                            NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
@@ -1303,7 +1742,8 @@ struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<P
 
 // \internal \returns the the sign of a complex number z, defined as z / abs(z).
 template <typename Packet>
-struct psign_impl<Packet, std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
                                            unpacket_traits<Packet>::vectorizable>> {
   static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
     typedef typename unpacket_traits<Packet>::type Scalar;
@@ -1343,7 +1783,7 @@ struct psign_impl<Packet, std::enable_if_t<NumTraits<typename unpacket_traits<Pa
 // This function splits x into the nearest integer n and fractional part r,
 // such that x = n + r holds exactly.
 template <typename Packet>
-EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
   n = pround(x);
   r = psub(x, n);
 }
@@ -1351,7 +1791,7 @@ EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
 // This function computes the sum {s, r}, such that x + y = s_hi + s_lo
 // holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
 template <typename Packet>
-EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
   s_hi = padd(x, y);
   const Packet t = psub(s_hi, x);
   s_lo = psub(y, t);
@@ -1363,11 +1803,18 @@ EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s
 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
 // p_hi = fl(x * y).
 template <typename Packet>
-EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
   p_hi = pmul(x, y);
   p_lo = pmsub(x, y, p_hi);
 }
 
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  return pmsub(x, y, xy);
+}
+
 #else
 
 // This function implements the Veltkamp splitting. Given a floating point
@@ -1376,9 +1823,9 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi,
 // This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
 // 3rd edition, Birkh\"auser, 2016.
 template <typename Packet>
-EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
   typedef typename unpacket_traits<Packet>::type Scalar;
-  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
   const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
   const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
   Packet rho = psub(x, gamma);
@@ -1391,7 +1838,7 @@ EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packe
 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
 // p_hi = fl(x * y).
 template <typename Packet>
-EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
   Packet x_hi, x_lo, y_hi, y_lo;
   veltkamp_splitting(x, x_hi, x_lo);
   veltkamp_splitting(y, y_hi, y_lo);
@@ -1403,6 +1850,21 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi,
   p_lo = pmadd(x_lo, y_lo, p_lo);
 }
 
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+  return p_lo;
+}
+
 #endif  // EIGEN_VECTORIZE_FMA
 
 // This function implements Dekker's algorithm for the addition
@@ -1412,8 +1874,8 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi,
 // This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
 // 3rd edition, Birkh\"auser, 2016.
 template <typename Packet>
-EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
-                                Packet& s_hi, Packet& s_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                  const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
   const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
   Packet r_hi_1, r_lo_1;
   fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
@@ -1431,8 +1893,8 @@ EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Pa
 // This is a version of twosum for double word numbers,
 // which assumes that |x_hi| >= |y_hi|.
 template <typename Packet>
-EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
-                                     Packet& s_hi, Packet& s_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                       const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
   Packet r_hi, r_lo;
   fast_twosum(x_hi, y_hi, r_hi, r_lo);
   const Packet s = padd(padd(y_lo, r_lo), x_lo);
@@ -1443,8 +1905,8 @@ EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, con
 // double word number {y_hi, y_lo} number, with the assumption
 // that |x| >= |y_hi|.
 template <typename Packet>
-EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo, Packet& s_hi,
-                                     Packet& s_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo,
+                                                       Packet& s_hi, Packet& s_lo) {
   Packet r_hi, r_lo;
   fast_twosum(x, y_hi, r_hi, r_lo);
   const Packet s = padd(y_lo, r_lo);
@@ -1460,7 +1922,8 @@ EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const
 // This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
 // 3rd edition, Birkh\"auser, 2016.
 template <typename Packet>
-EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& p_hi, Packet& p_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+                                                   Packet& p_hi, Packet& p_lo) {
   Packet c_hi, c_lo1;
   twoprod(x_hi, y, c_hi, c_lo1);
   const Packet c_lo2 = pmul(x_lo, y);
@@ -1477,8 +1940,8 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const P
 // of less than 2*2^{-2p}, where p is the number of significand bit
 // in the floating point type.
 template <typename Packet>
-EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
-                                 Packet& p_hi, Packet& p_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                   const Packet& y_lo, Packet& p_hi, Packet& p_lo) {
   Packet p_hi_hi, p_hi_lo;
   twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
   Packet p_lo_hi, p_lo_lo;
@@ -1487,11 +1950,12 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const P
 }
 
 // This function implements the division of double word {x_hi, x_lo}
-// by float y. This is Algorithm 15 from "Tight and rigourous error bounds
+// by float y. This is Algorithm 15 from "Tight and rigorous error bounds
 // for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
 // 2017. https://hal.archives-ouvertes.fr/hal-01351529
 template <typename Packet>
-void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& z_hi, Packet& z_lo) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+                                                             Packet& z_hi, Packet& z_lo) {
   const Packet t_hi = pdiv(x_hi, y);
   Packet pi_hi, pi_lo;
   twoprod(t_hi, y, pi_hi, pi_lo);
@@ -1506,88 +1970,60 @@ void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
 template <typename Scalar>
 struct accurate_log2 {
   template <typename Packet>
-  EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
     log2_x_hi = plog2(x);
     log2_x_lo = pzero(x);
   }
 };
 
 // This specialization uses a more accurate algorithm to compute log2(x) for
-// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10.
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
 // This additional accuracy is needed to counter the error-magnification
 // inherent in multiplying by a potentially large exponent in pow(x,y).
-// The minimax polynomial used was calculated using the Sollya tool.
-// See sollya.org.
+// The minimax polynomial used was calculated using the Rminimax tool,
+// see https://gitlab.inria.fr/sfilip/rminimax.
+// Command line:
+//   $ ratapprox --function="log2(1+x)/x"  --dom='[-0.2929,0.41422]'
+//   --type=[10,0]
+//       --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
+//
+// The resulting implementation of pow(x,y) is accurate to 3 ulps.
 template <>
 struct accurate_log2<float> {
   template <typename Packet>
-  EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
-    // The function log(1+x)/x is approximated in the interval
-    // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
-    //  Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
-    // where the degree 6 polynomial P(x) is evaluated in single precision,
-    // while the remaining 4 terms of Q(x), as well as the final multiplication by x
-    // to reconstruct log(1+x) are evaluated in extra precision using
-    // double word arithmetic. C0 through C3 are extra precise constants
-    // stored as double words.
-    //
-    // The polynomial coefficients were calculated using Sollya commands:
-    // > n = 10;
-    // > f = log2(1+x)/x;
-    // > interval = [sqrt(0.5)-1;sqrt(2)-1];
-    // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
-
-    const Packet p6 = pset1<Packet>(9.703654795885e-2f);
-    const Packet p5 = pset1<Packet>(-0.1690667718648f);
-    const Packet p4 = pset1<Packet>(0.1720575392246f);
-    const Packet p3 = pset1<Packet>(-0.1789081543684f);
-    const Packet p2 = pset1<Packet>(0.2050433009862f);
-    const Packet p1 = pset1<Packet>(-0.2404672354459f);
-    const Packet p0 = pset1<Packet>(0.2885761857032f);
-
-    const Packet C3_hi = pset1<Packet>(-0.360674142838f);
-    const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
-    const Packet C2_hi = pset1<Packet>(0.480897903442f);
-    const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
-    const Packet C1_hi = pset1<Packet>(-0.721347510815f);
-    const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
-    const Packet C0_hi = pset1<Packet>(1.44269502163f);
-    const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // Split the two lowest order constant coefficients into double-word representation.
+    constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
+    constexpr float kC0_hi = static_cast<float>(kC0);
+    constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
+    const Packet c0_hi = pset1<Packet>(kC0_hi);
+    const Packet c0_lo = pset1<Packet>(kC0_lo);
+
+    constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
+    constexpr float kC1_hi = static_cast<float>(kC1);
+    constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
+    const Packet c1_hi = pset1<Packet>(kC1_hi);
+    const Packet c1_lo = pset1<Packet>(kC1_lo);
+
+    constexpr float c[] = {
+        9.7010828554630279541015625e-02,  -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
+        -1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01,  -2.4046677350997924804687500e-01,
+        2.8857553005218505859375000e-01,  -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
+
+    // Evaluate the higher order terms in the polynomial using
+    // standard arithmetic.
     const Packet one = pset1<Packet>(1.0f);
-
     const Packet x = psub(z, one);
-    // Evaluate P(x) in working precision.
-    // We evaluate it in multiple parts to improve instruction level
-    // parallelism.
-    Packet x2 = pmul(x, x);
-    Packet p_even = pmadd(p6, x2, p4);
-    p_even = pmadd(p_even, x2, p2);
-    p_even = pmadd(p_even, x2, p0);
-    Packet p_odd = pmadd(p5, x2, p3);
-    p_odd = pmadd(p_odd, x2, p1);
-    Packet p = pmadd(p_odd, x, p_even);
-
-    // Now evaluate the low-order tems of Q(x) in double word precision.
-    // In the following, due to the alternating signs and the fact that
-    // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
-    // fast_twosum instead of the slower twosum.
-    Packet q_hi, q_lo;
-    Packet t_hi, t_lo;
-    // C3 + x * p(x)
-    twoprod(p, x, t_hi, t_lo);
-    fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
-    // C2 + x * p(x)
-    twoprod(q_hi, q_lo, x, t_hi, t_lo);
-    fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
-    // C1 + x * p(x)
-    twoprod(q_hi, q_lo, x, t_hi, t_lo);
-    fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
-    // C0 + x * p(x)
-    twoprod(q_hi, q_lo, x, t_hi, t_lo);
-    fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
-
-    // log(z) ~= x * Q(x)
-    twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
+    Packet p = ppolevl<Packet, 8>::run(x, c);
+    // Evaluate the final two steps in Horner's rule using double-word
+    // arithmetic.
+    Packet p_hi, p_lo;
+    twoprod(x, p, p_hi, p_lo);
+    fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
+    twoprod(p_hi, p_lo, x, p_hi, p_lo);
+    fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
+    // Multiply by x to recover log2(z).
+    twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
   }
 };
 
@@ -1601,7 +2037,7 @@ struct accurate_log2<float> {
 template <>
 struct accurate_log2<double> {
   template <typename Packet>
-  EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
     // We use a transformation of variables:
     //    r = c * (x-1) / (x+1),
     // such that
@@ -1682,154 +2118,20 @@ struct accurate_log2<double> {
   }
 };
 
-// This function computes exp2(x) (i.e. 2**x).
-template <typename Scalar>
-struct fast_accurate_exp2 {
-  template <typename Packet>
-  EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
-    // TODO(rmlarsen): Add a pexp2 packetop.
-    return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
-  }
-};
-
-// This specialization uses a faster algorithm to compute exp2(x) for floats
-// in [-0.5;0.5] with a relative accuracy of 1 ulp.
-// The minimax polynomial used was calculated using the Sollya tool.
-// See sollya.org.
-template <>
-struct fast_accurate_exp2<float> {
-  template <typename Packet>
-  EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
-    // This function approximates exp2(x) by a degree 6 polynomial of the form
-    // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
-    // single precision, and the remaining steps are evaluated with extra precision using
-    // double word arithmetic. C is an extra precise constant stored as a double word.
-    //
-    // The polynomial coefficients were calculated using Sollya commands:
-    // > n = 6;
-    // > f = 2^x;
-    // > interval = [-0.5;0.5];
-    // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating);
-
-    const Packet p4 = pset1<Packet>(1.539513905e-4f);
-    const Packet p3 = pset1<Packet>(1.340007293e-3f);
-    const Packet p2 = pset1<Packet>(9.618283249e-3f);
-    const Packet p1 = pset1<Packet>(5.550328270e-2f);
-    const Packet p0 = pset1<Packet>(0.2402264923f);
-
-    const Packet C_hi = pset1<Packet>(0.6931471825f);
-    const Packet C_lo = pset1<Packet>(2.36836577e-08f);
-    const Packet one = pset1<Packet>(1.0f);
-
-    // Evaluate P(x) in working precision.
-    // We evaluate even and odd parts of the polynomial separately
-    // to gain some instruction level parallelism.
-    Packet x2 = pmul(x, x);
-    Packet p_even = pmadd(p4, x2, p2);
-    Packet p_odd = pmadd(p3, x2, p1);
-    p_even = pmadd(p_even, x2, p0);
-    Packet p = pmadd(p_odd, x, p_even);
-
-    // Evaluate the remaining terms of Q(x) with extra precision using
-    // double word arithmetic.
-    Packet p_hi, p_lo;
-    // x * p(x)
-    twoprod(p, x, p_hi, p_lo);
-    // C + x * p(x)
-    Packet q1_hi, q1_lo;
-    twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
-    // x * (C + x * p(x))
-    Packet q2_hi, q2_lo;
-    twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
-    // 1 + x * (C + x * p(x))
-    Packet q3_hi, q3_lo;
-    // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
-    // for adding it to unity here.
-    fast_twosum(one, q2_hi, q3_hi, q3_lo);
-    return padd(q3_hi, padd(q2_lo, q3_lo));
-  }
-};
-
-// in [-0.5;0.5] with a relative accuracy of 1 ulp.
-// The minimax polynomial used was calculated using the Sollya tool.
-// See sollya.org.
-template <>
-struct fast_accurate_exp2<double> {
-  template <typename Packet>
-  EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
-    // This function approximates exp2(x) by a degree 10 polynomial of the form
-    // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
-    // single precision, and the remaining steps are evaluated with extra precision using
-    // double word arithmetic. C is an extra precise constant stored as a double word.
-    //
-    // The polynomial coefficients were calculated using Sollya commands:
-    // > n = 11;
-    // > f = 2^x;
-    // > interval = [-0.5;0.5];
-    // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating);
-
-    const Packet p9 = pset1<Packet>(4.431642109085495276e-10);
-    const Packet p8 = pset1<Packet>(7.073829923303358410e-9);
-    const Packet p7 = pset1<Packet>(1.017822306737031311e-7);
-    const Packet p6 = pset1<Packet>(1.321543498017646657e-6);
-    const Packet p5 = pset1<Packet>(1.525273342728892877e-5);
-    const Packet p4 = pset1<Packet>(1.540353045780084423e-4);
-    const Packet p3 = pset1<Packet>(1.333355814685869807e-3);
-    const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
-    const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
-    const Packet p0 = pset1<Packet>(0.240226506959101332);
-    const Packet C_hi = pset1<Packet>(0.693147180559945286);
-    const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
-    const Packet one = pset1<Packet>(1.0);
-
-    // Evaluate P(x) in working precision.
-    // We evaluate even and odd parts of the polynomial separately
-    // to gain some instruction level parallelism.
-    Packet x2 = pmul(x, x);
-    Packet p_even = pmadd(p8, x2, p6);
-    Packet p_odd = pmadd(p9, x2, p7);
-    p_even = pmadd(p_even, x2, p4);
-    p_odd = pmadd(p_odd, x2, p5);
-    p_even = pmadd(p_even, x2, p2);
-    p_odd = pmadd(p_odd, x2, p3);
-    p_even = pmadd(p_even, x2, p0);
-    p_odd = pmadd(p_odd, x2, p1);
-    Packet p = pmadd(p_odd, x, p_even);
-
-    // Evaluate the remaining terms of Q(x) with extra precision using
-    // double word arithmetic.
-    Packet p_hi, p_lo;
-    // x * p(x)
-    twoprod(p, x, p_hi, p_lo);
-    // C + x * p(x)
-    Packet q1_hi, q1_lo;
-    twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
-    // x * (C + x * p(x))
-    Packet q2_hi, q2_lo;
-    twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
-    // 1 + x * (C + x * p(x))
-    Packet q3_hi, q3_lo;
-    // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
-    // for adding it to unity here.
-    fast_twosum(one, q2_hi, q3_hi, q3_lo);
-    return padd(q3_hi, padd(q2_lo, q3_lo));
-  }
-};
-
 // This function implements the non-trivial case of pow(x,y) where x is
 // positive and y is (possibly) non-integer.
 // Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
 // TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
 // easier to specialize or turn off for specific types and/or backends.x
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   // Split x into exponent e_x and mantissa m_x.
   Packet e_x;
   Packet m_x = pfrexp(x, e_x);
 
   // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
-  EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
+  constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
   const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
   m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
   e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
@@ -1862,210 +2164,118 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
 
   // We now have an accurate split of f = n_z + r_z and can compute
   //   x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
-  // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy
-  // using a specialized algorithm. Multiplication by the second factor can
-  // be done exactly using pldexp(), since it is an integer power of 2.
-  const Packet e_r = fast_accurate_exp2<Scalar>()(r_z);
-  return pldexp(e_r, n_z);
+  // Multiplication by the second factor can be done exactly using pldexp(), since
+  // it is an integer power of 2.
+  const Packet e_r = generic_exp2(r_z);
+
+  // Since we know that e_r is in [1/sqrt(2); sqrt(2)], we can use the fast version
+  // of pldexp to multiply by 2**{n_z} when |n_z| is sufficiently small.
+  constexpr Scalar kPldExpThresh = std::numeric_limits<Scalar>::max_exponent - 2;
+  const Packet pldexp_fast_unsafe = pcmp_lt(pset1<Packet>(kPldExpThresh), pabs(n_z));
+  if (predux_any(pldexp_fast_unsafe)) {
+    return pldexp(e_r, n_z);
+  }
+  return pldexp_fast(e_r, n_z);
 }
 
 // Generic implementation of pow(x,y).
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
+    const Packet& x, const Packet& y) {
   typedef typename unpacket_traits<Packet>::type Scalar;
 
-  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
-  const Packet cst_neg_inf = pset1<Packet>(-NumTraits<Scalar>::infinity());
+  const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
   const Packet cst_zero = pset1<Packet>(Scalar(0));
   const Packet cst_one = pset1<Packet>(Scalar(1));
   const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
 
-  const Packet abs_x = pabs(x);
+  const Packet x_abs = pabs(x);
+  Packet pow = generic_pow_impl(x_abs, y);
+
+  // In the following we enforce the special case handling prescribed in
+  // https://en.cppreference.com/w/cpp/numeric/math/pow.
+
   // Predicates for sign and magnitude of x.
-  const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero);
+  const Packet x_is_negative = pcmp_lt(x, cst_zero);
+  const Packet x_is_zero = pcmp_eq(x, cst_zero);
+  const Packet x_is_one = pcmp_eq(x, cst_one);
   const Packet x_has_signbit = psignbit(x);
-  const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
-  const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
-  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
-  const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
-  const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
-  const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
-  const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg);
-  const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg);
-  const Packet x_is_nan = pisnan(x);
+  const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
+  const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
 
   // Predicates for sign and magnitude of y.
-  const Packet abs_y = pabs(y);
+  const Packet y_abs = pabs(y);
+  const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
+  const Packet y_is_negative = pcmp_lt(y, cst_zero);
+  const Packet y_is_zero = pcmp_eq(y, cst_zero);
   const Packet y_is_one = pcmp_eq(y, cst_one);
-  const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero);
-  const Packet y_is_neg = pcmp_lt(y, cst_zero);
-  const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg));
-  const Packet y_is_nan = pisnan(y);
-  const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf);
-  EIGEN_CONSTEXPR Scalar huge_exponent =
-      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
-  const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
-
-  // Predicates for whether y is integer and/or even.
-  const Packet y_is_int = pcmp_eq(pfloor(y), y);
+  // Predicates for whether y is integer and odd/even.
+  const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
   const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
   const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
-
-  // Predicates encoding special cases for the value of pow(x,y)
-  const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf);
-  const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
-  const Packet pow_is_one =
-      por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
-  const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)),
-                                     pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)),
-                                 pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg));
-  const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
-                                    pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
-                                pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
-  const Packet pow_is_neg_zero = pand(pandnot(y_is_int, y_is_even),
-                                      por(pand(y_is_neg, pand(abs_x_is_inf, x_is_neg)), pand(y_is_pos, x_is_neg_zero)));
-  const Packet inf_val =
-      pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even),
-              cst_neg_inf, cst_pos_inf);
-  // General computation of pow(x,y) for positive x or negative x and integer y.
-  const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
-  const Packet pow_abs = generic_pow_impl(abs_x, y);
-  return pselect(y_is_one, x,
-                 pselect(pow_is_one, cst_one,
-                         pselect(pow_is_nan, cst_nan,
-                                 pselect(pow_is_inf, inf_val,
-                                         pselect(pow_is_neg_zero, pnegate(cst_zero),
-                                                 pselect(pow_is_zero, cst_zero,
-                                                         pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))));
+  const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
+  // Smallest exponent for which (1 + epsilon) overflows to infinity.
+  constexpr Scalar huge_exponent =
+      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
+  const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
+
+  // *  pow(base, exp) returns NaN if base is finite and negative
+  //    and exp is finite and non-integer.
+  pow = pselect(pandnot(x_is_negative, y_is_int), cst_nan, pow);
+
+  // * pow(±0, exp), where exp is negative, finite, and is an even integer or
+  // a non-integer, returns +∞
+  // * pow(±0, exp), where exp is positive non-integer or a positive even
+  // integer, returns +0
+  // * pow(+0, exp), where exp is a negative odd integer, returns +∞
+  // * pow(-0, exp), where exp is a negative odd integer, returns -∞
+  // * pow(+0, exp), where exp is a positive odd integer, returns +0
+  // * pow(-0, exp), where exp is a positive odd integer, returns -0
+  // Sign is flipped by the rule below.
+  pow = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), pow);
+
+  // pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
+  // and exp is an odd integer exponent.
+  pow = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(pow), pow);
+
+  // * pow(base, -∞) returns +∞ for any |base|<1
+  // * pow(base, -∞) returns +0 for any |base|>1
+  // * pow(base, +∞) returns +0 for any |base|<1
+  // * pow(base, +∞) returns +∞ for any |base|>1
+  // * pow(±0, -∞) returns +∞
+  // * pow(-1, ±∞) returns 1
+  Packet inf_y_val = pselect(por(pand(y_is_negative, x_is_zero), pxor(y_is_negative, x_abs_gt_one)), cst_inf, cst_zero);
+  inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
+  pow = pselect(y_abs_is_huge, inf_y_val, pow);
+
+  // * pow(+∞, exp) returns +0 for any negative exp
+  // * pow(+∞, exp) returns +∞ for any positive exp
+  // * pow(-∞, exp) returns -0 if exp is a negative odd integer.
+  // * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
+  //     even integer.
+  // * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
+  // * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
+  //     even integer.
+  auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
+  auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
+  pow = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), pow);
+
+  // All cases of NaN inputs return NaN, except the two below.
+  pow = pselect(por(pisnan(x), pisnan(y)), cst_nan, pow);
+
+  // * pow(base, 1) returns base.
+  // * pow(base, +/-0) returns 1, regardless of base, even NaN.
+  // * pow(+1, exp) returns 1, regardless of exponent, even NaN.
+  pow = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, pow));
+
+  return pow;
 }
 
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Packet, int N>
-struct ppolevl {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
-                                                          const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
-  }
-};
-
-template <typename Packet>
-struct ppolevl<Packet, 0> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
-                                                          const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_UNUSED_VARIABLE(x);
-    return pset1<Packet>(coeff[0]);
-  }
-};
-
-/* chbevl (modified for Eigen)
- *
- *     Evaluate Chebyshev series
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N], chebevl();
- *
- * y = chbevl( x, coef, N );
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates the series
- *
- *        N-1
- *         - '
- *  y  =   >   coef[i] T (x/2)
- *         -            i
- *        i=0
- *
- * of Chebyshev polynomials Ti at argument x/2.
- *
- * Coefficients are stored in reverse order, i.e. the zero
- * order term is last in the array.  Note N is the number of
- * coefficients, not the order.
- *
- * If coefficients are for the interval a to b, x must
- * have been transformed to x -> 2(2x - b - a)/(b-a) before
- * entering the routine.  This maps x from (a, b) to (-1, 1),
- * over which the Chebyshev polynomials are defined.
- *
- * If the coefficients are for the inverted interval, in
- * which (a, b) is mapped to (1/b, 1/a), the transformation
- * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
- * this becomes x -> 4a/x - 1.
- *
- *
- *
- * SPEED:
- *
- * Taking advantage of the recurrence properties of the
- * Chebyshev polynomials, the routine requires one more
- * addition per loop than evaluating a nested polynomial of
- * the same degree.
- *
- */
-
-template <typename Packet, int N>
-struct pchebevl {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
-                                                          const typename unpacket_traits<Packet>::type coef[]) {
-    typedef typename unpacket_traits<Packet>::type Scalar;
-    Packet b0 = pset1<Packet>(coef[0]);
-    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
-    Packet b2;
-
-    for (int i = 1; i < N; i++) {
-      b2 = b1;
-      b1 = b0;
-      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
-    }
-
-    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
-  }
-};
+template <typename Scalar>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
+    const Scalar& x, const Scalar& y) {
+  return numext::pow(x, y);
+}
 
 namespace unary_pow {
 
@@ -2148,35 +2358,36 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const Scal
 }
 
 template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x,
-                                                     const typename unpacket_traits<Packet>::type& exponent) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
+    const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
   const Packet exponent_packet = pset1<Packet>(exponent);
   return generic_pow_impl(x, exponent_packet);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
+    const Scalar& x, const Scalar& exponent) {
+  return numext::pow(x, exponent);
+}
+
 template <typename Packet, typename ScalarExponent>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
                                                                          const ScalarExponent& exponent) {
   using Scalar = typename unpacket_traits<Packet>::type;
 
   // non-integer base and exponent case
-
-  const Scalar pos_zero = Scalar(0);
-  const Scalar all_ones = ptrue<Scalar>(Scalar());
-  const Scalar pos_one = Scalar(1);
-  const Scalar pos_inf = NumTraits<Scalar>::infinity();
-
   const Packet cst_pos_zero = pzero(x);
-  const Packet cst_pos_one = pset1<Packet>(pos_one);
-  const Packet cst_pos_inf = pset1<Packet>(pos_inf);
+  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_true = ptrue<Packet>(x);
 
   const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
   const bool exponent_is_neg = exponent < ScalarExponent(0);
   const bool exponent_is_pos = exponent > ScalarExponent(0);
 
-  const Packet exp_is_not_fin = pset1<Packet>(exponent_is_not_fin ? all_ones : pos_zero);
-  const Packet exp_is_neg = pset1<Packet>(exponent_is_neg ? all_ones : pos_zero);
-  const Packet exp_is_pos = pset1<Packet>(exponent_is_pos ? all_ones : pos_zero);
+  const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
+  const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
+  const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
   const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
   const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
 
@@ -2208,26 +2419,19 @@ template <typename Packet, typename ScalarExponent,
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
   using Scalar = typename unpacket_traits<Packet>::type;
 
-  // singed integer base, signed integer exponent case
+  // signed integer base, signed integer exponent case
 
   // This routine handles negative exponents.
   // The return value is either 0, 1, or -1.
-
-  const Scalar pos_zero = Scalar(0);
-  const Scalar all_ones = ptrue<Scalar>(Scalar());
-  const Scalar pos_one = Scalar(1);
-
-  const Packet cst_pos_one = pset1<Packet>(pos_one);
-
+  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
   const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
-
-  const Packet exp_is_odd = pset1<Packet>(exponent_is_odd ? all_ones : pos_zero);
+  const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
 
   const Packet abs_x = pabs(x);
   const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
 
   Packet result = pselect(exp_is_odd, x, abs_x);
-  result = pand(abs_x_is_one, result);
+  result = pselect(abs_x_is_one, result, pzero<Packet>(x));
   return result;
 }
 
@@ -2264,7 +2468,12 @@ struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
     const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
     if (exponent_is_integer) {
-      return unary_pow::int_pow(x, exponent);
+      // The simple recursive doubling implementation is only accurate to 3 ulps
+      // for integer exponents in [-3:7]. Since this is a common case, we
+      // specialize it here.
+      bool use_repeated_squaring =
+          (exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
+      return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
     } else {
       Packet result = unary_pow::gen_pow(x, exponent);
       result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
@@ -2301,6 +2510,124 @@ struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {
   }
 };
 
+// This function computes exp2(x) = exp(ln(2) * x).
+// To improve accuracy, the product ln(2)*x is computed using the twoprod
+// algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is
+// computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This
+// correction step reduces the maximum absolute error as follows:
+//
+// type   | max error (simple product) | max error (twoprod) |
+// -----------------------------------------------------------
+// float  |       35 ulps              |       4 ulps        |
+// double |      363 ulps              |     110 ulps        |
+//
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  constexpr int max_exponent = std::numeric_limits<Scalar>::max_exponent;
+  constexpr int digits = std::numeric_limits<Scalar>::digits;
+  constexpr Scalar max_cap = Scalar(max_exponent + 1);
+  constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1);
+  Packet x = pmax(pmin(_x, pset1<Packet>(max_cap)), pset1<Packet>(min_cap));
+  Packet p_hi, p_lo;
+  twoprod(pset1<Packet>(Scalar(EIGEN_LN2)), x, p_hi, p_lo);
+  Packet exp2_hi = pexp(p_hi);
+  Packet exp2_lo = padd(pset1<Packet>(Scalar(1)), p_lo);
+  return pmul(exp2_hi, exp2_lo);
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  using IntType = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
+  // Adds and subtracts 2^(digits - 1) to/from |a| to force rounding; the sign of a is restored afterwards.
+  const IntType kLimit = IntType(1) << (NumTraits<Scalar>::digits() - 1);
+  const Packet cst_limit = pset1<Packet>(static_cast<Scalar>(kLimit));
+  Packet abs_a = pabs(a);
+  Packet sign_a = pandnot(a, abs_a);
+  Packet rint_a = padd(abs_a, cst_limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(rint_a);
+  rint_a = psub(rint_a, cst_limit);
+  rint_a = por(rint_a, sign_a);
+  // If greater than limit (or NaN), simply return a.
+  Packet mask = pcmp_lt(abs_a, cst_limit);
+  Packet result = pselect(mask, rint_a, a);
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  const Packet cst_1 = pset1<Packet>(Scalar(1));
+  Packet rint_a = generic_rint(a);
+  // if a < rint(a), then rint(a) == ceil(a)
+  Packet mask = pcmp_lt(a, rint_a);
+  Packet offset = pand(cst_1, mask);
+  Packet result = psub(rint_a, offset);
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  const Packet cst_1 = pset1<Packet>(Scalar(1));
+  const Packet sign_mask = pset1<Packet>(static_cast<Scalar>(-0.0));
+  Packet rint_a = generic_rint(a);
+  // if rint(a) < a, then rint(a) == floor(a)
+  Packet mask = pcmp_lt(rint_a, a);
+  Packet offset = pand(cst_1, mask);
+  Packet result = padd(rint_a, offset);
+  // Signed zero must remain signed (e.g. ceil(-0.02) == -0).
+  result = por(result, pand(sign_mask, a));
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) {
+  Packet abs_a = pabs(a);
+  Packet sign_a = pandnot(a, abs_a);
+  Packet floor_abs_a = generic_floor(abs_a);
+  Packet result = por(floor_abs_a, sign_a);
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  const Packet cst_half = pset1<Packet>(Scalar(0.5));
+  const Packet cst_1 = pset1<Packet>(Scalar(1));
+  Packet abs_a = pabs(a);
+  Packet sign_a = pandnot(a, abs_a);
+  Packet floor_abs_a = generic_floor(abs_a);
+  Packet diff = psub(abs_a, floor_abs_a);
+  Packet mask = pcmp_le(cst_half, diff);
+  Packet offset = pand(cst_1, mask);
+  Packet result = padd(floor_abs_a, offset);
+  result = por(result, sign_a);
+  return result;
+}
+
+template <typename Packet>
+struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ false> {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  static_assert(packet_traits<Scalar>::HasRound, "Generic nearest integer functions are disabled for this type.");
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return generic_floor(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return generic_ceil(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return generic_rint(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return generic_round(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return generic_trunc(x); }
+};
+
+template <typename Packet>
+struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return x; }
+};
+
 }  // end namespace internal
 }  // end namespace Eigen
 
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 9560de2..673954e 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -21,7 +21,7 @@ namespace internal {
 // This is needed to workaround a circular dependency.
 
 /***************************************************************************
- * Some generic implementations to be used by implementors
+ * Some generic implementations to be used by implementers
  ***************************************************************************/
 
 /** Default implementation of pfrexp.
@@ -42,6 +42,26 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(
 template <typename Packet>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent);
 
+// Explicitly multiplies
+//    a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent);
+
+/** \internal \returns cbrt(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x_in);
+
+/** \internal \returns cbrt(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x_in);
+
 /** \internal \returns log(x) for single precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
@@ -60,11 +80,19 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Pa
 
 /** \internal \returns log(1 + x) */
 template <typename Packet>
-Packet generic_plog1p(const Packet& x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x);
 
 /** \internal \returns exp(x)-1 */
 template <typename Packet>
-Packet generic_expm1(const Packet& x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x);
+
+/** \internal \returns atan(x) */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x);
+
+/** \internal \returns exp2(x) */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& x);
 
 /** \internal \returns exp(x) for single precision float */
 template <typename Packet>
@@ -82,30 +110,38 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Pack
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
 
-/** \internal \returns asin(x) for single precision float */
+/** \internal \returns sin(x) for double precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x);
 
-/** \internal \returns acos(x) for single precision float */
+/** \internal \returns cos(x) for double precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x);
 
-/** \internal \returns atan(x) for single precision float */
+/** \internal \returns asin(x) for single precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet& x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
 
-/** \internal \returns atan(x) for double precision float */
+/** \internal \returns acos(x) for single precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x);
 
 /** \internal \returns tanh(x) for single precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptanh_float(const Packet& x);
 
+/** \internal \returns tanh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptanh_double(const Packet& x);
+
 /** \internal \returns atanh(x) for single precision float */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x);
 
+/** \internal \returns atanh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x);
+
 /** \internal \returns sqrt(x) for complex types */
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a);
@@ -125,6 +161,21 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Pa
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& x);
 
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
+
 // Macros for instantiating these generic functions for different backends.
 #define EIGEN_PACKET_FUNCTION(METHOD, SCALAR, PACKET)                                             \
   template <>                                                                                     \
@@ -132,34 +183,43 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Pa
     return p##METHOD##_##SCALAR(_x);                                                              \
   }
 
+// Macro for specializing p<METHOD> for a backend packet type by forwarding to generic_<METHOD>.
+#define EIGEN_GENERIC_PACKET_FUNCTION(METHOD, PACKET)                                             \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET p##METHOD<PACKET>(const PACKET& _x) { \
+    return generic_##METHOD(_x);                                                                  \
+  }
+
 #define EIGEN_FLOAT_PACKET_FUNCTION(METHOD, PACKET) EIGEN_PACKET_FUNCTION(METHOD, float, PACKET)
 #define EIGEN_DOUBLE_PACKET_FUNCTION(METHOD, PACKET) EIGEN_PACKET_FUNCTION(METHOD, double, PACKET)
 
-#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET)                                     \
-  EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET)                                                     \
-  EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET)                                                     \
-  EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET)                                                    \
-  EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET)                                                    \
-  EIGEN_FLOAT_PACKET_FUNCTION(atan, PACKET)                                                    \
-  EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET)                                                    \
-  EIGEN_FLOAT_PACKET_FUNCTION(atanh, PACKET)                                                   \
-  EIGEN_FLOAT_PACKET_FUNCTION(log, PACKET)                                                     \
-  EIGEN_FLOAT_PACKET_FUNCTION(log2, PACKET)                                                    \
-  EIGEN_FLOAT_PACKET_FUNCTION(exp, PACKET)                                                     \
-  template <>                                                                                  \
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET pexpm1<PACKET>(const PACKET& _x) { \
-    return internal::generic_expm1(_x);                                                        \
-  }                                                                                            \
-  template <>                                                                                  \
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET plog1p<PACKET>(const PACKET& _x) { \
-    return internal::generic_plog1p(_x);                                                       \
-  }
+#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET) \
+  EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(atanh, PACKET)               \
+  EIGEN_FLOAT_PACKET_FUNCTION(log, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(log2, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(exp, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(cbrt, PACKET)                \
+  EIGEN_GENERIC_PACKET_FUNCTION(expm1, PACKET)             \
+  EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)              \
+  EIGEN_GENERIC_PACKET_FUNCTION(log1p, PACKET)             \
+  EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)
 
 #define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \
-  EIGEN_DOUBLE_PACKET_FUNCTION(atan, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(atanh, PACKET)               \
   EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET)                 \
   EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET)                \
-  EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)
+  EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(tanh, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, PACKET)                \
+  EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)               \
+  EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)
 
 }  // end namespace internal
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 17d534d..210dfff 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -37,21 +37,23 @@
 // IWYU pragma: private
 #include "../../InternalHeaderCheck.h"
 
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
 // When compiling with GPU support, the "__half_raw" base class as well as
 // some other routines are defined in the GPU compiler header files
 // (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr
 // As a consequence, we get compile failures when compiling Eigen with
 // GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
-// Eigen with GPU support
-#pragma push_macro("EIGEN_CONSTEXPR")
-#undef EIGEN_CONSTEXPR
-#define EIGEN_CONSTEXPR
+// Eigen with GPU support.
+// Any functions that require `numext::bit_cast` may also not be constexpr,
+// including any native types when setting via raw bit values.
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#define _EIGEN_MAYBE_CONSTEXPR
+#else
+#define _EIGEN_MAYBE_CONSTEXPR constexpr
 #endif
 
 #define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)                                                  \
   template <>                                                                                              \
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
+  EIGEN_UNUSED EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
     return float2half(METHOD<PACKET_F>(half2float(_x)));                                                   \
   }
 
@@ -81,8 +83,10 @@ namespace half_impl {
 // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
 // this error, and hence the following convoluted #if condition
 #if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
+  struct construct_from_rep_tag {};
 #if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
   // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
   // The element type for shared memory cannot have non-trivial constructors
@@ -91,43 +95,53 @@ struct __half_raw {
   // hence the need for this
   EIGEN_DEVICE_FUNC __half_raw() {}
 #else
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw() : x(0) {}
 #endif
+
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, __fp16 rep) : x{rep} {}
   __fp16 x;
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<_Float16>(raw)) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, _Float16 rep) : x{rep} {}
+  _Float16 x;
 #else
-  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
+  explicit EIGEN_DEVICE_FUNC constexpr __half_raw(numext::uint16_t raw) : x(raw) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, numext::uint16_t rep) : x{rep} {}
   numext::uint16_t x;
 #endif
 };
 
 #elif defined(EIGEN_HAS_HIP_FP16)
-// Nothing to do here
+// HIP GPU compile phase: nothing to do here.
 // HIP fp16 header file has a definition for __half_raw
 #elif defined(EIGEN_HAS_CUDA_FP16)
+
+// CUDA GPU compile phase.
 #if EIGEN_CUDA_SDK_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
 typedef __half __half_raw;
 #endif  // defined(EIGEN_HAS_CUDA_FP16)
+
 #elif defined(SYCL_DEVICE_ONLY)
 typedef cl::sycl::half __half_raw;
 #endif
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
 
 struct half_base : public __half_raw {
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {}
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
 
 #if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
 #elif defined(EIGEN_HAS_CUDA_FP16)
 #if EIGEN_CUDA_SDK_VER >= 90000
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 #endif
 #endif
@@ -156,21 +170,29 @@ struct half : public half_impl::half_base {
 #endif
 #endif
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
 
 #if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #elif defined(EIGEN_HAS_CUDA_FP16)
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #endif
 #endif
 #endif
 
-  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
+      : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(_Float16 b)
+      : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#endif
+
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(bool b)
       : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   template <class T>
   explicit EIGEN_DEVICE_FUNC half(T val)
@@ -201,93 +223,99 @@ struct half : public half_impl::half_base {
 namespace half_impl {
 template <typename = void>
 struct numeric_limits_half_impl {
-  static EIGEN_CONSTEXPR const bool is_specialized = true;
-  static EIGEN_CONSTEXPR const bool is_signed = true;
-  static EIGEN_CONSTEXPR const bool is_integer = false;
-  static EIGEN_CONSTEXPR const bool is_exact = false;
-  static EIGEN_CONSTEXPR const bool has_infinity = true;
-  static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
-  static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
-  static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present;
-  static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
-  static EIGEN_CONSTEXPR const std::float_round_style round_style = std::round_to_nearest;
-  static EIGEN_CONSTEXPR const bool is_iec559 = true;
+  static constexpr const bool is_specialized = true;
+  static constexpr const bool is_signed = true;
+  static constexpr const bool is_integer = false;
+  static constexpr const bool is_exact = false;
+  static constexpr const bool has_infinity = true;
+  static constexpr const bool has_quiet_NaN = true;
+  static constexpr const bool has_signaling_NaN = true;
+  EIGEN_DIAGNOSTICS(push)
+  EIGEN_DISABLE_DEPRECATED_WARNING
+  static constexpr const std::float_denorm_style has_denorm = std::denorm_present;
+  static constexpr const bool has_denorm_loss = false;
+  EIGEN_DIAGNOSTICS(pop)
+  static constexpr const std::float_round_style round_style = std::round_to_nearest;
+  static constexpr const bool is_iec559 = true;
   // The C++ standard defines this as "true if the set of values representable
   // by the type is finite." Half has finite precision.
-  static EIGEN_CONSTEXPR const bool is_bounded = true;
-  static EIGEN_CONSTEXPR const bool is_modulo = false;
-  static EIGEN_CONSTEXPR const int digits = 11;
-  static EIGEN_CONSTEXPR const int digits10 =
+  static constexpr const bool is_bounded = true;
+  static constexpr const bool is_modulo = false;
+  static constexpr const int digits = 11;
+  static constexpr const int digits10 =
       3;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static EIGEN_CONSTEXPR const int max_digits10 =
+  static constexpr const int max_digits10 =
       5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix;
-  static EIGEN_CONSTEXPR const int min_exponent = -13;
-  static EIGEN_CONSTEXPR const int min_exponent10 = -4;
-  static EIGEN_CONSTEXPR const int max_exponent = 16;
-  static EIGEN_CONSTEXPR const int max_exponent10 = 4;
-  static EIGEN_CONSTEXPR const bool traps = std::numeric_limits<float>::traps;
+  static constexpr const int radix = std::numeric_limits<float>::radix;
+  static constexpr const int min_exponent = -13;
+  static constexpr const int min_exponent10 = -4;
+  static constexpr const int max_exponent = 16;
+  static constexpr const int max_exponent10 = 4;
+  static constexpr const bool traps = std::numeric_limits<float>::traps;
   // IEEE754: "The implementer shall choose how tininess is detected, but shall
   // detect tininess in the same way for all operations in radix two"
-  static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
-
-  static EIGEN_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
-  static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
-  static EIGEN_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
-  static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
-  static EIGEN_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
-  static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
-  static EIGEN_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
-  static EIGEN_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
-  static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
+  static constexpr const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
+
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
 };
 
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_specialized;
+constexpr const bool numeric_limits_half_impl<T>::is_specialized;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_signed;
+constexpr const bool numeric_limits_half_impl<T>::is_signed;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_integer;
+constexpr const bool numeric_limits_half_impl<T>::is_integer;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_exact;
+constexpr const bool numeric_limits_half_impl<T>::is_exact;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_infinity;
+constexpr const bool numeric_limits_half_impl<T>::has_infinity;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_quiet_NaN;
+constexpr const bool numeric_limits_half_impl<T>::has_quiet_NaN;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_signaling_NaN;
+constexpr const bool numeric_limits_half_impl<T>::has_signaling_NaN;
+EIGEN_DIAGNOSTICS(push)
+EIGEN_DISABLE_DEPRECATED_WARNING
 template <typename T>
-EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
+constexpr const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_denorm_loss;
+constexpr const bool numeric_limits_half_impl<T>::has_denorm_loss;
+EIGEN_DIAGNOSTICS(pop)
 template <typename T>
-EIGEN_CONSTEXPR const std::float_round_style numeric_limits_half_impl<T>::round_style;
+constexpr const std::float_round_style numeric_limits_half_impl<T>::round_style;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_iec559;
+constexpr const bool numeric_limits_half_impl<T>::is_iec559;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_bounded;
+constexpr const bool numeric_limits_half_impl<T>::is_bounded;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_modulo;
+constexpr const bool numeric_limits_half_impl<T>::is_modulo;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits;
+constexpr const int numeric_limits_half_impl<T>::digits;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits10;
+constexpr const int numeric_limits_half_impl<T>::digits10;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_digits10;
+constexpr const int numeric_limits_half_impl<T>::max_digits10;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::radix;
+constexpr const int numeric_limits_half_impl<T>::radix;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent;
+constexpr const int numeric_limits_half_impl<T>::min_exponent;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent10;
+constexpr const int numeric_limits_half_impl<T>::min_exponent10;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent;
+constexpr const int numeric_limits_half_impl<T>::max_exponent;
 template <typename T>
-EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent10;
+constexpr const int numeric_limits_half_impl<T>::max_exponent10;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::traps;
+constexpr const bool numeric_limits_half_impl<T>::traps;
 template <typename T>
-EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::tinyness_before;
+constexpr const bool numeric_limits_half_impl<T>::tinyness_before;
 }  // end namespace half_impl
 }  // end namespace Eigen
 
@@ -313,9 +341,8 @@ namespace half_impl {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
-// fp16 type since GPU halfs are rather different from native CPU halfs.
-// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16
-#define EIGEN_HAS_NATIVE_FP16
+// fp16 type since GPU half types are rather different from native CPU half types.
+#define EIGEN_HAS_NATIVE_GPU_FP16
 #endif
 
 // Intrinsics for native fp16 support. Note that on current hardware,
@@ -323,7 +350,7 @@ namespace half_impl {
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
 
-#if defined(EIGEN_HAS_NATIVE_FP16)
+#if defined(EIGEN_HAS_NATIVE_GPU_FP16)
 EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   return __hadd(::__half(a), ::__half(b));
@@ -365,7 +392,8 @@ EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { re
 EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); }
 EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); }
 EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); }
-#endif
+
+#endif  // EIGEN_HAS_NATIVE_GPU_FP16
 
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); }
@@ -395,16 +423,47 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half&
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); }
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(a.x + b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(a.x * b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(a.x - b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(a.x / b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(-a.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return a.x == b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return a.x != b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return a.x < b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return a.x <= b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return a.x > b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return a.x >= b.x; }
+
 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
 // of the functions, while the latter can only deal with one of them.
-#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for half floats
+#elif !defined(EIGEN_HAS_NATIVE_GPU_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for half floats
 
 #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -438,20 +497,61 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
   a = half(float(a) / float(b));
   return a;
 }
+
+// Non-negative floating point numbers have a monotonic mapping to non-negative integers.
+// This property allows floating point numbers to be reinterpreted as integers for comparisons, which is useful if there
+// is no native floating point comparison operator. Floating point signedness is handled by the sign-magnitude
+// representation, whereas integers typically use two's complement. Converting the bit pattern from sign-magnitude to
+// two's complement allows the transformed bit patterns to be compared as signed integers. All edge cases (+/-0 and +/-
+// infinity) are handled automatically, except NaN.
+//
+// fp16 uses 1 sign bit, 5 exponent bits, and 10 mantissa bits. The bit pattern conveys NaN when all the exponent
+// bits (5) are set, and at least one mantissa bit is set. The sign bit is irrelevant for determining NaN. To check for
+// NaN, clear the sign bit and check if the integral representation is greater than 0111110000000000 (0x7c00). To test
+// for non-NaN, clear the sign bit and check if the integral representation is less than or equal to 0x7c00.
+
+// convert sign-magnitude representation to two's complement
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int16_t mapToSigned(uint16_t a) {
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  // If the sign bit is set, clear the sign bit and return the (integer) negation. Otherwise, return the input.
+  return (a >> 15) ? -(a & kAbsMask) : a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool isOrdered(const half& a, const half& b) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  return numext::maxi(a.x & kAbsMask, b.x & kAbsMask) <= kInf;
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
-  return numext::equal_strict(float(a), float(b));
+  bool result = mapToSigned(a.x) == mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !(a == b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) < mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) <= mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) {
-  return numext::not_equal_strict(float(a), float(b));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) > mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) >= mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return float(a) < float(b); }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return float(a) <= float(b); }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return float(a) > float(b); }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return float(a) >= float(b); }
 
 #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 #pragma pop_macro("EIGEN_DEVICE_FUNC")
 #endif
+
 #endif  // Emulate support for half floats
 
 // Division by an index. Do it in full float precision to avoid accuracy
@@ -487,7 +587,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) {
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
   // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type
   // in the hip_fp16 header file, and that will trigger a compile error
   // On the other hand, having anything but a return statement also triggers a compile error
@@ -509,6 +609,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
   // For SYCL, cl::sycl::half is _Float16, so cast directly.
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return numext::bit_cast<numext::uint16_t>(h.x);
 #elif defined(SYCL_DEVICE_ONLY)
   return numext::bit_cast<numext::uint16_t>(h);
 #else
@@ -516,17 +618,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
 #endif
 }
 
-union float32_bits {
-  unsigned int u;
-  float f;
-};
-
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
 
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  __half_raw h;
+  h.x = static_cast<__fp16>(ff);
+  return h;
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  __half_raw h;
+  h.x = static_cast<_Float16>(ff);
+  return h;
+
 #elif defined(EIGEN_HAS_FP16_C)
   __half_raw h;
 #if EIGEN_COMP_MSVC
@@ -537,52 +644,46 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 #endif
   return h;
 
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  __half_raw h;
-  h.x = static_cast<__fp16>(ff);
-  return h;
-
 #else
-  float32_bits f;
-  f.f = ff;
-
-  const float32_bits f32infty = {255 << 23};
-  const float32_bits f16max = {(127 + 16) << 23};
-  const float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
-  unsigned int sign_mask = 0x80000000u;
+  uint32_t f_bits = Eigen::numext::bit_cast<uint32_t>(ff);
+  const uint32_t f32infty_bits = {255 << 23};
+  const uint32_t f16max_bits = {(127 + 16) << 23};
+  const uint32_t denorm_magic_bits = {((127 - 15) + (23 - 10) + 1) << 23};
+  const uint32_t sign_mask = 0x80000000u;
   __half_raw o;
-  o.x = static_cast<numext::uint16_t>(0x0u);
+  o.x = static_cast<uint16_t>(0x0u);
 
-  unsigned int sign = f.u & sign_mask;
-  f.u ^= sign;
+  const uint32_t sign = f_bits & sign_mask;
+  f_bits ^= sign;
 
   // NOTE all the integer compares in this function can be safely
   // compiled into signed compares since all operands are below
   // 0x80000000. Important if you want fast straight SSE2 code
   // (since there's no unsigned PCMPGTD).
 
-  if (f.u >= f16max.u) {                         // result is Inf or NaN (all exponent bits set)
-    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
-  } else {                                       // (De)normalized number or zero
-    if (f.u < (113 << 23)) {                     // resulting FP16 is subnormal or zero
+  if (f_bits >= f16max_bits) {                         // result is Inf or NaN (all exponent bits set)
+    o.x = (f_bits > f32infty_bits) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
+  } else {                                             // (De)normalized number or zero
+    if (f_bits < (113 << 23)) {                        // resulting FP16 is subnormal or zero
       // use a magic value to align our 10 mantissa bits at the bottom of
       // the float. as long as FP addition is round-to-nearest-even this
       // just works.
-      f.f += denorm_magic.f;
+      f_bits = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::bit_cast<float>(f_bits) +
+                                                 Eigen::numext::bit_cast<float>(denorm_magic_bits));
 
       // and one integer subtract of the bias later, we have our final float!
-      o.x = static_cast<numext::uint16_t>(f.u - denorm_magic.u);
+      o.x = static_cast<numext::uint16_t>(f_bits - denorm_magic_bits);
     } else {
-      unsigned int mant_odd = (f.u >> 13) & 1;  // resulting mantissa is odd
+      const uint32_t mant_odd = (f_bits >> 13) & 1;  // resulting mantissa is odd
 
       // update exponent, rounding bias part 1
       // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
       // without arithmetic overflow.
-      f.u += 0xc8000fffU;
+      f_bits += 0xc8000fffU;
       // rounding bias part 2
-      f.u += mant_odd;
+      f_bits += mant_odd;
       // take the bits!
-      o.x = static_cast<numext::uint16_t>(f.u >> 13);
+      o.x = static_cast<numext::uint16_t>(f_bits >> 13);
     }
   }
 
@@ -595,6 +696,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __half2float(h);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return static_cast<float>(h.x);
 #elif defined(EIGEN_HAS_FP16_C)
 #if EIGEN_COMP_MSVC
   // MSVC does not have scalar instructions.
@@ -602,34 +705,31 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
 #else
   return _cvtsh_ss(h.x);
 #endif
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  return static_cast<float>(h.x);
 #else
-  const float32_bits magic = {113 << 23};
-  const unsigned int shifted_exp = 0x7c00 << 13;  // exponent mask after shift
-  float32_bits o;
-
-  o.u = (h.x & 0x7fff) << 13;            // exponent/mantissa bits
-  unsigned int exp = shifted_exp & o.u;  // just the exponent
-  o.u += (127 - 15) << 23;               // exponent adjust
+  const float magic = Eigen::numext::bit_cast<float>(static_cast<uint32_t>(113 << 23));
+  const uint32_t shifted_exp = 0x7c00 << 13;  // exponent mask after shift
+  uint32_t o_bits = (h.x & 0x7fff) << 13;     // exponent/mantissa bits
+  const uint32_t exp = shifted_exp & o_bits;  // just the exponent
+  o_bits += (127 - 15) << 23;                 // exponent adjust
 
   // handle exponent special cases
-  if (exp == shifted_exp) {   // Inf/NaN?
-    o.u += (128 - 16) << 23;  // extra exp adjust
-  } else if (exp == 0) {      // Zero/Denormal?
-    o.u += 1 << 23;           // extra exp adjust
-    o.f -= magic.f;           // renormalize
+  if (exp == shifted_exp) {      // Inf/NaN?
+    o_bits += (128 - 16) << 23;  // extra exp adjust
+  } else if (exp == 0) {         // Zero/Denormal?
+    o_bits += 1 << 23;           // extra exp adjust
+    // renormalize
+    o_bits = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::bit_cast<float>(o_bits) - magic);
   }
 
-  o.u |= (h.x & 0x8000) << 16;  // sign bit
-  return o.f;
+  o_bits |= (h.x & 0x8000) << 16;  // sign bit
+  return Eigen::numext::bit_cast<float>(o_bits);
 #endif
 }
 
 // --- standard functions ---
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
-#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
 #else
   return (a.x & 0x7fff) == 0x7c00;
@@ -639,19 +739,28 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hisnan(a);
-#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
 #else
   return (a.x & 0x7fff) > 0x7c00;
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
-  return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) < 0x7c00;
+#else
+  return (a.x & 0x7fff) < 0x7c00;
+#endif
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   return half(vabsh_f16(a.x));
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  half result;
+  result.x =
+      numext::bit_cast<_Float16>(static_cast<numext::uint16_t>(numext::bit_cast<numext::uint16_t>(a.x) & 0x7FFF));
+  return result;
 #else
   half result;
   result.x = a.x & 0x7FFF;
@@ -666,12 +775,20 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
   return half(::expf(float(a)));
 #endif
 }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {  // base-2 exponential of a half value
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hexp2(a));  // CUDA (SDK_VER >= 80000, arch >= 530) or HIP device compile: call hexp2 on the half
+#else
+  return half(::exp2f(float(a)));  // otherwise widen to float, call exp2f, and narrow the result back to half
+#endif
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
      EIGEN_CUDA_ARCH >= 530) ||                                                                 \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-  return half(::hlog(a));
+  return half(hlog(a));
 #else
   return half(::logf(float(a)));
 #endif
@@ -722,28 +839,24 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { return half(::rintf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { return half(::roundf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half trunc(const half& a) { return half(::truncf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
   return half(::fmodf(float(a), float(b)));
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-  return __hlt(b, a) ? b : a;
-#else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f2 < f1 ? b : a;
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-  return __hlt(a, b) ? b : a;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { return b < a ? b : a; }
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { return a < b ? b : a; }
+
+EIGEN_DEVICE_FUNC inline half fma(const half& a, const half& b, const half& c) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return half(vfmah_f16(c.x, a.x, b.x));
+#elif defined(EIGEN_VECTORIZE_AVX512FP16)
+  // Reduces to vfmadd213sh.
+  return half(_mm_cvtsh_h(_mm_fmadd_ph(_mm_set_sh(a.x), _mm_set_sh(b.x), _mm_set_sh(c.x))));
 #else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f1 < f2 ? b : a;
+  // Emulate FMA via float.
+  return half(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
 #endif
 }
 
@@ -786,31 +899,29 @@ template <>
 struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
   enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
     return half_impl::raw_uint16_to_half(0x0800);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
     return half_impl::raw_uint16_to_half(0x211f);  //  Eigen::half(1e-2f);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
     return half_impl::raw_uint16_to_half(0x7bff);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
     return half_impl::raw_uint16_to_half(0xfbff);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
     return half_impl::raw_uint16_to_half(0x7c00);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
     return half_impl::raw_uint16_to_half(0x7e00);
   }
 };
 
 }  // end namespace Eigen
 
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-#pragma pop_macro("EIGEN_CONSTEXPR")
-#endif
+#undef _EIGEN_MAYBE_CONSTEXPR
 
 namespace Eigen {
 namespace numext {
@@ -844,6 +955,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(c
   return Eigen::half_impl::raw_half_as_uint16(src);
 }
 
+// Eigen::half multiply-add (x * y + z): evaluated in float to match the packet operations and limit half<->float conversions.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen::half& x, const Eigen::half& y, const Eigen::half& z) {
+  return Eigen::half(float(x) * float(y) + float(z));
+}
+
 }  // namespace numext
 }  // namespace Eigen
 
diff --git a/Eigen/src/Core/arch/GPU/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h
index 606215f..81bc8bb 100644
--- a/Eigen/src/Core/arch/GPU/MathFunctions.h
+++ b/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -53,6 +53,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexp<double2>(const double2& a) {
   return make_double2(exp(a.x), exp(a.y));
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexp2<float4>(const float4& a) {  // packet base-2 exponential
+  return make_float4(exp2f(a.x), exp2f(a.y), exp2f(a.z), exp2f(a.w));  // exp2f applied to each of the four lanes
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexp2<double2>(const double2& a) {  // packet base-2 exponential
+  using ::exp2;  // name the function actually called below so the global exp2 is the one selected
+  return make_double2(exp2(a.x), exp2(a.y));  // exp2 applied to each of the two lanes
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexpm1<float4>(const float4& a) {
   return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 7900b0e..328b1b9 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -31,6 +31,15 @@ namespace internal {
 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
 #endif
 
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
+#else
+#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
+#endif
+
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
@@ -77,6 +86,7 @@ struct packet_traits<float> : default_packet_traits {
 
     HasBlend = 0,
     HasFloor = 1,
+    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
   };
 };
 
@@ -107,9 +117,7 @@ struct packet_traits<double> : default_packet_traits {
     HasGammaSampleDerAlpha = 1,
     HasIGammac = 1,
     HasBetaInc = 1,
-
     HasBlend = 0,
-    HasFloor = 1,
   };
 };
 
@@ -147,10 +155,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from)
   return make_double2(from, from);
 }
 
-// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
-// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
-// of the functions, while the latter can only deal with one of them.
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+#if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
   return __int_as_float(__float_as_int(a) & __float_as_int(b));
@@ -263,8 +268,7 @@ template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
   return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
 }
-#endif  // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG &&
-        // !EIGEN_COMP_NVCC)
+#endif  // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
@@ -519,6 +523,33 @@ EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
   return make_double2(floor(a.x), floor(a.y));
 }
 
+template <>
+EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {  // ceilf applied to each of the four lanes
+  return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {  // ceil applied to each of the two lanes
+  return make_double2(ceil(a.x), ceil(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {  // "p-rint": rintf applied to each of the four lanes
+  return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {  // "p-rint": rint applied to each of the two lanes
+  return make_double2(rint(a.x), rint(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {  // truncf applied to each of the four lanes
+  return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {  // trunc applied to each of the two lanes
+  return make_double2(trunc(a.x), trunc(a.y));
+}
+
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
   float tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
diff --git a/Eigen/src/Core/arch/GPU/Tuple.h b/Eigen/src/Core/arch/GPU/Tuple.h
index 6bea9ac..402d92f 100644
--- a/Eigen/src/Core/arch/GPU/Tuple.h
+++ b/Eigen/src/Core/arch/GPU/Tuple.h
@@ -34,7 +34,7 @@ class TupleImpl<N, T1, Ts...> {
   template <typename U1 = T1,
             typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value &&
                                                  reduce_all<std::is_default_constructible<Ts>::value...>::value>>
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
+  constexpr EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
 
   // Element constructor.
   template <typename U1, typename... Us,
@@ -44,7 +44,7 @@ class TupleImpl<N, T1, Ts...> {
                 sizeof...(Us) == sizeof...(Ts) && (
                                                       // this does not look like a copy/move constructor.
                                                       N > 1 || std::is_convertible<U1, T1>::value)>>
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
+  constexpr EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
       : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
 
   // The first stored value.
@@ -102,11 +102,11 @@ struct tuple_get_impl {
   using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
   using ReturnType = typename tuple_get_impl<Idx - 1, Ts...>::ReturnType;
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
     return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
   }
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
     return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
   }
 };
@@ -117,11 +117,9 @@ struct tuple_get_impl<0, T1, Ts...> {
   using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
   using ReturnType = T1;
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) {
-    return tuple.head();
-  }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) { return tuple.head(); }
 };
 
 // Concatenates N Tuples.
@@ -139,11 +137,9 @@ struct tuple_cat_impl<NTuples, TupleImpl<N1, Args1...>, TupleImpl<N2, Args2...>,
   // Uses the index sequences to extract and merge elements from tuple1 and tuple2,
   // then recursively calls again.
   template <typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1,
-                                                                              std::index_sequence<I1s...>,
-                                                                              Tuple2&& tuple2,
-                                                                              std::index_sequence<I2s...>,
-                                                                              MoreTuples&&... tuples) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, std::index_sequence<I1s...>,
+                                                                        Tuple2&& tuple2, std::index_sequence<I2s...>,
+                                                                        MoreTuples&&... tuples) {
     return tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::run(
         MergedTupleType(tuple_get_impl<I1s, Args1...>::run(std::forward<Tuple1>(tuple1))...,
                         tuple_get_impl<I2s, Args2...>::run(std::forward<Tuple2>(tuple2))...),
@@ -152,8 +148,8 @@ struct tuple_cat_impl<NTuples, TupleImpl<N1, Args1...>, TupleImpl<N2, Args2...>,
 
   // Concatenates the first two tuples.
   template <typename Tuple1, typename Tuple2, typename... MoreTuples>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
-                                                                              MoreTuples&&... tuples) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
+                                                                        MoreTuples&&... tuples) {
     return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{}, std::forward<Tuple2>(tuple2),
                std::make_index_sequence<N2>{}, std::forward<MoreTuples>(tuples)...);
   }
@@ -165,7 +161,7 @@ struct tuple_cat_impl<1, TupleImpl<N, Args...>> {
   using ReturnType = TupleImpl<N, Args...>;
 
   template <typename Tuple1>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
     return tuple1;
   }
 };
@@ -174,7 +170,7 @@ struct tuple_cat_impl<1, TupleImpl<N, Args...>> {
 template <>
 struct tuple_cat_impl<0> {
   using ReturnType = TupleImpl<0>;
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
 };
 
 // For use in make_tuple, unwraps a reference_wrapper.
@@ -211,13 +207,13 @@ struct tuple_size<TupleImpl<sizeof...(Types), Types...>> : std::integral_constan
  * \return a reference to the desired element.
  */
 template <size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
     const TupleImpl<sizeof...(Types), Types...>& tuple) {
   return tuple_get_impl<Idx, Types...>::run(tuple);
 }
 
 template <size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
     TupleImpl<sizeof...(Types), Types...>& tuple) {
   return tuple_get_impl<Idx, Types...>::run(tuple);
 }
@@ -229,7 +225,7 @@ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Id
  */
 template <typename... Tuples, typename EnableIf = std::enable_if_t<
                                   internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
     tuple_cat(Tuples&&... tuples) {
   return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...);
@@ -239,7 +235,7 @@ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  * Tie arguments together into a tuple.
  */
 template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) EIGEN_NOEXCEPT {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) noexcept {
   return ReturnType{args...};
 }
 
@@ -247,7 +243,7 @@ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... ar
  * Create a tuple of l-values with the supplied arguments.
  */
 template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...>>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
   return ReturnType{std::forward<Args>(args)...};
 }
 
@@ -255,8 +251,7 @@ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args
  * Forward a set of arguments as a tuple.
  */
 template <typename... Args>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(
-    Args&&... args) {
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(Args&&... args) {
   return TupleImpl<sizeof...(Args), Args...>(std::forward<Args>(args)...);
 }
 
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h
index 7e139de..9b6ceb3 100644
--- a/Eigen/src/Core/arch/HVX/PacketMath.h
+++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -161,9 +161,6 @@ struct packet_traits<float> : default_packet_traits {
     HasBlend = 0,
 
     HasDiv = 0,
-    HasFloor = 0,
-    HasCeil = 0,
-    HasRint = 0,
 
     HasSin = 0,
     HasCos = 0,
@@ -402,9 +399,26 @@ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
   return pnegate_hvx(a);
 }
 
+template <HVXPacketSize T>  // NOTE(review): yields the 1.0f pattern per word, not an all-bits-set mask -- confirm intended
+EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>& a) {  // `a` is never read; it only fixes T
+  return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));  // 0x3f800000 is the IEEE-754 binary32 encoding of 1.0f
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {  // forwards to the size-generic ptrue_hvx
+  return ptrue_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {  // forwards to the size-generic ptrue_hvx
+  return ptrue_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {  // forwards to the size-generic ptrue_hvx
+  return ptrue_hvx(a);
+}
+
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
 }
@@ -423,7 +437,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
 
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
 }
@@ -442,7 +456,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
 
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
 }
@@ -461,7 +475,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
 
 template <HVXPacketSize T>
 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_Vector v_true = ptrue(a).Get();
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
   return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
 }
diff --git a/Eigen/src/Core/arch/LSX/Complex.h b/Eigen/src/Core/arch/LSX/Complex.h
new file mode 100644
index 0000000..0b60a83
--- /dev/null
+++ b/Eigen/src/Core/arch/LSX/Complex.h
@@ -0,0 +1,520 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
+// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_LSX_H
+#define EIGEN_COMPLEX_LSX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf {
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
+  Packet4f v;
+};
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasExp = 1,
+    HasAbs = 0,
+    HasLog = 1,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(__lsx_vfadd_s(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(__lsx_vfsub_s(a.v, b.v));
+}
+
+// Negates a complex packet by flipping the sign bit of every 32-bit component.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  const uint32_t sign_bits[4] = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};
+  Packet4i sign_mask = (Packet4i)__lsx_vld(sign_bits, 0);
+  const __m128i flipped = __lsx_vxor_v((__m128i)a.v, sign_mask);
+  return Packet2cf((Packet4f)flipped);
+}
+
+// Complex conjugate: flips the sign bit of 32-bit components 1 and 3 (the imaginary parts) only.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  const uint32_t imag_sign_bits[4] = {0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u};
+  Packet4i imag_mask = (__m128i)__lsx_vld(imag_sign_bits, 0);
+  const __m128i flipped = __lsx_vxor_v((__m128i)a.v, imag_mask);
+  return Packet2cf((Packet4f)flipped);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f part0_tmp = (Packet4f)__lsx_vfmul_s(a.v, b.v);
+  Packet4f part0 = __lsx_vfsub_s(part0_tmp, (__m128)__lsx_vshuf4i_w(part0_tmp, 0x31));
+  Packet4f part1_tmp = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(a.v, 0xb1), b.v);
+  Packet4f part1 = __lsx_vfadd_s(part1_tmp, (__m128)__lsx_vshuf4i_w(part1_tmp, 0x31));
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vpackev_w((__m128i)part1, (__m128i)part0);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ptrue<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(ptrue(Packet4f(a.v)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  // Bitwise AND over the raw 128-bit register contents.
+  const __m128i bits = __lsx_vand_v((__m128i)a.v, (__m128i)b.v);
+  return Packet2cf((Packet4f)bits);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  // Bitwise OR over the raw 128-bit register contents.
+  const __m128i bits = __lsx_vor_v((__m128i)a.v, (__m128i)b.v);
+  return Packet2cf((Packet4f)bits);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  // Bitwise XOR over the raw 128-bit register contents.
+  const __m128i bits = __lsx_vxor_v((__m128i)a.v, (__m128i)b.v);
+  return Packet2cf((Packet4f)bits);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  // Operands go in as (b, a). NOTE(review): presumably vandn complements its first operand (a & ~b) - confirm.
+  const __m128i bits = __lsx_vandn_v((__m128i)b.v, (__m128i)a.v);
+  return Packet2cf((Packet4f)bits);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  const float real_part = from.real(), imag_part = from.imag();
+  const Packet4f all_real = {real_part, real_part, real_part, real_part};
+  const Packet4f all_imag = {imag_part, imag_part, imag_part, imag_part};
+  return Packet2cf((Packet4f)__lsx_vilvl_w((__m128i)all_imag, (__m128i)all_real));  // interleave the two splats
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v));
+}
+
+// Gathers the complex values stored at from[0] and from[stride] into a single packet.
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  // Each 64-bit replicating load picks up one complete std::complex<float>.
+  const __m128i first = __lsx_vldrepl_d(from, 0);
+  const __m128i second = __lsx_vldrepl_d(from + stride, 0);
+  const __m128i merged = __lsx_vilvl_d(second, first);
+  return Packet2cf((__m128)merged);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  __lsx_vstelm_d((__m128i)from.v, to, 0, 0);
+  __lsx_vstelm_d((__m128i)from.v, to + stride, 0, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  __lsx_vst(a.v, res, 0);
+  return res[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  // 32-bit word shuffle with immediate 0x4e, re-wrapped as a complex packet.
+  const __m128i shuffled = __lsx_vshuf4i_w(a.v, 0x4e);
+  return Packet2cf((Packet4f)shuffled);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  return pfirst(Packet2cf(__lsx_vfadd_s(a.v, vec4f_movehl(a.v, a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  return pfirst(pmul(a, Packet2cf(vec4f_movehl(a.v, a.v))));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /* <Packet2cf> */ (const Packet2cf& x) {
+  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
+  return plog_complex(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pzero(const Packet2cf& /* a */) {
+  const __m128 zeros = {0.0f, 0.0f, 0.0f, 0.0f};  // the argument is only used for overload selection
+  return Packet2cf(zeros);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmadd<Packet2cf>(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  Packet2cf result, t0, t1, t2;
+  t1 = pzero(t1);
+  t0.v = (__m128)__lsx_vpackev_w((__m128i)a.v, (__m128i)a.v);
+  t2.v = __lsx_vfmadd_s(t0.v, b.v, c.v);
+  result.v = __lsx_vfadd_s(t2.v, t1.v);
+  t1.v = __lsx_vfsub_s(t1.v, a.v);
+  t1.v = (__m128)__lsx_vpackod_w((__m128i)a.v, (__m128i)t1.v);
+  t2.v = (__m128)__lsx_vshuf4i_w((__m128i)b.v, 0xb1);
+  result.v = __lsx_vfmadd_s(t1.v, t2.v, result.v);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
+  return pexp_complex(a);
+}
+
+//---------- double ----------
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
+  Packet2d v;
+};
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasLog = 1,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(__lsx_vfadd_d(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(__lsx_vfsub_d(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+
+// Complex conjugate: flips the sign bit of the second 64-bit component (the imaginary part) only.
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  const uint64_t imag_sign_bit[2] = {0x0000000000000000u, 0x8000000000000000u};
+  const __m128i imag_mask = __lsx_vld(imag_sign_bit, 0);
+  const __m128i flipped = __lsx_vxor_v((__m128i)a.v, imag_mask);
+  return Packet1cd((Packet2d)flipped);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  // Element-wise products; subtracting the lane-reversed copy leaves the real part in lane 0.
+  const Packet2d straight = __lsx_vfmul_d(a.v, b.v);
+  const Packet2d real_part = __lsx_vfsub_d(straight, preverse(straight));
+  // Products with a reversed; adding the lane-reversed copy leaves the imaginary part in lane 0.
+  const Packet2d crossed = __lsx_vfmul_d(preverse(a.v), b.v);
+  const Packet2d imag_part = __lsx_vfadd_d(crossed, preverse(crossed));
+  const __m128i packed = __lsx_vilvl_d((__m128i)imag_part, (__m128i)real_part);
+  return Packet1cd((__m128d)packed);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ptrue<Packet1cd>(const Packet1cd& a) {
+  return Packet1cd(ptrue(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  // Bitwise AND over the raw 128-bit register contents.
+  const __m128i bits = __lsx_vand_v((__m128i)a.v, (__m128i)b.v);
+  return Packet1cd((Packet2d)bits);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  // Bitwise OR over the raw 128-bit register contents.
+  const __m128i bits = __lsx_vor_v((__m128i)a.v, (__m128i)b.v);
+  return Packet1cd((Packet2d)bits);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  // Bitwise XOR over the raw 128-bit register contents.
+  const __m128i bits = __lsx_vxor_v((__m128i)a.v, (__m128i)b.v);
+  return Packet1cd((Packet2d)bits);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  // Operands go in as (b, a). NOTE(review): presumably vandn complements its first operand (a & ~b) - confirm.
+  const __m128i bits = __lsx_vandn_v((__m128i)b.v, (__m128i)a.v);
+  return Packet1cd((Packet2d)bits);
+}
+
+// FIXME force unaligned load, this is a temporary fix
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
+
+// FIXME force unaligned store, this is a temporary fix
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_ALIGN16 double parts[2];  // parts[0] becomes the real part, parts[1] the imaginary part
+  __lsx_vst(a.v, parts, 0);
+  return std::complex<double>(parts[0], parts[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pdiv_complex(a, b);
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /* <Packet1cd> */ (const Packet1cd& x) {
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  const __m128i row0 = (__m128i)kernel.packet[0].v;
+  const __m128i row1 = (__m128i)kernel.packet[1].v;
+  kernel.packet[0].v = (Packet4f)__lsx_vshuf4i_w(__lsx_vilvl_w(row1, row0), 0xd8);
+  kernel.packet[1].v = (Packet4f)__lsx_vshuf4i_w(__lsx_vilvh_w(row1, row0), 0xd8);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  const Packet4f lane_eq = (Packet4f)__lsx_vfcmp_ceq_s(a.v, b.v);
+  return Packet2cf(pand<Packet4f>(lane_eq, vec4f_swizzle1(lane_eq, 1, 0, 3, 2)));  // AND each lane with its pair partner
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  const Packet2d lane_eq = (Packet2d)__lsx_vfcmp_ceq_d(a.v, b.v);
+  return Packet1cd(pand<Packet2d>(lane_eq, preverse(lane_eq)));  // AND each lane with the other lane's result
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pselect(const Packet2cf& mask, const Packet2cf& a, const Packet2cf& b) {
+  // Bit-select between b and a, steered by the bits of mask.
+  const __m128i picked = __lsx_vbitsel_v((__m128i)b.v, (__m128i)a.v, (__m128i)mask.v);
+  return Packet2cf((Packet4f)picked);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
+  return plog_complex(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pzero<Packet1cd>(const Packet1cd& /* a */) {
+  const __m128d zeros = {0.0, 0.0};  // the argument is only used for overload selection
+  return Packet1cd(zeros);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmadd<Packet1cd>(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  Packet1cd result, t0, t1, t2;
+  t1 = pzero(t1);
+  t0.v = (__m128d)__lsx_vpackev_d((__m128i)a.v, (__m128i)a.v);
+  t2.v = __lsx_vfmadd_d(t0.v, b.v, c.v);
+  result.v = __lsx_vfadd_d(t2.v, t1.v);
+  t1.v = __lsx_vfsub_d(t1.v, a.v);
+  t1.v = (__m128d)__lsx_vpackod_d((__m128i)a.v, (__m128i)t1.v);
+  t2.v = (__m128d)__lsx_vshuf4i_d((__m128i)t2.v, (__m128i)b.v, 0xb);
+  result.v = __lsx_vfmadd_d(t1.v, t2.v, result.v);
+  return result;
+}
+
+// A Packet1cd holds a single std::complex<double>, so the stride argument is ignored.
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index /* stride */) {
+  // One full-register load starting at `from`.
+  const __m128i raw = __lsx_vld((void*)from, 0);
+  return Packet1cd((__m128d)raw);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index /* stride */) {
+  __lsx_vst((__m128i)from.v, (void*)to, 0);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  const __m128i row0 = (__m128i)kernel.packet[0].v, row1 = (__m128i)kernel.packet[1].v;
+  kernel.packet[0].v = (__m128d)__lsx_vilvl_d(row1, row0);
+  kernel.packet[1].v = (__m128d)__lsx_vilvh_d(row1, row0);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_LSX_H
diff --git a/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h
new file mode 100644
index 0000000..4b07062
--- /dev/null
+++ b/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h
@@ -0,0 +1,23 @@
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+#ifndef EIGEN_LSX_GEBP_NR
+#define EIGEN_LSX_GEBP_NR 8
+#endif
+
+template <>
+struct gebp_traits<float, float, false, false, Architecture::LSX, GEBPPacketFull>
+    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
+  enum { nr = EIGEN_LSX_GEBP_NR };
+};
+
+template <>
+struct gebp_traits<double, double, false, false, Architecture::LSX, GEBPPacketFull>
+    : gebp_traits<double, double, false, false, Architecture::Generic, GEBPPacketFull> {
+  enum { nr = EIGEN_LSX_GEBP_NR };
+};
+}  // namespace internal
+}  // namespace Eigen
diff --git a/Eigen/src/Core/arch/LSX/MathFunctions.h b/Eigen/src/Core/arch/LSX/MathFunctions.h
new file mode 100644
index 0000000..cead463
--- /dev/null
+++ b/Eigen/src/Core/arch/LSX/MathFunctions.h
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 XiWei Gu (guxiwei-hf@loongson.cn)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_LSX_H
+#define EIGEN_MATH_FUNCTIONS_LSX_H
+
+/* The sin and cos functions of this file are loosely derived from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet2d)
+
+EIGEN_FLOAT_PACKET_FUNCTION(atanh, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log2, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(tanh, Packet4f)
+
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4f)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4f)
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_LSX_H
diff --git a/Eigen/src/Core/arch/LSX/PacketMath.h b/Eigen/src/Core/arch/LSX/PacketMath.h
new file mode 100644
index 0000000..87232aa
--- /dev/null
+++ b/Eigen/src/Core/arch/LSX/PacketMath.h
@@ -0,0 +1,2866 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
+// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_LSX_H
+#define EIGEN_PACKET_MATH_LSX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if EIGEN_ARCH_LOONGARCH64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+typedef __m128 Packet4f;
+typedef __m128d Packet2d;
+
+typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
+typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
+typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
+typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
+typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
+typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
+typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
+typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
+
+template <>
+struct is_arithmetic<__m128> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128d> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet16c> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet8s> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet4i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet2l> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet16uc> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet8us> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet4ui> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet2ul> {
+  enum { value = false };
+};
+
+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
+  const float lanes[4] = {a, b, c, d};  // staged in memory, then loaded as one vector
+  return (Packet4f)__lsx_vld(lanes, 0);
+}
+
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
+  // Each 2-bit field of mask, lowest field first, names the lane of m copied to the next output lane.
+  const float* src = reinterpret_cast<const float*>(&m);
+  const int i0 = mask & 3, i1 = (mask >> 2) & 3, i2 = (mask >> 4) & 3, i3 = (mask >> 6) & 3;
+  return make_packet4f(src[i0], src[i1], src[i2], src[i3]);
+}
+
+// Output lanes 0-1 are picked from m and lanes 2-3 from n; mask holds one 2-bit source index per output lane.
+template <bool interleave>
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
+  const float* lhs = reinterpret_cast<const float*>(&m);
+  const float* rhs = reinterpret_cast<const float*>(&n);
+  const int i0 = mask & 3, i1 = (mask >> 2) & 3, i2 = (mask >> 4) & 3, i3 = (mask >> 6) & 3;
+  return make_packet4f(lhs[i0], lhs[i1], rhs[i2], rhs[i3]);
+}
+
+// Interleaving variant: output lanes alternate between m and n (m, n, m, n).
+template <>
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
+  const float* lhs = reinterpret_cast<const float*>(&m);
+  const float* rhs = reinterpret_cast<const float*>(&n);
+  const int i0 = mask & 3, i1 = (mask >> 2) & 3, i2 = (mask >> 4) & 3, i3 = (mask >> 6) & 3;
+  return make_packet4f(lhs[i0], rhs[i1], lhs[i2], rhs[i3]);
+}
+
+EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) {
+  return p | (q << 2) | (r << 4) | (s << 6);  // p lands in the two lowest bits, s in bits 6-7
+}
+
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
+  return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
+  return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3));
+}
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  const double lanes[2] = {a, b};  // staged in memory, then loaded as one vector
+  return (Packet2d)__lsx_vld(lanes, 0);
+}
+
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
+  // Bit 0 of mask selects the lane taken from m; bit 1 selects the lane taken from n.
+  const double* lhs = reinterpret_cast<const double*>(&m);
+  const double* rhs = reinterpret_cast<const double*>(&n);
+  return make_packet2d(lhs[mask & 1], rhs[(mask >> 1) & 1]);
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
+
+template <>
+struct packet_traits<int8_t> : default_packet_traits {
+  typedef Packet16c type;
+  typedef Packet16c half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<int16_t> : default_packet_traits {
+  typedef Packet8s type;
+  typedef Packet8s half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
+  typedef Packet2l type;
+  typedef Packet2l half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint8_t> : default_packet_traits {
+  typedef Packet16uc type;
+  typedef Packet16uc half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint16_t> : default_packet_traits {
+  typedef Packet8us type;
+  typedef Packet8us half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
+  typedef Packet4ui type;
+  typedef Packet4ui half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint64_t> : default_packet_traits {
+  typedef Packet2ul type;
+  typedef Packet2ul half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasSign = 0,
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasRsqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasSign = 0,
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasRsqrt = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16c> {
+  typedef int8_t type;
+  typedef Packet16c half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8s> {
+  typedef int16_t type;
+  typedef Packet8s half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int32_t type;
+  typedef Packet4i half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2l> {
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet16uc> {
+  typedef uint8_t type;
+  typedef Packet16uc half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8us> {
+  typedef uint16_t type;
+  typedef Packet8us half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4ui> {
+  typedef uint32_t type;
+  typedef Packet4ui half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2ul> {
+  typedef uint64_t type;
+  typedef Packet2ul half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet4i integer_packet;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  typedef Packet2d half;
+  typedef Packet2l integer_packet;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
+  return __lsx_vreplgr2vr_b(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
+  return __lsx_vreplgr2vr_h(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  return __lsx_vreplgr2vr_w(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+  return __lsx_vreplgr2vr_d(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
+  return __lsx_vreplgr2vr_b(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
+  return __lsx_vreplgr2vr_h(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+  return __lsx_vreplgr2vr_w(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
+  return __lsx_vreplgr2vr_d(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  Packet4f v = {from, from, from, from};
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  Packet2d v = {from, from};
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+  return reinterpret_cast<__m128>((__m128i)pset1<Packet4ui>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return reinterpret_cast<__m128d>((__m128i)pset1<Packet2ul>(from));
+}
+
+template <>  // plset: pset1(a) plus the per-lane offsets {0, 1, 2, ...}
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
+  const int8_t lane_offsets[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
+  const int16_t lane_offsets[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  const int32_t lane_offsets[] = {0, 1, 2, 3};
+  return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
+  const int64_t lane_offsets[] = {0, 1};
+  return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>  // unsigned overloads: same offset tables and the same vadd intrinsics as the signed ones
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
+  const uint8_t lane_offsets[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
+  const uint16_t lane_offsets[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+  const uint32_t lane_offsets[] = {0, 1, 2, 3};
+  return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
+  const uint64_t lane_offsets[] = {0, 1};
+  return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(lane_offsets, 0));
+}
+template <>  // floating-point overloads keep the offsets in a function-local static packet
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  static const Packet4f lane_offsets = {0.0f, 1.0f, 2.0f, 3.0f};
+  return __lsx_vfadd_s(pset1<Packet4f>(a), lane_offsets);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  static const Packet2d lane_offsets = {0.0, 1.0};
+  return __lsx_vfadd_d(pset1<Packet2d>(a), lane_offsets);
+}
+
+template <>  // padd: a + b through the LSX add intrinsic that matches the lane width of the packet type
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vadd_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vadd_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vadd_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vadd_d(a, b);
+}
+template <>  // unsigned overloads call the same vadd_{b,h,w,d} intrinsics as the signed ones
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vadd_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vadd_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vadd_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vadd_d(a, b);
+}
+template <>  // floating-point overloads use the vfadd_{s,d} intrinsics
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfadd_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfadd_d(a, b);
+}
+
+template <>  // psub: a - b through the LSX subtract intrinsic that matches the lane width of the packet type
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vsub_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vsub_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vsub_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vsub_d(a, b);
+}
+template <>  // unsigned overloads call the same vsub_{b,h,w,d} intrinsics as the signed ones
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vsub_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vsub_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vsub_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vsub_d(a, b);
+}
+template <>  // floating-point overloads use the vfsub_{s,d} intrinsics
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfsub_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfsub_d(a, b);
+}
+
+template <>  // forward declaration: paddsub below calls pxor<Packet4f> before its definition appears
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>  // paddsub: a + (b with the sign bit toggled in the lanes selected by the mask)
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  const float sign_bit = numext::bit_cast<float>(0x80000000u);  // only the top bit of the 32-bit pattern is set
+  const Packet4f flip_mask = make_packet4f(sign_bit, 0.0f, sign_bit, 0.0f);
+  return padd(a, pxor(flip_mask, b));
+}
+template <>  // forward declaration: paddsub below calls pxor<Packet2d> before its definition appears
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  const Packet2d flip_mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
+  return padd(a, pxor(flip_mask, b));
+}
+
+template <>  // pnegate (floating point): XOR every lane with a mask that has only the top bit set
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  const float sign_bit = numext::bit_cast<float>(0x80000000);  // bit pattern with only bit 31 set
+  Packet4f sign_mask = make_packet4f(sign_bit, sign_bit, sign_bit, sign_bit);
+  return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(sign_mask), numext::bit_cast<__m128i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  const double sign_bit = numext::bit_cast<double>(0x8000000000000000);  // bit pattern with only bit 63 set
+  Packet2d sign_mask = make_packet2d(sign_bit, sign_bit);
+  return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(sign_mask), numext::bit_cast<__m128i>(a));
+}
+template <>  // pnegate (signed integers): delegate to the vneg intrinsic of matching lane width
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
+  return __lsx_vneg_b(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+  return __lsx_vneg_h(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return __lsx_vneg_w(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+  return __lsx_vneg_d(a);
+}
+
+template <>  // pconj: every overload in this group returns its argument unchanged
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
+  return a;
+}
+
+template <>  // pmul: a * b; floating-point overloads use vfmul_{s,d}, integer overloads vmul_{b,h,w,d}
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfmul_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfmul_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vmul_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vmul_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vmul_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vmul_d(a, b);
+}
+template <>  // unsigned overloads call the same vmul_{b,h,w,d} intrinsics as the signed ones
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vmul_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vmul_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vmul_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vmul_d(a, b);
+}
+
+template <>  // pdiv: a / b; this group defines no Packet16c / Packet16uc overloads
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfdiv_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfdiv_d(a, b);
+}
+template <>  // signed integer overloads use vdiv_{h,w,d}
+EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vdiv_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vdiv_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vdiv_d(a, b);
+}
+template <>  // unsigned integer overloads use the separate vdiv_{hu,wu,du} intrinsics
+EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vdiv_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vdiv_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vdiv_du(a, b);
+}
+
+template <>  // pmadd (floating point): operands are forwarded to vfmadd in the order (a, b, c)
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfmadd_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfmadd_d(a, b, c);
+}
+template <>  // pmsub (floating point): forwards (a, b, c) to vfmsub
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfmsub_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfmsub_d(a, b, c);
+}
+template <>  // NOTE(review): pnmadd maps to vfnmsub; presumably LSX negates the whole fused result - confirm
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfnmsub_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfnmsub_d(a, b, c);
+}
+template <>  // NOTE(review): pnmsub maps to vfnmadd for the same presumed reason - confirm against the LSX manual
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfnmadd_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfnmadd_d(a, b, c);
+}
+template <>  // pmadd (integers): vmadd receives the addend c first, followed by the factors a and b
+EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+  return __lsx_vmadd_b(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return __lsx_vmadd_h(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return __lsx_vmadd_w(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
+  return __lsx_vmadd_d(c, a, b);
+}
+template <>  // unsigned overloads call the same vmadd_{b,h,w,d} intrinsics as the signed ones
+EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
+  return __lsx_vmadd_b(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+  return __lsx_vmadd_h(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
+  return __lsx_vmadd_w(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) {
+  return __lsx_vmadd_d(c, a, b);
+}
+
+template <>  // pand: bitwise AND via __lsx_vand_v; float/double packets are cast to __m128i and back
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
+}
+template <>  // every integer overload calls the same __lsx_vand_v, whatever the lane width
+EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vand_v(a, b);
+}
+
+template <>  // por: bitwise OR via __lsx_vor_v; float/double packets are cast to __m128i and back
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
+}
+template <>  // every integer overload calls the same __lsx_vor_v, whatever the lane width
+EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vor_v(a, b);
+}
+
+template <>  // pxor: bitwise XOR via __lsx_vxor_v; float/double packets are cast to __m128i and back
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
+}
+template <>  // every integer overload calls the same __lsx_vxor_v, whatever the lane width
+EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vxor_v(a, b);
+}
+
+template <>  // pandnot: operands reach __lsx_vandn_v swapped (b first, a second)
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);  // presumably vandn inverts its first operand - confirm
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
+}
+template <>  // integer overloads: same swapped call without the casts
+EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vandn_v(b, a);
+}
+
+template <>  // pcmp_le: a <= b comparison; floating-point overloads use vfcmp_cle and cast the result back
+EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_cle_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_cle_d(a, b);
+}
+template <>  // signed integer overloads use vsle_{b,h,w,d}
+EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vsle_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vsle_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vsle_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vsle_d(a, b);
+}
+template <>  // unsigned integer overloads use the separate vsle_{bu,hu,wu,du} intrinsics
+EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vsle_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vsle_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vsle_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vsle_du(a, b);
+}
+
+template <>  // pcmp_lt: a < b comparison; floating-point overloads use vfcmp_clt and cast the result back
+EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_clt_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_clt_d(a, b);
+}
+template <>  // signed integer overloads use vslt_{b,h,w,d}
+EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vslt_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vslt_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vslt_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vslt_d(a, b);
+}
+template <>  // unsigned integer overloads use the separate vslt_{bu,hu,wu,du} intrinsics
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vslt_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vslt_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vslt_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vslt_du(a, b);
+}
+
+template <>  // pcmp_lt_or_nan: defined for the two floating-point packets only, using the "sult" predicate
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_sult_s(a, b);  // NOTE(review): presumably signalling unordered-or-less-than - confirm
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_sult_d(a, b);
+}
+
+template <>  // pcmp_eq: a == b comparison; floating-point overloads use the quiet "ceq" predicate
+EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_ceq_s(a, b);  // quiet like cle/clt: same mask, no invalid-operation flag on quiet NaN
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_ceq_d(a, b);  // quiet predicate, matching pcmp_le / pcmp_lt
+}
+template <>  // integer overloads use vseq_{b,h,w,d}; signed and unsigned share the same intrinsic
+EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vseq_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vseq_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vseq_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vseq_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vseq_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vseq_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vseq_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vseq_d(a, b);
+}
+
+template <>  // pmin (integers): signed overloads use vmin_{b,h,w,d}
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vmin_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vmin_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vmin_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vmin_d(a, b);
+}
+template <>  // unsigned overloads use the separate vmin_{bu,hu,wu,du} intrinsics
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vmin_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vmin_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vmin_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vmin_du(a, b);
+}
+
+template <>  // pmax (integers): signed overloads use vmax_{b,h,w,d}
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vmax_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vmax_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vmax_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vmax_d(a, b);
+}
+template <>  // unsigned overloads use the separate vmax_{bu,hu,wu,du} intrinsics
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vmax_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vmax_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vmax_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vmax_du(a, b);
+}
+
+template <>  // floating-point pmin: the selector is (a < b) OR (a unordered with itself); it picks between a and b
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4i a_is_nan = __lsx_vfcmp_cun_s(a, a);  // "unordered" compare of a against itself
+  Packet4i pick_a = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), a_is_nan);
+  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, pick_a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  Packet2l a_is_nan = __lsx_vfcmp_cun_d(a, a);
+  Packet2l pick_a = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), a_is_nan);
+  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, pick_a);
+}
+template <>  // floating-point pmax: same scheme with the less-than operands swapped, i.e. (b < a)
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4i a_is_nan = __lsx_vfcmp_cun_s(a, a);
+  Packet4i pick_a = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), a_is_nan);
+  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, pick_a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  Packet2l a_is_nan = __lsx_vfcmp_cun_d(a, a);
+  Packet2l pick_a = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), a_is_nan);
+  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, pick_a);
+}
+
+template <int N>  // parithmetic_shift_right<N>: signed overloads use the vsrai_{b,h,w,d} immediate shifts
+EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) {
+  return __lsx_vsrai_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) {
+  return __lsx_vsrai_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return __lsx_vsrai_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
+  return __lsx_vsrai_d((__m128i)a, N);
+}
+template <int N>  // unsigned overloads use vsrli, the same intrinsic plogical_shift_right calls
+EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) {
+  return __lsx_vsrli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) {
+  return __lsx_vsrli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
+  return __lsx_vsrli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) {
+  return __lsx_vsrli_d((__m128i)a, N);
+}
+
+template <int N>  // plogical_shift_right<N>: every overload, signed or unsigned, uses vsrli_{b,h,w,d}
+EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) {
+  return __lsx_vsrli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) {
+  return __lsx_vsrli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return __lsx_vsrli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return __lsx_vsrli_d((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) {
+  return __lsx_vsrli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
+  return __lsx_vsrli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
+  return __lsx_vsrli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) {
+  return __lsx_vsrli_d((__m128i)a, N);
+}
+
+template <int N>  // plogical_shift_left<N>: every overload, signed or unsigned, uses vslli_{b,h,w,d}
+EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) {
+  return __lsx_vslli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) {
+  return __lsx_vslli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return __lsx_vslli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return __lsx_vslli_d((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) {
+  return __lsx_vslli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
+  return __lsx_vslli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
+  return __lsx_vslli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) {
+  return __lsx_vslli_d((__m128i)a, N);
+}
+
+template <>  // pabs (floating point): vbitclri with bit index 31 / 63 on the integer view of the packet
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);  // presumably clears the sign bit of each 32-bit lane - confirm
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
+}
+template <>  // pabs (signed integers): vabsd of `a` against pzero(a)
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+  return __lsx_vabsd_b(a, pzero(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+  return __lsx_vabsd_h(a, pzero(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  return __lsx_vabsd_w(a, pzero(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+  return __lsx_vabsd_d(a, pzero(a));
+}
+template <>  // pabs (unsigned integers): returns the argument unchanged
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
+  return a;
+}
+
+template <>  // pload: one __lsx_vld(from, 0) per overload, preceded by the EIGEN_DEBUG_ALIGNED_LOAD macro
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);  // float/double overloads cast the loaded vector
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
+}
+template <>  // integer overloads return the __lsx_vld result without a cast
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+
+template <>  // ploadu: same __lsx_vld(from, 0) call as pload; only the debug macro differs (UNALIGNED variant)
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);  // float/double overloads cast the loaded vector
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
+}
+template <>  // integer overloads return the __lsx_vld result without a cast
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+// ploaddup: reads the first PacketSize/2 scalars at `from` and returns each of them twice, in order.
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  float f0 = from[0], f1 = from[1];  // only two floats are read
+  return make_packet4f(f0, f0, f1, f1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  return pset1<Packet2d>(from[0]);  // two lanes: only from[0] is read
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
+  Packet16c tmp = __lsx_vldrepl_d(from, 0);  // load just the 8 bytes used below (no full-width aligned load)
+  return __lsx_vilvl_b(tmp, tmp);            // low half interleaved with itself: b0 b0 b1 b1 ... b7 b7
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
+  Packet8s tmp = __lsx_vldrepl_d(from, 0);  // 8 bytes = the four int16_t values that get duplicated
+  return __lsx_vilvl_h(tmp, tmp);           // h0 h0 h1 h1 h2 h2 h3 h3
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  Packet4i tmp = __lsx_vldrepl_d(from, 0);  // 8 bytes = the two int32_t values that get duplicated
+  return __lsx_vilvl_w(tmp, tmp);           // w0 w0 w1 w1
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
+  return pset1<Packet2l>(from[0]);  // two lanes: only from[0] is read
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
+  Packet16uc tmp = __lsx_vldrepl_d(from, 0);  // load just the 8 bytes used below (no full-width aligned load)
+  return __lsx_vilvl_b(tmp, tmp);             // b0 b0 b1 b1 ... b7 b7
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
+  Packet8us tmp = __lsx_vldrepl_d(from, 0);  // 8 bytes = the four uint16_t values that get duplicated
+  return __lsx_vilvl_h(tmp, tmp);            // h0 h0 h1 h1 h2 h2 h3 h3
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
+  Packet4ui tmp = __lsx_vldrepl_d(from, 0);  // 8 bytes = the two uint32_t values that get duplicated
+  return __lsx_vilvl_w(tmp, tmp);            // w0 w0 w1 w1
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
+  return pset1<Packet2ul>(from[0]);  // two lanes: only from[0] is read
+}
+// pstore: writes the whole packet to `to` with one __lsx_vst at byte offset 0 (EIGEN_DEBUG_ALIGNED_STORE tag).
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);  // float packet passed directly
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);  // double packet passed directly
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // integer packets are cast to __m128i first
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // int16_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // int32_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // int64_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint8_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint16_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint32_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint64_t lanes
+}
+// pstoreu: issues the same __lsx_vst(from, to, 0) as pstore, but is tagged with EIGEN_DEBUG_UNALIGNED_STORE.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);  // float packet passed directly
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);  // double packet passed directly
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // integer packets are cast to __m128i first
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // int16_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // int32_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // int64_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint8_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint16_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint32_t lanes
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);  // uint64_t lanes
+}
+// pgather: lane k of the result is read from from[k * stride] (stride counted in elements, not bytes).
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};  // vector brace-init, lane order 0..3
+  return v;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  Packet2d v = {from[0], from[stride]};  // vector brace-init, lane order 0..1
+  return v;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
+  int8_t v[16] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  v[8] = from[8 * stride];
+  v[9] = from[9 * stride];
+  v[10] = from[10 * stride];
+  v[11] = from[11 * stride];
+  v[12] = from[12 * stride];
+  v[13] = from[13 * stride];
+  v[14] = from[14 * stride];
+  v[15] = from[15 * stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
+  int16_t v[8] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+  int32_t v[4] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
+  int64_t v[2] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
+  uint8_t v[16] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  v[8] = from[8 * stride];
+  v[9] = from[9 * stride];
+  v[10] = from[10 * stride];
+  v[11] = from[11 * stride];
+  v[12] = from[12 * stride];
+  v[13] = from[13 * stride];
+  v[14] = from[14 * stride];
+  v[15] = from[15 * stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
+  uint16_t v[8] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
+  uint32_t v[4] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
+  uint64_t v[2] __attribute__((aligned(16)));  // 16-byte-aligned staging buffer on the stack
+  v[0] = from[0];
+  v[1] = from[stride];
+  return __lsx_vld(v, 0);  // load the staged 16 bytes as one packet
+}
+// pscatter: lane k of `from` is written to to[k * stride] using one __lsx_vstelm_* per lane.
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  __lsx_vstelm_w(from, to, 0, 0);  // arguments: vector, base address, byte offset (always 0), lane index
+  __lsx_vstelm_w(from, to + stride * 1, 0, 1);
+  __lsx_vstelm_w(from, to + stride * 2, 0, 2);
+  __lsx_vstelm_w(from, to + stride * 3, 0, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  __lsx_vstelm_d(from, to, 0, 0);  // 64-bit lane 0
+  __lsx_vstelm_d(from, to + stride, 0, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
+                                                                       Index stride) {
+  __lsx_vstelm_b((__m128i)from, to, 0, 0);  // integer packets are cast to __m128i for the intrinsic
+  __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
+  __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
+  __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
+  __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
+  __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
+  __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
+  __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
+  __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
+  __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
+                                                                       Index stride) {
+  __lsx_vstelm_h((__m128i)from, to, 0, 0);  // 16-bit lanes 0..7
+  __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+                                                                       Index stride) {
+  __lsx_vstelm_w((__m128i)from, to, 0, 0);  // 32-bit lanes 0..3
+  __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
+                                                                       Index stride) {
+  __lsx_vstelm_d((__m128i)from, to, 0, 0);  // 64-bit lanes 0..1
+  __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
+                                                                         Index stride) {
+  __lsx_vstelm_b((__m128i)from, to, 0, 0);  // 8-bit lanes 0..15
+  __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
+  __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
+  __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
+  __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
+  __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
+  __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
+  __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
+  __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
+  __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
+                                                                         Index stride) {
+  __lsx_vstelm_h((__m128i)from, to, 0, 0);  // 16-bit lanes 0..7
+  __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
+                                                                         Index stride) {
+  __lsx_vstelm_w((__m128i)from, to, 0, 0);  // 32-bit lanes 0..3
+  __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
+                                                                         Index stride) {
+  __lsx_vstelm_d((__m128i)from, to, 0, 0);  // 64-bit lanes 0..1
+  __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
+}
+// prefetch: every overload forwards `addr` to __builtin_prefetch with the builtin's default arguments.
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  __builtin_prefetch(addr);  // hint only; nothing is returned
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  __builtin_prefetch(addr);  // same hint, double pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
+  __builtin_prefetch(addr);  // same hint, int8_t pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
+  __builtin_prefetch(addr);  // same hint, int16_t pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  __builtin_prefetch(addr);  // same hint, int32_t pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
+  __builtin_prefetch(addr);  // same hint, int64_t pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
+  __builtin_prefetch(addr);  // same hint, uint8_t pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
+  __builtin_prefetch(addr);  // same hint, uint16_t pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  __builtin_prefetch(addr);  // same hint, uint32_t pointer
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
+  __builtin_prefetch(addr);  // same hint, uint64_t pointer
+}
+// pfirst: returns lane 0 of the packet as a scalar.
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  float v;
+  __lsx_vstelm_w(a, &v, 0, 0);  // store 32-bit lane 0 into the local, then return it
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  double v;
+  __lsx_vstelm_d(a, &v, 0, 0);  // store 64-bit lane 0 into the local, then return it
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
+  return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);  // integer overloads extract lane 0 with vpickve2gr
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
+  return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);  // 16-bit lane, signed form
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  return __lsx_vpickve2gr_w((__m128i)a, 0);  // 32-bit lane, signed form
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+  return __lsx_vpickve2gr_d((__m128i)a, 0);  // 64-bit lane, signed form
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
+  return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);  // 8-bit lane, unsigned form
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
+  return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);  // 16-bit lane, unsigned form
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  return __lsx_vpickve2gr_wu((__m128i)a, 0);  // 32-bit lane, unsigned form
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
+  return __lsx_vpickve2gr_du((__m128i)a, 0);  // 64-bit lane, unsigned form
+}
+// preverse: returns the packet with its lanes in reverse order (lane 0 <-> last lane, and so on).
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);  // 0x1B = 00 01 10 11b: output lanes 0..3 take input lanes 3,2,1,0
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);  // selector 0x1 with both operands = a swaps the two 64-bit lanes
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
+  return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);  // reverse the 4 words, then bytes in each word
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
+  return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);  // swap halves, then reverse each half
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return __lsx_vshuf4i_w((__m128i)a, 0x1B);  // same word reversal as the float overload
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
+  return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);  // swap the two 64-bit lanes
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
+  return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);  // reverse the 4 words, then bytes in each word
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
+  return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);  // swap halves, then reverse each half
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
+  return __lsx_vshuf4i_w((__m128i)a, 0x1B);  // same word reversal as the float overload
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
+  return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);  // swap the two 64-bit lanes
+}
+// predux: horizontal sum of all lanes; integer overloads widen pairwise with vhaddw and truncate at the end.
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  Packet4f pairs = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));  // a plus its (2,3,2,3) swizzle
+  return pfirst<Packet4f>(__lsx_vfadd_s(pairs, vec4f_swizzle1(pairs, 1, 1, 1, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));  // lane 0 of (a + reversed a) = a0 + a1
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
+  Packet8s sum16 = __lsx_vhaddw_h_b(a, a);          // adjacent byte pairs -> 16-bit sums
+  Packet4i sum32 = __lsx_vhaddw_w_h(sum16, sum16);  // adjacent 16-bit pairs -> 32-bit sums
+  Packet2l sum64 = __lsx_vhaddw_d_w(sum32, sum32);  // adjacent 32-bit pairs -> 64-bit sums
+  return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(sum64, sum64), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
+  Packet4i sum32 = __lsx_vhaddw_w_h(a, a);          // adjacent 16-bit pairs -> 32-bit sums
+  Packet2l sum64 = __lsx_vhaddw_d_w(sum32, sum32);  // adjacent 32-bit pairs -> 64-bit sums
+  return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(sum64, sum64), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  Packet2l sum64 = __lsx_vhaddw_d_w(a, a);  // adjacent 32-bit pairs -> 64-bit sums
+  return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(sum64, sum64), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+  return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);  // low 64 bits of the 128-bit a0 + a1
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
+  Packet8us sum16 = __lsx_vhaddw_hu_bu(a, a);          // unsigned widening, bytes -> 16-bit
+  Packet4ui sum32 = __lsx_vhaddw_wu_hu(sum16, sum16);  // 16-bit -> 32-bit
+  Packet2ul sum64 = __lsx_vhaddw_du_wu(sum32, sum32);  // 32-bit -> 64-bit
+  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(sum64, sum64), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
+  Packet4ui sum32 = __lsx_vhaddw_wu_hu(a, a);          // 16-bit -> 32-bit
+  Packet2ul sum64 = __lsx_vhaddw_du_wu(sum32, sum32);  // 32-bit -> 64-bit
+  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(sum64, sum64), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+  Packet2ul sum64 = __lsx_vhaddw_du_wu(a, a);  // 32-bit -> 64-bit
+  return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(sum64, sum64), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+  return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);  // low 64 bits of the 128-bit a0 + a1
+}
+// predux_mul: product of all lanes. vmulwev multiplies even lanes of x by even lanes of preverse(x), halving the count.
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  Packet4f pairs = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));  // a times its (2,3,2,3) swizzle
+  return pfirst<Packet4f>(__lsx_vfmul_s(pairs, vec4f_swizzle1(pairs, 1, 1, 1, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));  // lane 0 of (a * reversed a) = a0 * a1
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
+  Packet8s prod16 = __lsx_vmulwev_h_b(a, preverse(a));            // 16 bytes -> 8 widened products
+  Packet4i prod32 = __lsx_vmulwev_w_h(prod16, preverse(prod16));  // 8 -> 4
+  Packet2l prod64 = __lsx_vmulwev_d_w(prod32, preverse(prod32));  // 4 -> 2
+  return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(prod64, preverse(prod64)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
+  Packet4i prod32 = __lsx_vmulwev_w_h(a, preverse(a));            // 8 -> 4 widened products
+  Packet2l prod64 = __lsx_vmulwev_d_w(prod32, preverse(prod32));  // 4 -> 2
+  return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(prod64, preverse(prod64)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  Packet2l prod64 = __lsx_vmulwev_d_w(a, preverse(a));  // 4 -> 2 widened products
+  return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(prod64, preverse(prod64)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
+  return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);  // low 64 bits of a0 * a1
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
+  Packet8us prod16 = __lsx_vmulwev_h_bu(a, preverse(a));           // unsigned widening for the first step
+  Packet4ui prod32 = __lsx_vmulwev_w_h(prod16, preverse(prod16));  // signed form; only the low bits are returned
+  Packet2ul prod64 = __lsx_vmulwev_d_w(prod32, preverse(prod32));  // signed form; only the low bits are returned
+  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(prod64, preverse(prod64)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
+  Packet4ui prod32 = __lsx_vmulwev_w_hu(a, preverse(a));           // unsigned widening for the first step
+  Packet2ul prod64 = __lsx_vmulwev_d_w(prod32, preverse(prod32));  // signed form; only the low bits are returned
+  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(prod64, preverse(prod64)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
+  Packet2ul prod64 = __lsx_vmulwev_d_wu(a, preverse(a));  // unsigned widening for the first step
+  return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(prod64, preverse(prod64)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
+  return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);  // low 64 bits of a0 * a1
+}
+// predux_min: minimum over all lanes via butterfly folds (0x4E swaps the two pairs of a group, 0xB1 swaps neighbours).
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  Packet4f fold64 = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));  // lane i vs lane i^2
+  return pfirst(__lsx_vfmin_s(fold64, (Packet4f)__lsx_vshuf4i_w(fold64, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return pfirst(__lsx_vfmin_d(a, preverse(a)));  // lane 0 of min(a, reversed a)
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
+  Packet16c fold64 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // byte i vs byte i^8
+  Packet16c fold32 = __lsx_vmin_b(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // byte i vs byte i^4
+  Packet16c fold16 = __lsx_vmin_b(fold32, __lsx_vshuf4i_b((__m128i)fold32, 0x4E));  // byte i vs byte i^2
+  return pfirst((Packet16c)__lsx_vmin_b(fold16, __lsx_vshuf4i_b((__m128i)fold16, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
+  Packet8s fold64 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // lane i vs lane i^4
+  Packet8s fold32 = __lsx_vmin_h(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet8s)__lsx_vmin_h(fold32, __lsx_vshuf4i_h((__m128i)fold32, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  Packet4i fold64 = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet4i)__lsx_vmin_w(fold64, __lsx_vshuf4i_w((__m128i)fold64, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
+  return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));  // lane 0 of min(a, reversed a)
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
+  Packet16uc fold64 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // byte i vs byte i^8
+  Packet16uc fold32 = __lsx_vmin_bu(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // byte i vs byte i^4
+  Packet16uc fold16 = __lsx_vmin_bu(fold32, __lsx_vshuf4i_b((__m128i)fold32, 0x4E));  // byte i vs byte i^2
+  return pfirst((Packet16uc)__lsx_vmin_bu(fold16, __lsx_vshuf4i_b((__m128i)fold16, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
+  Packet8us fold64 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // lane i vs lane i^4
+  Packet8us fold32 = __lsx_vmin_hu(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet8us)__lsx_vmin_hu(fold32, __lsx_vshuf4i_h((__m128i)fold32, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
+  Packet4ui fold64 = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet4ui)__lsx_vmin_wu(fold64, __lsx_vshuf4i_w((__m128i)fold64, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
+  return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));  // lane 0 of min(a, reversed a)
+}
+// predux_max: maximum over all lanes via butterfly folds (0x4E swaps the two pairs of a group, 0xB1 swaps neighbours).
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  Packet4f fold64 = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));  // lane i vs lane i^2
+  return pfirst(__lsx_vfmax_s(fold64, (Packet4f)__lsx_vshuf4i_w(fold64, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return pfirst(__lsx_vfmax_d(a, preverse(a)));  // lane 0 of max(a, reversed a)
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
+  Packet16c fold64 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // byte i vs byte i^8
+  Packet16c fold32 = __lsx_vmax_b(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // byte i vs byte i^4
+  Packet16c fold16 = __lsx_vmax_b(fold32, __lsx_vshuf4i_b((__m128i)fold32, 0x4E));  // byte i vs byte i^2
+  return pfirst((Packet16c)__lsx_vmax_b(fold16, __lsx_vshuf4i_b((__m128i)fold16, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
+  Packet8s fold64 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // lane i vs lane i^4
+  Packet8s fold32 = __lsx_vmax_h(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet8s)__lsx_vmax_h(fold32, __lsx_vshuf4i_h((__m128i)fold32, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  Packet4i fold64 = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet4i)__lsx_vmax_w(fold64, __lsx_vshuf4i_w((__m128i)fold64, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
+  return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));  // lane 0 of max(a, reversed a)
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
+  Packet16uc fold64 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // byte i vs byte i^8
+  Packet16uc fold32 = __lsx_vmax_bu(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // byte i vs byte i^4
+  Packet16uc fold16 = __lsx_vmax_bu(fold32, __lsx_vshuf4i_b((__m128i)fold32, 0x4E));  // byte i vs byte i^2
+  return pfirst((Packet16uc)__lsx_vmax_bu(fold16, __lsx_vshuf4i_b((__m128i)fold16, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
+  Packet8us fold64 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));            // lane i vs lane i^4
+  Packet8us fold32 = __lsx_vmax_hu(fold64, __lsx_vshuf4i_h((__m128i)fold64, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet8us)__lsx_vmax_hu(fold32, __lsx_vshuf4i_h((__m128i)fold32, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
+  Packet4ui fold64 = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));  // lane i vs lane i^2
+  return pfirst((Packet4ui)__lsx_vmax_wu(fold64, __lsx_vshuf4i_w((__m128i)fold64, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
+  return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));  // lane 0 of max(a, reversed a)
+}
+// psqrt: lane-wise square root, forwarded directly to the LSX vfsqrt intrinsics.
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  return __lsx_vfsqrt_s(a);  // single-precision lanes
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+  return __lsx_vfsqrt_d(a);  // double-precision lanes
+}
+// In-place 4x4 float transpose: interleave 32-bit lanes of row pairs, then merge matching 64-bit halves.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  Packet4f lo01 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
+  Packet4f hi01 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
+  Packet4f lo23 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
+  Packet4f hi23 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
+  // lo01 = {r0[0], r1[0], r0[1], r1[1]}, hi01 = {r0[2], r1[2], r0[3], r1[3]}; lo23 / hi23 likewise for rows 2 and 3.
+  kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)lo23, (__m128i)lo01);
+  kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)lo23, (__m128i)lo01);
+  kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)hi23, (__m128i)hi01);
+  kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)hi23, (__m128i)hi01);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {  // in-place 2x2 transpose
+  Packet2d highs = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);  // {r0[1], r1[1]}
+  kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);  // {r0[0], r1[0]}
+  kernel.packet[1] = highs;  // assigned last: packet[1] is still an input of the line above
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {  // in-place 16x16 bytes
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);  // stage 1: interleave bytes of adjacent rows
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);  // (vilvl = low halves, vilvh = high halves)
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
+  __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
+  __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
+  __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
+  __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
+  __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
+  // Stage 2: interleave 16-bit units, so each s* holds four columns for a group of four rows.
+  __m128i s0 = __lsx_vilvl_h(t2, t0);
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+  __m128i s8 = __lsx_vilvl_h(ta, t8);
+  __m128i s9 = __lsx_vilvh_h(ta, t8);
+  __m128i sa = __lsx_vilvl_h(tb, t9);
+  __m128i sb = __lsx_vilvh_h(tb, t9);
+  __m128i sc = __lsx_vilvl_h(te, tc);
+  __m128i sd = __lsx_vilvh_h(te, tc);
+  __m128i se = __lsx_vilvl_h(tf, td);
+  __m128i sf = __lsx_vilvh_h(tf, td);
+  // Stage 3: interleave 32-bit units, so each u* holds two columns for a group of eight rows.
+  __m128i u0 = __lsx_vilvl_w(s4, s0);
+  __m128i u1 = __lsx_vilvh_w(s4, s0);
+  __m128i u2 = __lsx_vilvl_w(s5, s1);
+  __m128i u3 = __lsx_vilvh_w(s5, s1);
+  __m128i u4 = __lsx_vilvl_w(s6, s2);
+  __m128i u5 = __lsx_vilvh_w(s6, s2);
+  __m128i u6 = __lsx_vilvl_w(s7, s3);
+  __m128i u7 = __lsx_vilvh_w(s7, s3);
+  __m128i u8 = __lsx_vilvl_w(sc, s8);
+  __m128i u9 = __lsx_vilvh_w(sc, s8);
+  __m128i ua = __lsx_vilvl_w(sd, s9);
+  __m128i ub = __lsx_vilvh_w(sd, s9);
+  __m128i uc = __lsx_vilvl_w(se, sa);
+  __m128i ud = __lsx_vilvh_w(se, sa);
+  __m128i ue = __lsx_vilvl_w(sf, sb);
+  __m128i uf = __lsx_vilvh_w(sf, sb);
+  // Stage 4: join the rows 0-7 and rows 8-15 halves of each column; packet[k] becomes input column k.
+  kernel.packet[0] = __lsx_vilvl_d(u8, u0);
+  kernel.packet[1] = __lsx_vilvh_d(u8, u0);
+  kernel.packet[2] = __lsx_vilvl_d(u9, u1);
+  kernel.packet[3] = __lsx_vilvh_d(u9, u1);
+  kernel.packet[4] = __lsx_vilvl_d(ua, u2);
+  kernel.packet[5] = __lsx_vilvh_d(ua, u2);
+  kernel.packet[6] = __lsx_vilvl_d(ub, u3);
+  kernel.packet[7] = __lsx_vilvh_d(ub, u3);
+  kernel.packet[8] = __lsx_vilvl_d(uc, u4);
+  kernel.packet[9] = __lsx_vilvh_d(uc, u4);
+  kernel.packet[10] = __lsx_vilvl_d(ud, u5);
+  kernel.packet[11] = __lsx_vilvh_d(ud, u5);
+  kernel.packet[12] = __lsx_vilvl_d(ue, u6);
+  kernel.packet[13] = __lsx_vilvh_d(ue, u6);
+  kernel.packet[14] = __lsx_vilvl_d(uf, u7);
+  kernel.packet[15] = __lsx_vilvh_d(uf, u7);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {  // in place, 8 packets
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);  // stage 1: byte interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_h(t2, t0);  // stage 2: halfword interleave of the stage-1 results
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_w(s4, s0);  // stage 3: word interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_w(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_w(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_w(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_w(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_w(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_w(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_w(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {  // in place, 4 packets
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);  // stage 1: byte interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_h(t2, t0);  // stage 2: halfword interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_h(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_h(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_h(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {  // in place, 8 packets
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);  // stage 1: halfword interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_w(t2, t0);  // stage 2: word interleave of the stage-1 results
+  __m128i s1 = __lsx_vilvh_w(t2, t0);
+  __m128i s2 = __lsx_vilvl_w(t3, t1);
+  __m128i s3 = __lsx_vilvh_w(t3, t1);
+  __m128i s4 = __lsx_vilvl_w(t6, t4);
+  __m128i s5 = __lsx_vilvh_w(t6, t4);
+  __m128i s6 = __lsx_vilvl_w(t7, t5);
+  __m128i s7 = __lsx_vilvh_w(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_d(s4, s0);  // stage 3: doubleword interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_d(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_d(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_d(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_d(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_d(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_d(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_d(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {  // in place, 4 packets
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);  // stage 1: halfword interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_w(t2, t0);  // stage 2: word interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_w(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_w(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_w(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {  // in-place 4x4 transpose
+  __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);  // stage 1: word interleave of packet pairs
+  __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
+  __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
+  __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_d(T2, T0);  // stage 2: doubleword interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_d(T2, T0);
+  kernel.packet[2] = __lsx_vilvl_d(T3, T1);
+  kernel.packet[3] = __lsx_vilvh_d(T3, T1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {  // in-place 2x2 transpose
+  const __m128i low = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);  // low 64-bit lanes of both packets
+  kernel.packet[1] = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);   // high lanes; inputs are still unmodified
+  kernel.packet[0] = low;
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {  // in-place 16x16
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);  // stage 1: byte interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
+  __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
+  __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
+  __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
+  __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
+  __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
+
+  __m128i s0 = __lsx_vilvl_h(t2, t0);  // stage 2: halfword interleave of the stage-1 results
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+  __m128i s8 = __lsx_vilvl_h(ta, t8);
+  __m128i s9 = __lsx_vilvh_h(ta, t8);
+  __m128i sa = __lsx_vilvl_h(tb, t9);
+  __m128i sb = __lsx_vilvh_h(tb, t9);
+  __m128i sc = __lsx_vilvl_h(te, tc);
+  __m128i sd = __lsx_vilvh_h(te, tc);
+  __m128i se = __lsx_vilvl_h(tf, td);
+  __m128i sf = __lsx_vilvh_h(tf, td);
+
+  __m128i u0 = __lsx_vilvl_w(s4, s0);  // stage 3: word interleave of the stage-2 results
+  __m128i u1 = __lsx_vilvh_w(s4, s0);
+  __m128i u2 = __lsx_vilvl_w(s5, s1);
+  __m128i u3 = __lsx_vilvh_w(s5, s1);
+  __m128i u4 = __lsx_vilvl_w(s6, s2);
+  __m128i u5 = __lsx_vilvh_w(s6, s2);
+  __m128i u6 = __lsx_vilvl_w(s7, s3);
+  __m128i u7 = __lsx_vilvh_w(s7, s3);
+  __m128i u8 = __lsx_vilvl_w(sc, s8);
+  __m128i u9 = __lsx_vilvh_w(sc, s8);
+  __m128i ua = __lsx_vilvl_w(sd, s9);
+  __m128i ub = __lsx_vilvh_w(sd, s9);
+  __m128i uc = __lsx_vilvl_w(se, sa);
+  __m128i ud = __lsx_vilvh_w(se, sa);
+  __m128i ue = __lsx_vilvl_w(sf, sb);
+  __m128i uf = __lsx_vilvh_w(sf, sb);
+
+  kernel.packet[0] = __lsx_vilvl_d(u8, u0);  // stage 4: doubleword interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_d(u8, u0);
+  kernel.packet[2] = __lsx_vilvl_d(u9, u1);
+  kernel.packet[3] = __lsx_vilvh_d(u9, u1);
+  kernel.packet[4] = __lsx_vilvl_d(ua, u2);
+  kernel.packet[5] = __lsx_vilvh_d(ua, u2);
+  kernel.packet[6] = __lsx_vilvl_d(ub, u3);
+  kernel.packet[7] = __lsx_vilvh_d(ub, u3);
+  kernel.packet[8] = __lsx_vilvl_d(uc, u4);
+  kernel.packet[9] = __lsx_vilvh_d(uc, u4);
+  kernel.packet[10] = __lsx_vilvl_d(ud, u5);
+  kernel.packet[11] = __lsx_vilvh_d(ud, u5);
+  kernel.packet[12] = __lsx_vilvl_d(ue, u6);
+  kernel.packet[13] = __lsx_vilvh_d(ue, u6);
+  kernel.packet[14] = __lsx_vilvl_d(uf, u7);
+  kernel.packet[15] = __lsx_vilvh_d(uf, u7);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {  // in place, 8 packets
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);  // stage 1: byte interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_h(t2, t0);  // stage 2: halfword interleave of the stage-1 results
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_w(s4, s0);  // stage 3: word interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_w(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_w(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_w(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_w(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_w(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_w(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_w(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {  // in place, 4 packets
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);  // stage 1: byte interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_h(t2, t0);  // stage 2: halfword interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_h(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_h(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_h(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {  // in place, 8 packets
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);  // stage 1: halfword interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_w(t2, t0);  // stage 2: word interleave of the stage-1 results
+  __m128i s1 = __lsx_vilvh_w(t2, t0);
+  __m128i s2 = __lsx_vilvl_w(t3, t1);
+  __m128i s3 = __lsx_vilvh_w(t3, t1);
+  __m128i s4 = __lsx_vilvl_w(t6, t4);
+  __m128i s5 = __lsx_vilvh_w(t6, t4);
+  __m128i s6 = __lsx_vilvl_w(t7, t5);
+  __m128i s7 = __lsx_vilvh_w(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_d(s4, s0);  // stage 3: doubleword interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_d(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_d(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_d(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_d(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_d(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_d(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_d(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {  // in place, 4 packets
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);  // stage 1: halfword interleave of packet pairs
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_w(t2, t0);  // stage 2: word interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_w(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_w(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_w(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {  // in-place 4x4 transpose
+  __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);  // stage 1: word interleave of packet pairs
+  __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
+  __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
+  __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_d(T2, T0);  // stage 2: doubleword interleave, written back into the block
+  kernel.packet[1] = __lsx_vilvh_d(T2, T0);
+  kernel.packet[2] = __lsx_vilvl_d(T3, T1);
+  kernel.packet[3] = __lsx_vilvh_d(T3, T1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {  // in-place 2x2 transpose
+  const __m128i low = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);  // low 64-bit lanes of both packets
+  kernel.packet[1] = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);   // high lanes; inputs are still unmodified
+  kernel.packet[0] = low;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {  // per-lane reciprocal square root (float)
+  return __lsx_vfrsqrt_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {  // per-lane reciprocal square root (double)
+  return __lsx_vfrsqrt_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) {  // round each float lane toward -infinity
+  return __lsx_vfrintrm_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) {  // round each double lane toward -infinity
+  return __lsx_vfrintrm_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) {  // round each float lane toward +infinity
+  return __lsx_vfrintrp_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) {  // round each double lane toward +infinity
+  return __lsx_vfrintrp_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) {  // round half away from zero, float lanes
+  const Packet4f mask = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x80000000u));  // sign bit
+  const Packet4f prev0dot5 = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));  // just below 0.5f
+  return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));  // add sign-matched offset, then truncate
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) {  // round half away from zero, double lanes
+  const Packet2d mask = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x8000000000000000ull));  // sign bit
+  const Packet2d prev0dot5 = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));  // < 0.5
+  return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));  // add sign-matched offset, then truncate
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);  // bitwise: mask bit set -> a, else b
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
+  return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);  // bitwise: mask bit set -> a, else b
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {  // reads from[0..3], each repeated 4 times
+  int8_t tmp[16] = {*from,       *from,       *from,       *from,       *(from + 1), *(from + 1),
+                    *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
+                    *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
+  return __lsx_vld(tmp, 0);  // load the staged 16 bytes as one vector
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {  // reads from[0..3], each repeated 4x
+  uint8_t tmp[16] = {*from,       *from,       *from,       *from,       *(from + 1), *(from + 1),
+                     *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
+                     *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
+  return __lsx_vld(tmp, 0);  // load the staged 16 bytes as one vector
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {  // reads from[0..1], each repeated 4 times
+  int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {  // reads from[0..1], each repeated 4 times
+  uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {  // reads from[0] only, broadcast to 4 lanes
+  int32_t tmp[4] = {*from, *from, *from, *from};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {  // reads from[0] only, broadcast to 4 lanes
+  uint32_t tmp[4] = {*from, *from, *from, *from};
+  return __lsx_vld(tmp, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {  // -(a*b) - c
+  return __lsx_vmsub_b(pnegate(c), a, b);  // multiply-subtract with accumulator -c
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {  // -(a*b) - c
+  return __lsx_vmsub_h(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {  // -(a*b) - c
+  return __lsx_vmsub_w(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {  // -(a*b) - c
+  return __lsx_vmsub_d(pnegate(c), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {  // a*b - c
+  return __lsx_vmadd_b(pnegate(c), a, b);  // multiply-add with accumulator -c
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {  // a*b - c
+  return __lsx_vmadd_h(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {  // a*b - c
+  return __lsx_vmadd_w(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {  // a*b - c
+  return __lsx_vmadd_d(pnegate(c), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {  // c - a*b
+  return __lsx_vmsub_b(c, a, b);  // multiply-subtract with accumulator c
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {  // c - a*b
+  return __lsx_vmsub_h(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {  // c - a*b
+  return __lsx_vmsub_w(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {  // c - a*b
+  return __lsx_vmsub_d(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) {  // forwards to the pexp_float helper
+  return pexp_float(_x);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) {  // forwards to the pexp_double helper
+  return pexp_double(_x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {  // generic fallback
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {  // exponent is an output
+  return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {  // exponent is an output
+  return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {  // all-zero packet; the argument only selects the type
+  Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {  // |a - b| per lane
+  Packet4f v = psub(a, b);
+  return pabs(v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmin<Packet4f>(a, b);  // NOTE(review): forwards to the default pmin; confirm it really propagates NaN
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmax<Packet4f>(a, b);  // NOTE(review): forwards to the default pmax; confirm it really propagates NaN
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {  // broadcast from[0] to all 4 lanes
+  return (__m128)__lsx_vldrepl_w(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {  // all-ones lane where the sign bit is set, else zero
+  return (__m128)__lsx_vsrai_w((__m128i)a, 31);  // arithmetic shift smears bit 31 across the lane
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {  // round to nearest, ties to even
+  return __lsx_vfrintrne_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {  // round toward zero
+  return __lsx_vfrintrz_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {  // per-lane reciprocal via vfrecip.s
+  return __lsx_vfrecip_s(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) {  // all-zero packet; the argument only selects the type
+  Packet2d v = {0.0, 0.0};
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmin<Packet2d>(a, b);  // NOTE(review): forwards to the default pmin; confirm it really propagates NaN
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmax<Packet2d>(a, b);  // NOTE(review): forwards to the default pmax; confirm it really propagates NaN
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {  // all-ones lane where the sign bit is set, else zero
+  return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));  // arithmetic shift smears bit 63 across the lane
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);  // bitwise: mask bit set -> a, else b
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {  // round to nearest, ties to even
+  return __lsx_vfrintrne_d(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {  // round toward zero
+  return __lsx_vfrintrz_d(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {  // generic fallback
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  // Lane-wise absolute value of the (wrapping) 8-bit difference a - b.
+  return pabs(psub(a, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  // Lane-wise absolute value of the (wrapping) 16-bit difference a - b.
+  return pabs(psub(a, b));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
+  return __lsx_vbitsel_v(b, a, mask);  // bitwise: mask bit set -> a, else b
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  // Lane-wise absolute value of the (wrapping) 32-bit difference a - b.
+  return pabs(psub(a, b));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return __lsx_vbitsel_v(b, a, mask);  // bitwise: mask bit set -> a, else b
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
+  return __lsx_vbitsel_v(b, a, mask);  // bitwise: mask bit set -> a, else b
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {  // unsigned byte a / b
+  return __lsx_vdiv_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  // Unsigned |a - b| per byte lane. psub would wrap modulo 256 when a < b, so use the absolute-difference op.
+  return __lsx_vabsd_bu(a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
+                                                         const Packet16uc& b) {
+  return __lsx_vbitsel_v(b, a, mask);  // bitwise: mask bit set -> a, else b
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {  // integer square root per byte lane, bit-by-bit search
+  __m128i res = {0, 0};
+  __m128i add = {0x0808080808080808, 0x0808080808080808};  // candidate bit, starts at bit 3 of every byte
+  for (int i = 0; i < 4; i++) {  // tests bits 3..0: the root of an 8-bit value fits in 4 bits
+    const __m128i temp = __lsx_vor_v(res, add);  // tentative root with the candidate bit set
+    const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
+    res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));  // keep the bit where temp*temp <= a
+    add = __lsx_vsrli_b(add, 1);
+  }
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  // Unsigned |a - b| per 16-bit lane. psub would wrap modulo 2^16 when a < b, so use the absolute-difference op.
+  return __lsx_vabsd_hu(a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
+  return __lsx_vbitsel_v(b, a, mask);  // bitwise: mask bit set -> a, else b
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {  // integer square root per 16-bit lane, bit-by-bit search
+  __m128i res = {0, 0};
+  __m128i add = {0x0080008000800080, 0x0080008000800080};  // candidate bit, starts at bit 7 of every lane
+  for (int i = 0; i < 8; i++) {  // the root of a 16-bit value has 8 bits, so bits 7..0 must all be tested
+    const __m128i temp = __lsx_vor_v(res, add);  // tentative root with the candidate bit set
+    const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
+    res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));  // keep the bit where temp*temp <= a
+    add = __lsx_vsrli_h(add, 1);
+  }
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  // Unsigned |a - b| per 32-bit lane. psub would wrap modulo 2^32 when a < b, so use the absolute-difference op.
+  return __lsx_vabsd_wu(a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vbitsel_v(b, a, mask);  // bitwise: mask bit set -> a, else b
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {  // integer square root per 32-bit lane, bit-by-bit search
+  __m128i res = {0, 0};
+  __m128i add = {0x0000800000008000, 0x0000800000008000};  // candidate bit, starts at bit 15 of every lane
+  for (int i = 0; i < 16; i++) {  // the root of a 32-bit value has 16 bits, so bits 15..0 must all be tested
+    const __m128i temp = __lsx_vor_v(res, add);  // tentative root with the candidate bit set
+    const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
+    res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));  // keep the bit where temp*temp <= a
+    add = __lsx_vsrli_w(add, 1);
+  }
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vbitsel_v(b, a, mask);  // bitwise: mask bit set -> a, else b
+}
+
+}  // namespace internal
+}  // namespace Eigen
+#endif
diff --git a/Eigen/src/Core/arch/LSX/TypeCasting.h b/Eigen/src/Core/arch/LSX/TypeCasting.h
new file mode 100644
index 0000000..cda8680
--- /dev/null
+++ b/Eigen/src/Core/arch/LSX/TypeCasting.h
@@ -0,0 +1,526 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
+// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_LSX_H
+#define EIGEN_TYPE_CASTING_LSX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//==============================================================================
+// preinterpret
+//==============================================================================
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {  // bit-cast only, no conversion
+  return (__m128)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
+  return (__m128)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return (__m128d)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return (__m128d)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {  // lane width changes: 4x32 -> 2x64
+  return (__m128d)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {  // signedness change only
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {  // raw float bits as integers
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {  // lane width changes: 2x64 -> 4x32
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
+  return (__m128i)a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {  // two low floats -> two int64 lanes
+  Packet2d tmp = __lsx_vfcvtl_d_s(a);  // widen the low half of a to double first
+  return __lsx_vftintrz_l_d(tmp);  // truncate toward zero like a scalar cast; vftint would use the FCSR mode
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {  // two low floats -> two uint64 lanes
+  Packet2d tmp = __lsx_vfcvtl_d_s(a);  // widen the low half of a to double first
+  return __lsx_vftintrz_lu_d(tmp);  // truncate toward zero like a scalar cast; vftint would use the FCSR mode
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {  // float -> int32, truncating
+  return __lsx_vftintrz_w_s(a);  // round toward zero like a scalar cast; vftint would use the FCSR mode
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {  // float -> uint32, truncating
+  return __lsx_vftintrz_wu_s(a);  // round toward zero like a scalar cast; vftint would use the FCSR mode
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4f, Packet8s>(const Packet4f& a, const Packet4f& b) {  // 2x4 floats -> 8xi16
+  return __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0);  // NOTE(review): confirm a/b lane order
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4f, Packet8us>(const Packet4f& a, const Packet4f& b) {  // -> 8 x uint16
+  return __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0);  // NOTE(review): confirm a/b lane order
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4f, Packet16c>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                         const Packet4f& d) {
+  Packet8s tmp1 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0);  // NOTE(review): vftint rounds per
+  Packet8s tmp2 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(c), __lsx_vftint_w_s(d), 0);  // FCSR mode, not toward zero
+  return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);  // NOTE(review): logical-shift narrow; check negatives
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4f, Packet16uc>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                           const Packet4f& d) {
+  Packet8us tmp1 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0);  // narrow 32 -> 16, saturating
+  Packet8us tmp2 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(c), __lsx_vftint_wu_s(d), 0);
+  return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);  // narrow 16 -> 8; NOTE(review): confirm lane order
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16c, Packet4f>(const Packet16c& a) {  // leading int8 lanes -> 4 floats
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);  // widen low bytes to 16 bit (shift amount 0)
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);  // widen low halfwords to 32 bit
+  return __lsx_vffint_s_w(tmp2);  // signed int32 -> float
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16c, Packet2l>(const Packet16c& a) {  // leading int8 lanes -> 2 x int64
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+  return __lsx_vsllwil_d_w((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {  // signed widening, then relabelled
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+  return (Packet2ul)__lsx_vsllwil_d_w((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16c, Packet4i>(const Packet16c& a) {  // leading int8 lanes -> 4 x int32
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  return __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {  // signed widening, then relabelled
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  return (Packet4ui)__lsx_vsllwil_w_h((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16c, Packet8s>(const Packet16c& a) {  // leading int8 lanes -> 8 x int16
+  return __lsx_vsllwil_h_b((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {  // signed widening, then relabelled
+  return (Packet8us)__lsx_vsllwil_h_b((__m128i)a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16uc, Packet4f>(const Packet16uc& a) {  // leading uint8 lanes -> 4 floats
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);  // widen low bytes to 16 bit, unsigned (shift amount 0)
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);  // widen low halfwords to 32 bit, unsigned
+  return __lsx_vffint_s_wu(tmp2);  // unsigned int32 -> float
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16uc, Packet2ul>(const Packet16uc& a) {  // leading uint8 -> 2 x uint64
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+  return __lsx_vsllwil_du_wu((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {  // unsigned widening, then relabelled
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+  return (Packet2l)__lsx_vsllwil_du_wu((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16uc, Packet4ui>(const Packet16uc& a) {  // leading uint8 -> 4 x uint32
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  return __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {  // unsigned widening, then relabelled
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  return (Packet4i)__lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16uc, Packet8us>(const Packet16uc& a) {  // leading uint8 -> 8 x uint16
+  return __lsx_vsllwil_hu_bu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {  // unsigned widening, then relabelled
+  return (Packet8s)__lsx_vsllwil_hu_bu((__m128i)a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8s, Packet4f>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return __lsx_vffint_s_w(tmp1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8s, Packet2l>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return __lsx_vsllwil_d_w((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return (Packet2ul)__lsx_vsllwil_d_w((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8s, Packet4i>(const Packet8s& a) {
+  return __lsx_vsllwil_w_h((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
+  return (Packet4ui)__lsx_vsllwil_w_h((__m128i)a, 0);
+}
+template <>  // narrowing cast: two Packet8s inputs are packed into one Packet16c
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8s, Packet16c>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0);  // NOTE(review): logical-shift narrow; verify sign/lane order
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8s, Packet16uc>(const Packet8s& a, const Packet8s& b) {
+  return (Packet16uc)__lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8us, Packet4f>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return __lsx_vffint_s_wu(tmp1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8us, Packet2ul>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return __lsx_vsllwil_du_wu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return (Packet2l)__lsx_vsllwil_du_wu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8us, Packet4ui>(const Packet8us& a) {
+  return __lsx_vsllwil_wu_hu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
+  return (Packet4i)__lsx_vsllwil_wu_hu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8us, Packet16uc>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
+  return (Packet16c)__lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return __lsx_vffint_s_w(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4i, Packet2l>(const Packet4i& a) {
+  return __lsx_vsllwil_d_w((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
+  return (Packet2ul)__lsx_vsllwil_d_w((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4i, Packet8s>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4i, Packet8us>(const Packet4i& a, const Packet4i& b) {
+  return (Packet8us)__lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4i, Packet16c>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                         const Packet4i& d) {
+  Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+  Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4i, Packet16uc>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                           const Packet4i& d) {
+  Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+  Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0);
+  return (Packet16uc)__lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return __lsx_vffint_s_wu(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4ui, Packet2ul>(const Packet4ui& a) {
+  return __lsx_vsllwil_du_wu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
+  return (Packet2l)__lsx_vsllwil_du_wu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4ui, Packet8us>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
+  return (Packet8s)__lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4ui, Packet16uc>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                            const Packet4ui& d) {
+  Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+  Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                          const Packet4ui& d) {
+  Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+  Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0);
+  return (Packet16c)__lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2l, Packet4f>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vffint_s_w(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2l, Packet4i>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2l, Packet4ui>(const Packet2l& a, const Packet2l& b) {
+  return (Packet4ui)__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2l, Packet8s>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                       const Packet2l& d) {
+  Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+  Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2l, Packet8us>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d) {
+  Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+  Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
+  return (Packet8us)__lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                         const Packet2l& g, const Packet2l& h) {
+  const Packet8s abcd = pcast<Packet2l, Packet8s>(a, b, c, d);
+  const Packet8s efgh = pcast<Packet2l, Packet8s>(e, f, g, h);
+  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                           const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                           const Packet2l& g, const Packet2l& h) {
+  const Packet8us abcd = pcast<Packet2l, Packet8us>(a, b, c, d);
+  const Packet8us efgh = pcast<Packet2l, Packet8us>(e, f, g, h);
+  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+
+template <>  // narrowing cast: two Packet2ul inputs -> one Packet4f, via unsigned 32-bit lanes
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2ul, Packet4f>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vffint_s_wu(__lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0));  // unsigned saturation keeps >= 2^31
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2ul, Packet4ui>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
+  return (Packet4i)__lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2ul, Packet8us>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d) {
+  Packet4ui tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+  Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                        const Packet2ul& d) {
+  Packet4ui tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+  Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
+  return (Packet8s)__lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>  // narrowing cast: eight Packet2ul inputs -> one Packet16uc, unsigned at every step
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                            const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                            const Packet2ul& g, const Packet2ul& h) {
+  const Packet8us abcd = pcast<Packet2ul, Packet8us>(a, b, c, d);  // 64 -> 16 bit, unsigned saturation
+  const Packet8us efgh = pcast<Packet2ul, Packet8us>(e, f, g, h);
+  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);  // saturate to 0xFF; the signed form capped at 0x7F
+}
+template <>  // narrowing cast: eight Packet2ul inputs -> one Packet16c (unsigned narrow, then reinterpret)
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                          const Packet2ul& g, const Packet2ul& h) {
+  const Packet8us narrowed_abcd = pcast<Packet2ul, Packet8us>(a, b, c, d);  // a..d as eight 16-bit lanes
+  const Packet8us narrowed_efgh = pcast<Packet2ul, Packet8us>(e, f, g, h);  // e..h as eight 16-bit lanes
+  return (Packet16c)__lsx_vssrlni_bu_h((__m128i)narrowed_abcd, (__m128i)narrowed_efgh, 0);  // final 16 -> 8 bit step
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfcvt_s_d(b, a);
+}
+template <>  // lane-wise double -> int64 conversion
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
+  return __lsx_vftint_l_d(a);  // NOTE(review): vftint presumably rounds per FCSR mode, not truncation - confirm
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d, Packet2ul>(const Packet2d& a) {
+  return __lsx_vftint_lu_d(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2d, Packet4ui>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2d, Packet8s>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                       const Packet2d& d) {
+  Packet4i tmp1 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0);
+  Packet4i tmp2 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(c), __lsx_vftint_l_d(d), 0);
+  return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2d, Packet8us>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d) {
+  Packet4ui tmp1 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0);
+  Packet4ui tmp2 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(c), __lsx_vftint_lu_d(d), 0);
+  return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                         const Packet2d& g, const Packet2d& h) {
+  const Packet8s abcd = pcast<Packet2d, Packet8s>(a, b, c, d);
+  const Packet8s efgh = pcast<Packet2d, Packet8s>(e, f, g, h);
+  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                           const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                           const Packet2d& g, const Packet2d& h) {
+  const Packet8us abcd = pcast<Packet2d, Packet8us>(a, b, c, d);
+  const Packet8us efgh = pcast<Packet2d, Packet8us>(e, f, g, h);
+  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  return __lsx_vfcvtl_d_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+  return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)tmp2, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+  return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)tmp2, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
+  Packet4i tmp = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)tmp, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
+  Packet4ui tmp = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)tmp, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+  return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4ui, Packet2d>(const Packet4ui& a) {
+  return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+  return __lsx_vffint_d_l(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
+  return __lsx_vffint_d_lu(a);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_LSX_H
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
index c1843c3..81da24f 100644
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -91,9 +91,6 @@ struct packet_traits<float> : default_packet_traits {
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
     HasBlend = 1
   };
 };
@@ -859,9 +856,6 @@ struct packet_traits<double> : default_packet_traits {
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
     HasBlend = 1
   };
 };
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 5257c03..f3f6a1a 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -73,30 +73,13 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet1cf> {
-  typedef std::complex<float> type;
-  typedef Packet1cf half;
-  typedef Packet2f as_real;
-  enum {
-    size = 1,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet1cf> : neon_unpacket_default<Packet1cf, std::complex<float>> {
+  using as_real = Packet2f;
 };
 template <>
-struct unpacket_traits<Packet2cf> {
-  typedef std::complex<float> type;
-  typedef Packet1cf half;
-  typedef Packet4f as_real;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet2cf> : neon_unpacket_default<Packet2cf, std::complex<float>> {
+  using half = Packet1cf;
+  using as_real = Packet4f;
 };
 
 template <>
@@ -108,6 +91,16 @@ EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f, Packet2cf>(const Packet2f& a) {
   return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a))));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet1cf pzero(const Packet1cf& /*a*/) {
+  return Packet1cf(vdup_n_f32(0.0f));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pzero(const Packet2cf& /*a*/) {
+  return Packet2cf(vdupq_n_f32(0.0f));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from) {
   return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from)));
@@ -156,6 +149,20 @@ EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
   return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
+#ifdef __ARM_FEATURE_COMPLEX
+template <>  // complex multiply-add a * b + c, built from two complex-MLA intrinsic steps
+EIGEN_STRONG_INLINE Packet1cf pmadd<Packet1cf>(const Packet1cf& a, const Packet1cf& b, const Packet1cf& c) {
+  Packet1cf result;
+  result.v = vcmla_f32(c.v, a.v, b.v);             // rotation 0: c + Re(a) * b
+  result.v = vcmla_rot90_f32(result.v, a.v, b.v);  // rotation 90: adds i * Im(a) * b
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return pmadd(a, b, pzero(a));
+}
+#else
 template <>
 EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
   Packet2f v1, v2;
@@ -175,6 +182,22 @@ EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1c
   // Add and return the result
   return Packet1cf(vadd_f32(v1, v2));
 }
+#endif
+
+#ifdef __ARM_FEATURE_COMPLEX
+template <>  // complex multiply-add a * b + c on two complex<float> lanes at once
+EIGEN_STRONG_INLINE Packet2cf pmadd<Packet2cf>(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  Packet2cf result;
+  result.v = vcmlaq_f32(c.v, a.v, b.v);             // rotation 0: c + Re(a) * b
+  result.v = vcmlaq_rot90_f32(result.v, a.v, b.v);  // rotation 90: adds i * Im(a) * b
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pmadd(a, b, pzero(a));
+}
+#else
 template <>
 EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   Packet4f v1, v2;
@@ -194,6 +217,7 @@ EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2c
   // Add and return the result
   return Packet2cf(vaddq_f32(v1, v2));
 }
+#endif
 
 template <>
 EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) {
@@ -256,10 +280,12 @@ EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet1cf>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2cf>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from)));
 }
 
@@ -283,10 +309,12 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* fro
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet1cf>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2cf>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v);
 }
 
@@ -461,13 +489,10 @@ EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
-// See bug 1325, clang fails to call vld1q_u64.
-#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CPE
-static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
-#else
-const uint64_t p2ul_conj_XOR_DATA[] = {0x0, 0x8000000000000000};
-static uint64x2_t p2ul_CONJ_XOR = vld1q_u64(p2ul_conj_XOR_DATA);
-#endif
+inline uint64x2_t p2ul_CONJ_XOR() {  // XOR mask for conjugation; replaces the former namespace-scope static vector
+  static const uint64_t p2ul_conj_XOR_DATA[] = {0x0, 0x8000000000000000};  // only the top bit of lane 1 is set
+  return vld1q_u64(p2ul_conj_XOR_DATA);  // loaded on each call, so no static initialisation of a vector object
+}
 
 struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
@@ -500,21 +525,13 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet1cd> {
-  typedef std::complex<double> type;
-  typedef Packet1cd half;
-  typedef Packet2d as_real;
-  enum {
-    size = 1,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet1cd> : neon_unpacket_default<Packet1cd, std::complex<double>> {
+  using as_real = Packet2d;
 };
 
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet1cd>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from)));
 }
 
@@ -523,6 +540,11 @@ EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from
   EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet1cd pzero<Packet1cd>(const Packet1cd& /*a*/) {
+  return Packet1cd(vdupq_n_f64(0.0));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
   /* here we really have to use unaligned loads :( */
@@ -546,9 +568,23 @@ EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
-  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR)));
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR())));
+}
+
+#ifdef __ARM_FEATURE_COMPLEX
+template <>  // complex<double> multiply-add a * b + c, built from two complex-MLA intrinsic steps
+EIGEN_STRONG_INLINE Packet1cd pmadd<Packet1cd>(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  Packet1cd result;
+  result.v = vcmlaq_f64(c.v, a.v, b.v);             // rotation 0: c + Re(a) * b
+  result.v = vcmlaq_rot90_f64(result.v, a.v, b.v);  // rotation 90: adds i * Im(a) * b
+  return result;
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pmadd(a, b, pzero(a));
+}
+#else
 template <>
 EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
   Packet2d v1, v2;
@@ -562,12 +598,13 @@ EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1c
   // Multiply the imag a with b
   v2 = vmulq_f64(v2, b.v);
   // Conjugate v2
-  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
+  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = preverse<Packet2d>(v2);
   // Add and return the result
   return Packet1cd(vaddq_f64(v1, v2));
 }
+#endif
 
 template <>
 EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
@@ -608,6 +645,7 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* fr
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet1cd>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v);
 }
 
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 3d2e7bd..0046e01 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -27,7 +27,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet4hf ptanh<Packet4hf>(const Packet4hf
 
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet8hf ptanh<Packet8hf>(const Packet8hf& x) {
-  // Convert each 4 halfs to float, call the float ptanh, and then convert back.
+  // Convert each 4 half types to float, call the float ptanh, and then convert back.
   return vcombine_f16(vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(vget_low_f16(x)))),
                       vcvt_f16_f32(ptanh<Packet4f>(vcvt_high_f32_f16(x))));
 }
@@ -37,6 +37,7 @@ BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp2)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
 
 template <>
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 71e5f5f..bea50a3 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -103,7 +103,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
   return res;
 }
 
-// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
+// functionally equivalent to _mm_shuffle_ps in SSE when interleave
 // == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
 // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
 // to enable a shared implementation for fast inversion of matrices of size 4.
@@ -196,12 +196,7 @@ struct packet_traits<float> : default_packet_traits {
     HasConj = 1,
     HasSetLinear = 1,
     HasBlend = 0,
-
     HasDiv = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1,
-
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasACos = 1,
@@ -210,10 +205,13 @@ struct packet_traits<float> : default_packet_traits {
     HasATanh = 1,
     HasLog = 1,
     HasExp = 1,
+    HasPow = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasBessel = 0,  // Issues with accuracy.
     HasNdtri = 0
   };
@@ -440,224 +438,84 @@ struct packet_traits<uint64_t> : default_packet_traits {
   };
 };
 
-template <>
-struct unpacket_traits<Packet2f> {
-  typedef float type;
-  typedef Packet2f half;
-  typedef Packet2i integer_packet;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+template <typename Packet, typename Scalar>  // shared base for the NEON unpacket_traits specializations
+struct neon_unpacket_default {
+  using type = Scalar;  // scalar element type of the packet
+  using half = Packet;  // default: the packet is its own half; specializations redeclare it where a smaller one exists
+  static constexpr int size = sizeof(Packet) / sizeof(Scalar);  // lane count derived from the storage sizes
+  static constexpr int alignment = sizeof(Packet);  // NOTE(review): replaces fixed Aligned16/Unaligned - confirm
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
 };
+
 template <>
-struct unpacket_traits<Packet4f> {
-  typedef float type;
-  typedef Packet2f half;
-  typedef Packet4i integer_packet;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
+  using integer_packet = Packet2i;
 };
 template <>
-struct unpacket_traits<Packet4c> {
-  typedef int8_t type;
-  typedef Packet4c half;
-  enum {
-    size = 4,
-    alignment = Unaligned,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
+  using half = Packet2f;
+  using integer_packet = Packet4i;
 };
 template <>
-struct unpacket_traits<Packet8c> {
-  typedef int8_t type;
-  typedef Packet4c half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
 template <>
-struct unpacket_traits<Packet16c> {
-  typedef int8_t type;
-  typedef Packet8c half;
-  enum {
-    size = 16,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
+  using half = Packet4c;
 };
 template <>
-struct unpacket_traits<Packet4uc> {
-  typedef uint8_t type;
-  typedef Packet4uc half;
-  enum {
-    size = 4,
-    alignment = Unaligned,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
+  using half = Packet8c;
 };
 template <>
-struct unpacket_traits<Packet8uc> {
-  typedef uint8_t type;
-  typedef Packet4uc half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
 template <>
-struct unpacket_traits<Packet16uc> {
-  typedef uint8_t type;
-  typedef Packet8uc half;
-  enum {
-    size = 16,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
+  using half = Packet4uc;
 };
 template <>
-struct unpacket_traits<Packet4s> {
-  typedef int16_t type;
-  typedef Packet4s half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
+  using half = Packet8uc;
 };
 template <>
-struct unpacket_traits<Packet8s> {
-  typedef int16_t type;
-  typedef Packet4s half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
 template <>
-struct unpacket_traits<Packet4us> {
-  typedef uint16_t type;
-  typedef Packet4us half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
+  using half = Packet4s;
 };
 template <>
-struct unpacket_traits<Packet8us> {
-  typedef uint16_t type;
-  typedef Packet4us half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
 template <>
-struct unpacket_traits<Packet2i> {
-  typedef int32_t type;
-  typedef Packet2i half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
+  using half = Packet4us;
 };
 template <>
-struct unpacket_traits<Packet4i> {
-  typedef int32_t type;
-  typedef Packet2i half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
 template <>
-struct unpacket_traits<Packet2ui> {
-  typedef uint32_t type;
-  typedef Packet2ui half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
+  using half = Packet2i;
 };
 template <>
-struct unpacket_traits<Packet4ui> {
-  typedef uint32_t type;
-  typedef Packet2ui half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
 template <>
-struct unpacket_traits<Packet2l> {
-  typedef int64_t type;
-  typedef Packet2l half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
+  using half = Packet2ui;
 };
 template <>
-struct unpacket_traits<Packet2ul> {
-  typedef uint64_t type;
-  typedef Packet2ul half;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
+template <>
+struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) {
+  return vdup_n_f32(0.0f);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
+  return vdupq_n_f32(0.0f);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) {
@@ -1280,6 +1138,14 @@ template <>
 EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
   return vfma_f32(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vfmsq_f32(c, a, b);  // c - a * b, as one fused multiply-subtract (accumulator is the first operand)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vfms_f32(c, a, b);  // c - a * b, as one fused multiply-subtract (accumulator is the first operand)
+}
 #else
 template <>
 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
@@ -1289,7 +1155,31 @@ template <>
 EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
   return vmla_f32(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vmlsq_f32(c, a, b);  // c - a * b via multiply-subtract (the #else fallback next to vmlaq-based pmadd)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vmls_f32(c, a, b);  // c - a * b via multiply-subtract (the #else fallback next to vmla-based pmadd)
+}
 #endif
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return pnegate(pnmadd(a, b, c));  // a * b - c, obtained as -(c - a * b)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return pnegate(pnmadd(a, b, c));  // a * b - c, obtained as -(c - a * b)
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return pnegate(pmadd(a, b, c));  // -(a * b) - c, obtained as -(a * b + c)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return pnegate(pmadd(a, b, c));  // -(a * b) - c, obtained as -(a * b + c)
+}
 
 // No FMA instruction for int, so use MLA unconditionally.
 template <>
@@ -2378,10 +2268,12 @@ EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2f>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4f>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from);
 }
 template <>
@@ -2392,10 +2284,12 @@ EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8c>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet16c>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from);
 }
 template <>
@@ -2406,50 +2300,62 @@ EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8uc>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet16uc>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4s>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8s>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4us>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8us>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2i>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4i>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2ui>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4ui>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2l>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2ul>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from);
 }
 
@@ -2674,10 +2580,12 @@ EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
 
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2f>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4f>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from);
 }
 template <>
@@ -2686,10 +2594,12 @@ EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8c>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet16c>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from);
 }
 template <>
@@ -2698,50 +2608,62 @@ EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8uc>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet16uc>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4s>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8s>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4us>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8us>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2i>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4i>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2ui>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4ui>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2l>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2ul>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from);
 }
 
@@ -4470,76 +4392,25 @@ EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
   return vrndpq_f32(a);
 }
 
-#else
-
-template <>
-EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
-  // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1 << 23));
-  const Packet4f abs_a = pabs(a);
-  Packet4f r = padd(abs_a, limit);
-  // Don't compile-away addition and subtraction.
-  EIGEN_OPTIMIZATION_BARRIER(r);
-  r = psub(r, limit);
-  // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
-  return r;
-}
-
 template <>
-EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
-  // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1 << 23));
-  const Packet2f abs_a = pabs(a);
-  Packet2f r = padd(abs_a, limit);
-  // Don't compile-away addition and subtraction.
-  EIGEN_OPTIMIZATION_BARRIER(r);
-  r = psub(r, limit);
-  // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
-  return r;
+EIGEN_STRONG_INLINE Packet2f pround<Packet2f>(const Packet2f& a) {
+  return vrnda_f32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp = print<Packet4f>(a);
-  // If greater, subtract one.
-  Packet4f mask = pcmp_lt(a, tmp);
-  mask = pand(mask, cst_1);
-  return psub(tmp, mask);
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  return vrndaq_f32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
-  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
-  Packet2f tmp = print<Packet2f>(a);
-  // If greater, subtract one.
-  Packet2f mask = pcmp_lt(a, tmp);
-  mask = pand(mask, cst_1);
-  return psub(tmp, mask);
+EIGEN_STRONG_INLINE Packet2f ptrunc<Packet2f>(const Packet2f& a) {
+  return vrnd_f32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp = print<Packet4f>(a);
-  // If smaller, add one.
-  Packet4f mask = pcmp_lt(tmp, a);
-  mask = pand(mask, cst_1);
-  return padd(tmp, mask);
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return vrndq_f32(a);
 }
-
-template <>
-EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
-  const Packet2f cst_1 = pset1<Packet2f>(1.0);
-  Packet2f tmp = print<Packet2f>(a);
-  // If smaller, add one.
-  Packet2f mask = pcmp_lt(tmp, a);
-  mask = pand(mask, cst_1);
-  return padd(tmp, mask);
-}
-
 #endif
 
 /**
@@ -4800,10 +4671,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
     HasSetLinear = 1,
     HasBlend = 0,
     HasDiv = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1,
-
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasLog = 1,
@@ -4817,17 +4684,7 @@ struct packet_traits<bfloat16> : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet4bf> {
-  typedef bfloat16 type;
-  typedef Packet4bf half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
 
 namespace detail {
 template <>
@@ -4882,6 +4739,7 @@ EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
 
 template <>
 EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4bf>::alignment);
   return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }
 
@@ -4892,6 +4750,7 @@ EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
 
 template <>
 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4bf>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
 }
 
@@ -4983,6 +4842,16 @@ EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a) {
   return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4bf pround<Packet4bf>(const Packet4bf& a) {
+  return F32ToBf16(pround<Packet4f>(Bf16ToF32(a)));  // Bf16ToF32 -> pround<Packet4f> -> F32ToBf16
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf ptrunc<Packet4bf>(const Packet4bf& a) {
+  return F32ToBf16(ptrunc<Packet4f>(Bf16ToF32(a)));  // Bf16ToF32 -> ptrunc<Packet4f> -> F32ToBf16
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) {
   return a;
@@ -5003,6 +4872,26 @@ EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4b
   return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));  // a * b + c, evaluated on Packet4f
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));  // a * b - c, evaluated on Packet4f
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));  // c - a * b, evaluated on Packet4f
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));  // -(a * b) - c, on Packet4f
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
@@ -5079,7 +4968,7 @@ EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a) {
 
 //---------- double ----------
 
-// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
 // Confirmed at least with __apple_build_version__ = 6000054.
 #if EIGEN_COMP_CLANGAPPLE
 // Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
@@ -5125,7 +5014,7 @@ typedef float64x1_t Packet1d;
 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
 #endif
 
-// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
+// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
 // for fast inversion of matrices of size 4.
 EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
@@ -5168,38 +5057,35 @@ struct packet_traits<double> : default_packet_traits {
     HasBlend = 0,
 
     HasDiv = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1,
 
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
     HasExp = 1,
     HasLog = 1,
+    HasPow = 1,
     HasATan = 1,
+    HasATanh = 1,
 #endif
-    HasSin = 0,
-    HasCos = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasTanh = 0,
-    HasErf = 0
+    HasCbrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH
   };
 };
 
 template <>
-struct unpacket_traits<Packet2d> {
-  typedef double type;
-  typedef Packet2d half;
-  typedef Packet2l integer_packet;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
+  using integer_packet = Packet2l;
 };
 
+template <>
+EIGEN_STRONG_INLINE Packet2d pzero<Packet2d>(const Packet2d& /*a*/) {
+  return vdupq_n_f64(0.0);  // both lanes set to 0.0; the unnamed argument only selects the overload
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
   return vdupq_n_f64(from);
@@ -5255,13 +5141,28 @@ template <>
 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
   return vfmaq_f64(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vfmsq_f64(c, a, b);  // c - a * b, as one fused multiply-subtract (accumulator is the first operand)
+}
 #else
 template <>
 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
   return vmlaq_f64(c, a, b);
 }
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vmlsq_f64(c, a, b);  // c - a * b via multiply-subtract (the #else fallback next to vmlaq-based pmadd)
+}
 #endif
-
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return pnegate(pnmadd(a, b, c));  // a * b - c, obtained as -(c - a * b)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return pnegate(pmadd(a, b, c));  // -(a * b) - c, obtained as -(a * b + c)
+}
 template <>
 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
   return vminq_f64(a, b);
@@ -5339,6 +5240,7 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
 
 template <>
 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2d>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from);
 }
 
@@ -5353,6 +5255,7 @@ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2d>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from);
 }
 
@@ -5460,6 +5363,16 @@ EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
   return vrndpq_f64(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  return vrndaq_f64(a);  // round to nearest integral value, ties away from zero
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return vrndq_f64(a);  // round toward zero (drop the fractional part)
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
   return pldexp_generic(a, exponent);
@@ -5521,9 +5434,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasInsert = 1,
     HasReduxp = 1,
     HasDiv = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1,
     HasSin = 0,
     HasCos = 0,
     HasLog = 0,
@@ -5538,29 +5448,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
 };
 
 template <>
-struct unpacket_traits<Packet4hf> {
-  typedef Eigen::half type;
-  typedef Packet4hf half;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-
+struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
 template <>
-struct unpacket_traits<Packet8hf> {
-  typedef Eigen::half type;
-  typedef Packet4hf half;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
+struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
+  using half = Packet4hf;
 };
 
 template <>
@@ -5662,6 +5553,36 @@ EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, cons
   return vfma_f16(c, a, b);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmsq_f16(c, a, b);  // c - a * b, as one fused multiply-subtract (accumulator is the first operand)
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfms_f16(c, a, b);  // c - a * b, as one fused multiply-subtract (accumulator is the first operand)
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return pnegate(pnmadd(a, b, c));  // a * b - c, obtained as -(c - a * b)
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return pnegate(pnmadd(a, b, c));  // a * b - c, obtained as -(c - a * b)
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return pnegate(pmadd(a, b, c));  // -(a * b) - c, obtained as -(a * b + c)
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return pnegate(pmadd(a, b, c));  // -(a * b) - c, obtained as -(a * b + c)
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   return vminq_f16(a, b);
@@ -5791,6 +5712,26 @@ EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
   return vrndp_f16(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet8hf pround<Packet8hf>(const Packet8hf& a) {
+  return vrndaq_f16(a);  // round to nearest integral value, ties away from zero
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pround<Packet4hf>(const Packet4hf& a) {
+  return vrnda_f16(a);  // round to nearest integral value, ties away from zero
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf ptrunc<Packet8hf>(const Packet8hf& a) {
+  return vrndq_f16(a);  // round toward zero (drop the fractional part)
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf ptrunc<Packet4hf>(const Packet4hf& a) {
+  return vrnd_f16(a);  // round toward zero (drop the fractional part)
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
   return vsqrtq_f16(a);
@@ -5843,11 +5784,13 @@ EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8hf>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
+  EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4hf>::alignment);
   EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
 }
 
@@ -5923,11 +5866,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a,
 
 template <>
 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8hf>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4hf>::alignment);
   EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
 }
 
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 0e70f03..f79da7b 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -89,19 +89,25 @@ EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) {
 #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
-                                 _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-  //   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-  //                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-  //                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+  __m128 tmp1 = _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2));  // a.im * (b.im, b.re)
+  __m128 tmp2 = _mm_moveldup_ps(a.v);  // (a.re, a.re) for each complex
 #else
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000));
-  return Packet2cf(
-      _mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-                 _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+  __m128 tmp1 = _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2));  // a.im * (b.im, b.re)
+  __m128 tmp2 = vec4f_swizzle1(a.v, 0, 0, 2, 2);  // (a.re, a.re) for each complex
 #endif
+#ifdef EIGEN_VECTORIZE_FMA
+  __m128 result = _mm_fmaddsub_ps(tmp2, b.v, tmp1);  // even lanes: tmp2*b - tmp1, odd lanes: tmp2*b + tmp1
+#else
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128 result = _mm_addsub_ps(_mm_mul_ps(tmp2, b.v), tmp1);  // same -/+ lane pattern, separate multiply
+#else
+  const __m128 mask = _mm_setr_ps(-0.0f, 0.0f, -0.0f, 0.0f);  // sign bit set on the even (real) lanes only
+  __m128 result = _mm_add_ps(_mm_mul_ps(tmp2, b.v), _mm_xor_ps(tmp1, mask));  // negate tmp1 on real lanes, add
+#endif
+#endif
+  return Packet2cf(result);  // (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re) for each complex
 }
 
 template <>
@@ -127,11 +133,11 @@ EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packe
 
 template <>
 EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps(&numext::real_ref(*from)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from)));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(_mm_loadu_ps(&numext::real_ref(*from)));
 }
 
 template <>
@@ -148,11 +154,11 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* fro
 
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v));
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(&numext::real_ref(*to), from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(&numext::real_ref(*to), from.v);
 }
 
 template <>
@@ -277,15 +283,24 @@ EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) {
+  __m128d tmp1 = _mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), vec2d_swizzle1(b.v, 1, 0));  // a.im * (b.im, b.re)
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128d tmp2 = _mm_movedup_pd(a.v);  // (a.re, a.re)
+#else
+  __m128d tmp2 = _mm_unpacklo_pd(a.v, a.v);  // (a.re, a.re) without the SSE3 instruction
+#endif
+#ifdef EIGEN_VECTORIZE_FMA
+  __m128d result = _mm_fmaddsub_pd(tmp2, b.v, tmp1);  // lane 0: tmp2*b - tmp1, lane 1: tmp2*b + tmp1
+#else
 #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
-                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0))));
+  __m128d result = _mm_addsub_pd(_mm_mul_pd(tmp2, b.v), tmp1);  // same -/+ lane pattern, separate multiply
 #else
-  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0));
-  return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)), mask)));
+  const __m128d mask = _mm_setr_pd(-0.0, 0.0);  // sign bit set on lane 0 (real part) only
+  __m128d result = _mm_add_pd(_mm_mul_pd(tmp2, b.v), _mm_xor_pd(tmp1, mask));  // negate tmp1 on the real lane, add
 #endif
+#endif
+  return Packet1cd(result);  // (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re)
 }
 
 template <>
@@ -312,11 +327,11 @@ EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packe
 // FIXME force unaligned load, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(_mm_loadu_pd((const double*)from));
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd
@@ -332,11 +347,11 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* fr
 // FIXME force unaligned store, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd((double*)to, from.v);
 }
 
 template <>
@@ -430,6 +445,58 @@ EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
   return pexp_complex<Packet2cf>(a);
 }
 
+#ifdef EIGEN_VECTORIZE_FMA
+// std::complex<float>
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  __m128 a_odd = _mm_movehdup_ps(a.v);   // (a.im, a.im) for each complex
+  __m128 a_even = _mm_moveldup_ps(a.v);  // (a.re, a.re) for each complex
+  __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));  // (b.im, b.re) for each complex
+  __m128 result = _mm_fmaddsub_ps(a_even, b.v, _mm_fmaddsub_ps(a_odd, b_swap, c.v));
+  return Packet2cf(result);  // a * b + c; inner term is (a.im*b.im - c.re, a.im*b.re + c.im), outer applies -/+
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmsub(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  __m128 a_odd = _mm_movehdup_ps(a.v);   // (a.im, a.im) for each complex
+  __m128 a_even = _mm_moveldup_ps(a.v);  // (a.re, a.re) for each complex
+  __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));  // (b.im, b.re) for each complex
+  __m128 result = _mm_fmaddsub_ps(a_even, b.v, _mm_fmsubadd_ps(a_odd, b_swap, c.v));
+  return Packet2cf(result);  // a * b - c; inner term is (a.im*b.im + c.re, a.im*b.re - c.im), outer applies -/+
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  return pnegate(pmsub(a, b, c));  // c - a * b, obtained as -(a * b - c)
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnmsub(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  return pnegate(pmadd(a, b, c));  // -(a * b) - c, obtained as -(a * b + c)
+}
+// std::complex<double>
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  __m128d a_odd = _mm_permute_pd(a.v, 0x3);   // (a.im, a.im)
+  __m128d a_even = _mm_movedup_pd(a.v);       // (a.re, a.re)
+  __m128d b_swap = _mm_permute_pd(b.v, 0x1);  // (b.im, b.re)
+  __m128d result = _mm_fmaddsub_pd(a_even, b.v, _mm_fmaddsub_pd(a_odd, b_swap, c.v));
+  return Packet1cd(result);  // a * b + c; inner term is (a.im*b.im - c.re, a.im*b.re + c.im), outer applies -/+
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmsub(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  __m128d a_odd = _mm_permute_pd(a.v, 0x3);   // (a.im, a.im)
+  __m128d a_even = _mm_movedup_pd(a.v);       // (a.re, a.re)
+  __m128d b_swap = _mm_permute_pd(b.v, 0x1);  // (b.im, b.re)
+  __m128d result = _mm_fmaddsub_pd(a_even, b.v, _mm_fmsubadd_pd(a_odd, b_swap, c.v));
+  return Packet1cd(result);  // a * b - c; inner term is (a.im*b.im + c.re, a.im*b.re - c.im), outer applies -/+
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  return pnegate(pmsub(a, b, c));  // c - a * b, obtained as -(a * b - c)
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnmsub(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  return pnegate(pmadd(a, b, c));  // -(a * b) - c, obtained as -(a * b + c)
+}
+#endif
 }  // end namespace internal
 }  // end namespace Eigen
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 008109a..e0119dd 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -192,18 +192,15 @@ struct packet_traits<float> : default_packet_traits {
     HasExpm1 = 1,
     HasNdtri = 1,
     HasExp = 1,
+    HasPow = 1,
     HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasBlend = 1,
-    HasCeil = 1,
-    HasFloor = 1,
-#ifdef EIGEN_VECTORIZE_SSE4_1
-    HasRound = 1,
-#endif
-    HasRint = 1,
     HasSign = 0  // The manually vectorized version is slightly slower for SSE.
   };
 };
@@ -218,18 +215,20 @@ struct packet_traits<double> : default_packet_traits {
 
     HasCmp = 1,
     HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
     HasLog = 1,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasExp = 1,
+    HasPow = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasCbrt = 1,
     HasATan = 1,
-    HasBlend = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-#ifdef EIGEN_VECTORIZE_SSE4_1
-    HasRound = 1,
-#endif
-    HasRint = 1
+    HasATanh = 1,
+    HasBlend = 1
   };
 };
 template <>
@@ -288,7 +287,7 @@ struct packet_traits<bool> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 16,
 
-    HasCmp = 1,  // note -- only pcmp_eq is defined
+    HasCmp = 1,
     HasShift = 0,
     HasAbs = 0,
     HasAbs2 = 0,
@@ -296,6 +295,7 @@ struct packet_traits<bool> : default_packet_traits {
     HasMax = 0,
     HasConj = 0,
     HasSqrt = 1,
+    HasNegate = 0,
     HasSign = 0  // Don't try to vectorize psign<bool> = identity.
   };
 };
@@ -601,11 +601,6 @@ EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
   return psub(pzero(a), a);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a) {
-  return a;
-}
-
 template <>
 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
   return a;
@@ -888,7 +883,14 @@ template <>
 EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
   return _mm_andnot_si128(b, a);
 }
-
+template <>
+EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_andnot_si128(b, a);  // (~b) & a, i.e. a AND NOT b (note the swapped intrinsic operands)
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pcmp_lt(const Packet16b& a, const Packet16b& b) {
+  return _mm_andnot_si128(a, b);  // (~a) & b: for 0/1 bytes this is 1 only where a == 0 and b == 1
+}
 template <>
 EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
   return _mm_cmple_ps(a, b);
@@ -932,7 +934,11 @@ EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));  // a <= b  <=>  a == min(a, b)
+#else
   return por(pcmp_lt(a, b), pcmp_eq(a, b));
+#endif
 }
 template <>
 EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
@@ -1141,7 +1147,7 @@ EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet&
   return pselect<Packet>(not_nan_mask_a, m, a);
 }
 
-// Add specializations for min/max with prescribed NaN progation.
+// Add specializations for min/max with prescribed NaN propagation.
 template <>
 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
@@ -1242,13 +1248,13 @@ EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
-  return _mm_and_ps(a, mask);
+  const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
+  return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));  // clear the sign bit of each 32-bit lane
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
-  return _mm_and_pd(a, mask);
+  const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
+  return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));  // clear bit 63 (sign) of each 64-bit lane
 }
 template <>
 EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
@@ -1311,73 +1317,14 @@ template <>
 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
   return _mm_floor_pd(a);
 }
-#else
-template <>
-EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
-  // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1 << 23));
-  const Packet4f abs_a = pabs(a);
-  Packet4f r = padd(abs_a, limit);
-  // Don't compile-away addition and subtraction.
-  EIGEN_OPTIMIZATION_BARRIER(r);
-  r = psub(r, limit);
-  // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
-  return r;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
-  // Adds and subtracts signum(a) * 2^52 to force rounding.
-  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull << 52));
-  const Packet2d abs_a = pabs(a);
-  Packet2d r = padd(abs_a, limit);
-  // Don't compile-away addition and subtraction.
-  EIGEN_OPTIMIZATION_BARRIER(r);
-  r = psub(r, limit);
-  // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
-  return r;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp = print<Packet4f>(a);
-  // If greater, subtract one.
-  Packet4f mask = _mm_cmpgt_ps(tmp, a);
-  mask = pand(mask, cst_1);
-  return psub(tmp, mask);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
-  const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet2d tmp = print<Packet2d>(a);
-  // If greater, subtract one.
-  Packet2d mask = _mm_cmpgt_pd(tmp, a);
-  mask = pand(mask, cst_1);
-  return psub(tmp, mask);
-}
 
 template <>
-EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp = print<Packet4f>(a);
-  // If smaller, add one.
-  Packet4f mask = _mm_cmplt_ps(tmp, a);
-  mask = pand(mask, cst_1);
-  return padd(tmp, mask);
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return _mm_round_ps(a, _MM_FROUND_TRUNC);  // round each lane toward zero
 }
-
 template <>
-EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
-  const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet2d tmp = print<Packet2d>(a);
-  // If smaller, add one.
-  Packet2d mask = _mm_cmplt_pd(tmp, a);
-  mask = pand(mask, cst_1);
-  return padd(tmp, mask);
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return _mm_round_pd(a, _MM_FROUND_TRUNC);  // round each lane toward zero
 }
 #endif
 
@@ -1762,10 +1709,24 @@ EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packe
 }
 template <>
 EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
-  to[4 * stride * 0] = _mm_cvtsi128_si32(from);
-  to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
-  to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
-  to[4 * stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+  EIGEN_ALIGN16 bool tmp[16];  // aligned scratch buffer for the 16 packed bools
+  pstore(tmp, from);           // spill the packet, then write element i to to[stride * i]
+  to[stride * 0] = tmp[0];
+  to[stride * 1] = tmp[1];
+  to[stride * 2] = tmp[2];
+  to[stride * 3] = tmp[3];
+  to[stride * 4] = tmp[4];
+  to[stride * 5] = tmp[5];
+  to[stride * 6] = tmp[6];
+  to[stride * 7] = tmp[7];
+  to[stride * 8] = tmp[8];
+  to[stride * 9] = tmp[9];
+  to[stride * 10] = tmp[10];
+  to[stride * 11] = tmp[11];
+  to[stride * 12] = tmp[12];
+  to[stride * 13] = tmp[13];
+  to[stride * 14] = tmp[14];
+  to[stride * 15] = tmp[15];
 }
 
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
@@ -1835,7 +1796,6 @@ EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f&
 
 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
 // supported by SSE, and has more range than is needed for exponents.
-// TODO(rmlarsen): Remove this specialization once Packet2l has support or casting.
 template <>
 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
   // Clamp exponent to [-2099, 2099]
@@ -1856,6 +1816,24 @@ EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d&
   return out;
 }
 
+// We specialize pldexp_fast here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-1023, 1024]
+  const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
+  const Packet2d max_exponent = pset1<Packet2d>(1024.0);
+  const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
+
+  // Convert e to 32-bit integers and move each one into the low half of its own 64-bit lane.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+
+  // Build 2^e directly: add the exponent bias (1023) and shift the sum into the exponent field (bit 52 up).
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52));  // 2^e
+  return pmul(a, c);
+}
+
 // with AVX, the default implementations based on pload1 are faster
 #ifndef __AVX__
 template <>
@@ -1892,220 +1870,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
 }
 
-template <>
-EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
-  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
-  // (from Nehalem to Haswell)
-  // #ifdef EIGEN_VECTORIZE_SSE3
-  //   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
-  //   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
-  // #else
-  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a, a));
-  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
-  // #endif
-}
-
-template <>
-EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
-  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
-  // (from Nehalem to Haswell)
-  // #ifdef EIGEN_VECTORIZE_SSE3
-  //   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
-  // #else
-  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a, a)));
-  // #endif
-}
-
-template <>
-EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
-  return pfirst<Packet2l>(_mm_add_epi64(a, _mm_unpackhi_epi64(a, a)));
-}
-
-#ifdef EIGEN_VECTORIZE_SSSE3
-template <>
-EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
-  Packet4i tmp0 = _mm_hadd_epi32(a, a);
-  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0, tmp0));
-}
-template <>
-EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
-  Packet4ui tmp0 = _mm_hadd_epi32(a, a);
-  return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
-}
-#else
-template <>
-EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
-  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
-  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
-}
-template <>
-EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
-  Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
-  return pfirst(tmp) + pfirst<Packet4ui>(_mm_shuffle_epi32(tmp, 1));
-}
-#endif
-
-template <>
-EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
-  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
-  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
-}
-
-// Other reduction functions:
-
-// mul
-template <>
-EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
-  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a, a));
-  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
-  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
-}
-template <>
-EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
-  EIGEN_ALIGN16 int64_t aux[2];
-  pstore(aux, a);
-  return aux[0] * aux[1];
-}
-template <>
-EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (e.g., reusing pmul is very slow!)
-  // TODO try to call _mm_mul_epu32 directly
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
-}
-template <>
-EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., reusing pmul is very slow !)
-  // TODO try to call _mm_mul_epu32 directly
-  EIGEN_ALIGN16 uint32_t aux[4];
-  pstore(aux, a);
-  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
-}
-
-template <>
-EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
-  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
-  return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
-}
-
-// min
-template <>
-EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
-  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a, a));
-  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
-  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a, a)));
-}
-template <>
-EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst<Packet4i>(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
-#else
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., it does not like using std::min after the pstore !!)
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  int aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
-  return aux0 < aux2 ? aux0 : aux2;
-#endif  // EIGEN_VECTORIZE_SSE4_1
-}
-template <>
-EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst<Packet4ui>(_mm_min_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
-#else
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., it does not like using std::min after the pstore !!)
-  EIGEN_ALIGN16 uint32_t aux[4];
-  pstore(aux, a);
-  uint32_t aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
-  uint32_t aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
-  return aux0 < aux2 ? aux0 : aux2;
-#endif  // EIGEN_VECTORIZE_SSE4_1
-}
-
-// max
-template <>
-EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
-  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a, a));
-  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
-}
-template <>
-EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
-  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a, a)));
-}
-template <>
-EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst<Packet4i>(_mm_max_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
-#else
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., it does not like using std::min after the pstore !!)
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  int aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
-  return aux0 > aux2 ? aux0 : aux2;
-#endif  // EIGEN_VECTORIZE_SSE4_1
-}
-template <>
-EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
-  return pfirst<Packet4ui>(_mm_max_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
-#else
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., it does not like using std::min after the pstore !!)
-  EIGEN_ALIGN16 uint32_t aux[4];
-  pstore(aux, a);
-  uint32_t aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
-  uint32_t aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
-  return aux0 > aux2 ? aux0 : aux2;
-#endif  // EIGEN_VECTORIZE_SSE4_1
-}
-
-// not needed yet
-// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
-// {
-//   return _mm_movemask_ps(x) == 0xF;
-// }
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) {
-  return _mm_movemask_pd(x) != 0x0;
-}
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
-  return _mm_movemask_ps(x) != 0x0;
-}
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) {
-  return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0;
-}
-
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
-  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
-}
-template <>
-EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
-  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
-}
-
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
   _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
 }
@@ -2234,29 +1998,25 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
   kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
 }
 
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
+  return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);  // 0 stays 0; 1 becomes -1 (all bits set)
+}
+
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {  // 32-bit lanes: 0 -> 0, 1 -> all ones
+  return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
                                     const Packet2l& elsePacket) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]);
-  __m128i false_mask = pcmp_eq<Packet2l>(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
-#else
-  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
-#endif
+  const __m128i true_mask = sse_blend_mask(ifPacket);  // per-lane mask built from ifPacket.select
+  return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
                                     const Packet4i& elsePacket) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
-#else
-  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
-#endif
+  const __m128i true_mask = sse_blend_mask(ifPacket);  // per-lane mask built from ifPacket.select
+  return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
@@ -2266,61 +2026,49 @@ EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4u
 template <>
 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
                                     const Packet4f& elsePacket) {
-  const __m128 zero = _mm_setzero_ps();
-  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  __m128 false_mask = _mm_cmpeq_ps(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
-#else
-  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
-#endif
+  const __m128i true_mask = sse_blend_mask(ifPacket);  // per-lane integer mask built from ifPacket.select
+  return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);  // cast reinterprets the bits
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
                                     const Packet2d& elsePacket) {
-  const __m128d zero = _mm_setzero_pd();
-  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
-  __m128d false_mask = _mm_cmpeq_pd(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
-#else
-  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
-#endif
+  const __m128i true_mask = sse_blend_mask(ifPacket);  // per-lane integer mask built from ifPacket.select
+  return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);  // cast reinterprets the bits
 }
 
 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
-#ifdef EIGEN_VECTORIZE_FMA
+#if defined(EIGEN_VECTORIZE_FMA)
 template <>
 EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
-  return ::fmaf(a, b, c);
+  return std::fmaf(a, b, c);  // a * b + c
 }
 template <>
 EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
-  return ::fma(a, b, c);
+  return std::fma(a, b, c);  // a * b + c
 }
 template <>
 EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
-  return ::fmaf(a, b, -c);
+  return std::fmaf(a, b, -c);  // a * b - c
 }
 template <>
 EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
-  return ::fma(a, b, -c);
+  return std::fma(a, b, -c);  // a * b - c
 }
 template <>
 EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
-  return ::fmaf(-a, b, c);
+  return std::fmaf(-a, b, c);  // -(a * b) + c
 }
 template <>
 EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
-  return ::fma(-a, b, c);
+  return std::fma(-a, b, c);  // -(a * b) + c
 }
 template <>
 EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
-  return ::fmaf(-a, b, -c);
+  return std::fmaf(-a, b, -c);  // -(a * b) - c
 }
 template <>
 EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
-  return ::fma(-a, b, -c);
+  return std::fma(-a, b, -c);  // -(a * b) - c
 }
 #endif
 
@@ -2366,8 +2114,6 @@ EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
 }
 
 EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
-  __m128i o = _mm_setzero_si128();
-
   // unsigned int sign_mask = 0x80000000u;
   __m128i sign = _mm_set1_epi32(0x80000000u);
   // unsigned int sign = f.u & sign_mask;
@@ -2394,7 +2140,7 @@ EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
   //  f.f += denorm_magic.f;
   f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
   // f.u - denorm_magic.u
-  o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
+  __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
   o = _mm_and_si128(o, subnorm_mask);
   // Correct result for inf/nan/zero/subnormal, 0 otherwise
   o = _mm_or_si128(o, naninf_value);
diff --git a/Eigen/src/Core/arch/SSE/Reductions.h b/Eigen/src/Core/arch/SSE/Reductions.h
new file mode 100644
index 0000000..f38df4e
--- /dev/null
+++ b/Eigen/src/Core/arch/SSE/Reductions.h
@@ -0,0 +1,324 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUCTIONS_SSE_H
+#define EIGEN_REDUCTIONS_SSE_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Packet>
+struct sse_add_wrapper {  // binary op used by sum reductions: forwards to padd
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return padd<Packet>(a, b); }
+};
+
+template <typename Packet>
+struct sse_mul_wrapper {  // binary op used by product reductions: forwards to pmul
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmul<Packet>(a, b); }
+};
+
+template <typename Packet>
+struct sse_min_wrapper {  // binary op used by min reductions: forwards to the default pmin
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmin<Packet>(a, b); }
+};
+
+template <int NaNPropagation, typename Packet>
+struct sse_min_prop_wrapper {  // min with an explicit NaN-propagation policy
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
+    return pmin<NaNPropagation, Packet>(a, b);
+  }
+};
+
+template <typename Packet>
+struct sse_max_wrapper {  // binary op used by max reductions: forwards to the default pmax
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmax<Packet>(a, b); }
+};
+
+template <int NaNPropagation, typename Packet>
+struct sse_max_prop_wrapper {  // max with an explicit NaN-propagation policy
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
+    return pmax<NaNPropagation, Packet>(a, b);
+  }
+};
+
+template <typename Packet, typename Op>
+struct sse_predux_common;  // primary template; each packet type specializes run() to fold a packet with Op
+
+template <typename Packet>
+struct sse_predux_impl : sse_predux_common<Packet, sse_add_wrapper<Packet>> {};  // sum
+
+template <typename Packet>
+struct sse_predux_mul_impl : sse_predux_common<Packet, sse_mul_wrapper<Packet>> {};  // product
+
+template <typename Packet>
+struct sse_predux_min_impl : sse_predux_common<Packet, sse_min_wrapper<Packet>> {};  // min, default pmin
+
+template <int NaNPropagation, typename Packet>
+struct sse_predux_min_prop_impl : sse_predux_common<Packet, sse_min_prop_wrapper<NaNPropagation, Packet>> {};
+
+template <typename Packet>
+struct sse_predux_max_impl : sse_predux_common<Packet, sse_max_wrapper<Packet>> {};  // max, default pmax
+
+template <int NaNPropagation, typename Packet>
+struct sse_predux_max_prop_impl : sse_predux_common<Packet, sse_max_prop_wrapper<NaNPropagation, Packet>> {};
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16b -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE bool predux(const Packet16b& a) {
+  const Packet4i folded = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));  // OR the upper 8 bytes into the lower 8
+  return (pfirst<Packet4i>(folded) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(folded, 1)) != 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) {
+  const Packet4i folded = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));  // AND the upper 8 bytes into the lower 8
+  return (pfirst<Packet4i>(folded) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(folded, 1)) == 0x01010101);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_min(const Packet16b& a) {
+  return predux_mul(a);  // the min of bools is their conjunction, which predux_mul computes
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_max(const Packet16b& a) {
+  return predux(a);  // the max of bools is their disjunction, which predux computes
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16b& a) {
+  return predux(a);  // true if any byte of the packet is non-zero
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4i -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet4i, Op> {
+  static EIGEN_STRONG_INLINE int run(const Packet4i& a) {
+    // Pair lane i with lane 3 - i, then combine lane 0 with lane 2 so that lane 0 covers all four inputs.
+    const Packet4i mirrored = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
+    const Packet4i total = Op::packetOp(mirrored, _mm_unpackhi_epi32(mirrored, mirrored));
+    return _mm_cvtsi128_si32(total);
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE int predux(const Packet4i& a) {
+  return sse_predux_impl<Packet4i>::run(a);  // reduce the four lanes with padd
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) {
+  return sse_predux_mul_impl<Packet4i>::run(a);  // reduce the four lanes with pmul
+}
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template <>
+EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) {
+  return sse_predux_min_impl<Packet4i>::run(a);  // reduce the four lanes with pmin
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) {
+  return sse_predux_max_impl<Packet4i>::run(a);  // reduce the four lanes with pmax
+}
+#endif  // EIGEN_VECTORIZE_SSE4_1
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4i& a) {
+  return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;  // true if the top bit of any 32-bit lane is set
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ui -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet4ui, Op> {
+  static EIGEN_STRONG_INLINE uint32_t run(const Packet4ui& a) {
+    // Pair lane i with lane 3 - i, then combine lane 0 with lane 2 so that lane 0 covers all four inputs.
+    const Packet4ui mirrored = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
+    const Packet4ui total = Op::packetOp(mirrored, _mm_unpackhi_epi32(mirrored, mirrored));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(total));
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) {
+  return sse_predux_impl<Packet4ui>::run(a);  // reduce the four lanes with padd
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) {
+  return sse_predux_mul_impl<Packet4ui>::run(a);  // reduce the four lanes with pmul
+}
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) {
+  return sse_predux_min_impl<Packet4ui>::run(a);  // reduce the four lanes with pmin
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) {
+  return sse_predux_max_impl<Packet4ui>::run(a);  // reduce the four lanes with pmax
+}
+#endif  // EIGEN_VECTORIZE_SSE4_1
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& a) {
+  return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;  // true if the top bit of any 32-bit lane is set
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2l -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet2l, Op> {
+  static EIGEN_STRONG_INLINE int64_t run(const Packet2l& a) {
+    // Combine the upper 64-bit lane with the lower one; the result is read from lane 0.
+    const Packet2l combined = Op::packetOp(a, _mm_unpackhi_epi64(a, a));
+    return pfirst(combined);
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) {
+  return sse_predux_impl<Packet2l>::run(a);  // reduce the two lanes with padd
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet2l& a) {
+  return _mm_movemask_pd(_mm_castsi128_pd(a)) != 0x0;  // true if the top bit of either 64-bit lane is set
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet4f, Op> {
+  static EIGEN_STRONG_INLINE float run(const Packet4f& a) {
+    // Step 1: combine the upper pair of lanes with the lower pair.
+    const Packet4f halves = Op::packetOp(a, _mm_movehl_ps(a, a));
+#ifdef EIGEN_VECTORIZE_SSE3
+    const Packet4f lane1 = _mm_movehdup_ps(halves);
+#else
+    const Packet4f lane1 = _mm_shuffle_ps(halves, halves, 1);
+#endif
+    return _mm_cvtss_f32(Op::packetOp(halves, lane1));  // step 2: lane 0 combined with lane 1
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet4f& a) {
+  return sse_predux_impl<Packet4f>::run(a);  // reduce the four lanes with padd
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) {
+  return sse_predux_mul_impl<Packet4f>::run(a);  // reduce the four lanes with pmul
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) {
+  return sse_predux_min_impl<Packet4f>::run(a);  // reduce with the default pmin<Packet4f>
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet4f& a) {
+  return sse_predux_min_prop_impl<PropagateNumbers, Packet4f>::run(a);  // reduce with pmin<PropagateNumbers>
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet4f& a) {
+  return sse_predux_min_prop_impl<PropagateNaN, Packet4f>::run(a);  // reduce with pmin<PropagateNaN>
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) {
+  return sse_predux_max_impl<Packet4f>::run(a);  // reduce with the default pmax<Packet4f>
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet4f& a) {
+  return sse_predux_max_prop_impl<PropagateNumbers, Packet4f>::run(a);  // reduce with pmax<PropagateNumbers>
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet4f& a) {
+  return sse_predux_max_prop_impl<PropagateNaN, Packet4f>::run(a);  // reduce with pmax<PropagateNaN>
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& a) {
+  return _mm_movemask_ps(a) != 0x0;  // true if the sign bit of any lane is set
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2d -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet2d, Op> {
+  static EIGEN_STRONG_INLINE double run(const Packet2d& a) {
+    // Combine the upper lane with the lower lane; the result is read from lane 0.
+    const Packet2d combined = Op::packetOp(a, _mm_unpackhi_pd(a, a));
+    return _mm_cvtsd_f64(combined);
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE double predux(const Packet2d& a) {
+  return sse_predux_impl<Packet2d>::run(a);  // reduce the two lanes with padd
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) {
+  return sse_predux_mul_impl<Packet2d>::run(a);  // reduce the two lanes with pmul
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) {
+  return sse_predux_min_impl<Packet2d>::run(a);  // reduce with the default pmin<Packet2d>
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet2d& a) {
+  return sse_predux_min_prop_impl<PropagateNumbers, Packet2d>::run(a);  // reduce with pmin<PropagateNumbers>
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet2d& a) {
+  return sse_predux_min_prop_impl<PropagateNaN, Packet2d>::run(a);  // reduce with pmin<PropagateNaN>
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) {
+  return sse_predux_max_impl<Packet2d>::run(a);  // reduce with the default pmax<Packet2d>
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet2d& a) {
+  return sse_predux_max_prop_impl<PropagateNumbers, Packet2d>::run(a);  // reduce with pmax<PropagateNumbers>
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet2d& a) {
+  return sse_predux_max_prop_impl<PropagateNaN, Packet2d>::run(a);  // reduce with pmax<PropagateNaN>
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet2d& a) {
+  return _mm_movemask_pd(a) != 0x0;  // true if the sign bit of either lane is set
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_REDUCTIONS_SSE_H
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 6a03de9..6115d1d 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -86,22 +86,22 @@ template <>
 EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a) {
   numext::int32_t c[packet_traits<numext::int32_t>::size];
   for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
-  return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
+  return svadd_s32_x(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svadd_s32_z(svptrue_b32(), a, b);
+  return svadd_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svsub_s32_z(svptrue_b32(), a, b);
+  return svsub_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) {
-  return svneg_s32_z(svptrue_b32(), a);
+  return svneg_s32_x(svptrue_b32(), a);
 }
 
 template <>
@@ -111,27 +111,27 @@ EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) {
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svmul_s32_z(svptrue_b32(), a, b);
+  return svmul_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svdiv_s32_z(svptrue_b32(), a, b);
+  return svdiv_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
-  return svmla_s32_z(svptrue_b32(), c, a, b);
+  return svmla_s32_x(svptrue_b32(), c, a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svmin_s32_z(svptrue_b32(), a, b);
+  return svmin_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svmax_s32_z(svptrue_b32(), a, b);
+  return svmax_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
@@ -151,47 +151,47 @@ EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi
 
 template <>
 EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
-  return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
+  return svdup_n_s32_x(svptrue_b32(), 0xffffffffu);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/) {
-  return svdup_n_s32_z(svptrue_b32(), 0);
+  return svdup_n_s32_x(svptrue_b32(), 0);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svand_s32_z(svptrue_b32(), a, b);
+  return svand_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svorr_s32_z(svptrue_b32(), a, b);
+  return svorr_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return sveor_s32_z(svptrue_b32(), a, b);
+  return sveor_s32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return svbic_s32_z(svptrue_b32(), a, b);
+  return svbic_s32_x(svptrue_b32(), a, b);
 }
 
 template <int N>
 EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
-  return svasrd_n_s32_z(svptrue_b32(), a, N);
+  return svasr_n_s32_x(svptrue_b32(), a, N);  // plain ASR; svasrd ("shift right for divide") rounds toward zero
 }
 
 template <int N>
 EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
-  return svreinterpret_s32_u32(svlsr_n_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), N));
+  return svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), svreinterpret_u32_s32(a), N));
 }
 
 template <int N>
 EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
-  return svlsl_n_s32_z(svptrue_b32(), a, N);
+  return svlsl_n_s32_x(svptrue_b32(), a, N);
 }
 
 template <>
@@ -257,7 +257,7 @@ EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) {
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
-  return svabs_s32_z(svptrue_b32(), a);
+  return svabs_s32_x(svptrue_b32(), a);
 }
 
 template <>
@@ -270,29 +270,29 @@ EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a) {
   EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
 
   // Multiply the vector by its reverse
-  svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
+  svint32_t prod = svmul_s32_x(svptrue_b32(), a, svrev_s32(a));
   svint32_t half_prod;
 
   // Extract the high half of the vector. Depending on the VL more reductions need to be done
   if (EIGEN_ARM64_SVE_VL >= 2048) {
     half_prod = svtbl_s32(prod, svindex_u32(32, 1));
-    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
   }
   if (EIGEN_ARM64_SVE_VL >= 1024) {
     half_prod = svtbl_s32(prod, svindex_u32(16, 1));
-    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
   }
   if (EIGEN_ARM64_SVE_VL >= 512) {
     half_prod = svtbl_s32(prod, svindex_u32(8, 1));
-    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
   }
   if (EIGEN_ARM64_SVE_VL >= 256) {
     half_prod = svtbl_s32(prod, svindex_u32(4, 1));
-    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
   }
   // Last reduction
   half_prod = svtbl_s32(prod, svindex_u32(2, 1));
-  prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
 
   // The reduction is done to the first element.
   return pfirst<PacketXi>(prod);
@@ -353,15 +353,17 @@ struct packet_traits<float> : default_packet_traits {
     HasReduxp = 0,  // Not implemented in SVE
 
     HasDiv = 1,
-    HasFloor = 1,
 
+    HasCmp = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasLog = 1,
     HasExp = 1,
-    HasSqrt = 0,
+    HasPow = 1,
+    HasSqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH
   };
 };
 
@@ -387,29 +389,29 @@ EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
-  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
+  return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), from));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
   float c[packet_traits<float>::size];
   for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
-  return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
+  return svadd_f32_x(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svadd_f32_z(svptrue_b32(), a, b);
+  return svadd_f32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svsub_f32_z(svptrue_b32(), a, b);
+  return svsub_f32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) {
-  return svneg_f32_z(svptrue_b32(), a);
+  return svneg_f32_x(svptrue_b32(), a);
 }
 
 template <>
@@ -419,22 +421,22 @@ EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) {
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svmul_f32_z(svptrue_b32(), a, b);
+  return svmul_f32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svdiv_f32_z(svptrue_b32(), a, b);
+  return svdiv_f32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
-  return svmla_f32_z(svptrue_b32(), c, a, b);
+  return svmla_f32_x(svptrue_b32(), c, a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svmin_f32_z(svptrue_b32(), a, b);
+  return svmin_f32_x(svptrue_b32(), a, b);
 }
 
 template <>
@@ -444,12 +446,12 @@ EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, con
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svminnm_f32_z(svptrue_b32(), a, b);
+  return svminnm_f32_x(svptrue_b32(), a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svmax_f32_z(svptrue_b32(), a, b);
+  return svmax_f32_x(svptrue_b32(), a, b);
 }
 
 template <>
@@ -459,7 +461,7 @@ EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, con
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svmaxnm_f32_z(svptrue_b32(), a, b);
+  return svmaxnm_f32_x(svptrue_b32(), a, b);
 }
 
 // Float comparisons in SVE return svbool (predicate). Use svdup to set active
@@ -489,33 +491,33 @@ EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const P
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
-  return svrintm_f32_z(svptrue_b32(), a);
+  return svrintm_f32_x(svptrue_b32(), a);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
-  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
+  return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), 0xffffffffu));
 }
 
 // Logical Operations are not supported for float, so reinterpret casts
 template <>
 EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+  return svreinterpret_f32_u32(svand_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+  return svreinterpret_f32_u32(svorr_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+  return svreinterpret_f32_u32(sveor_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+  return svreinterpret_f32_u32(svbic_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
@@ -580,7 +582,7 @@ EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) {
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
-  return svabs_f32_z(svptrue_b32(), a);
+  return svabs_f32_x(svptrue_b32(), a);
 }
 
 // TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
@@ -602,29 +604,29 @@ template <>
 EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
   EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
   // Multiply the vector by its reverse
-  svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
+  svfloat32_t prod = svmul_f32_x(svptrue_b32(), a, svrev_f32(a));
   svfloat32_t half_prod;
 
   // Extract the high half of the vector. Depending on the VL more reductions need to be done
   if (EIGEN_ARM64_SVE_VL >= 2048) {
     half_prod = svtbl_f32(prod, svindex_u32(32, 1));
-    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
   }
   if (EIGEN_ARM64_SVE_VL >= 1024) {
     half_prod = svtbl_f32(prod, svindex_u32(16, 1));
-    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
   }
   if (EIGEN_ARM64_SVE_VL >= 512) {
     half_prod = svtbl_f32(prod, svindex_u32(8, 1));
-    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
   }
   if (EIGEN_ARM64_SVE_VL >= 256) {
     half_prod = svtbl_f32(prod, svindex_u32(4, 1));
-    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
   }
   // Last reduction
   half_prod = svtbl_f32(prod, svindex_u32(2, 1));
-  prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
 
   // The reduction is done to the first element.
   return pfirst<PacketXf>(prod);
@@ -661,6 +663,11 @@ EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf&
   return pldexp_generic(a, exponent);
 }
 
+template <>
+EIGEN_STRONG_INLINE PacketXf psqrt<PacketXf>(const PacketXf& a) {
+  return svsqrt_f32_x(svptrue_b32(), a);
+}
+
 }  // namespace internal
 }  // namespace Eigen
 
diff --git a/Eigen/src/Core/arch/SVE/TypeCasting.h b/Eigen/src/Core/arch/SVE/TypeCasting.h
index b451676..c7027b3 100644
--- a/Eigen/src/Core/arch/SVE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SVE/TypeCasting.h
@@ -28,12 +28,12 @@ struct type_casting_traits<numext::int32_t, float> {
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
-  return svcvt_f32_s32_z(svptrue_b32(), a);
+  return svcvt_f32_s32_x(svptrue_b32(), a);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
-  return svcvt_s32_f32_z(svptrue_b32(), a);
+  return svcvt_s32_f32_x(svptrue_b32(), a);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index 9b89747..692f90f 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -19,14 +19,19 @@ namespace Eigen {
 namespace internal {
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static Packet4ui p4ui_CONJ_XOR = {0x00000000, 0x80000000, 0x00000000,
-                                  0x80000000};  // vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+inline Packet4ui p4ui_CONJ_XOR() {
+  return Packet4ui {0x00000000, 0x80000000, 0x00000000, 0x80000000};  // vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+}
 #endif
 
-static Packet2ul p2ul_CONJ_XOR1 =
-    (Packet2ul)vec_sld((Packet4ui)p2d_ZERO_, (Packet4ui)p2l_ZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 =
-    (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_ZERO_, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
+inline Packet2ul p2ul_CONJ_XOR1() {
+  return (Packet2ul)vec_sld((Packet4ui)p2d_ZERO_, (Packet4ui)p2l_ZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+inline Packet2ul p2ul_CONJ_XOR2() {
+  return (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_ZERO_,
+                            8);  //{ 0x0000000000000000, 0x8000000000000000 } (operands swapped vs. p2ul_CONJ_XOR1)
+}
 
 struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
@@ -173,7 +178,7 @@ EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
-  return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2));
+  return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2()));
 }
 template <>
 EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
@@ -188,7 +193,7 @@ EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1c
   // multiply a_im * b and get the conjugate result
   v2 = vec_madd(a_im, b.v, p2d_ZERO);
   v2 = (Packet2d)vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
-  v2 = (Packet2d)vec_xor((Packet2d)v2, (Packet2d)p2ul_CONJ_XOR1);
+  v2 = (Packet2d)vec_xor((Packet2d)v2, (Packet2d)p2ul_CONJ_XOR1());
 
   return Packet1cd(v1 + v2);
 }
@@ -252,8 +257,27 @@ EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1c
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  return plog_complex(a, b);
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
+  return plog_complex<Packet1cd>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
+  return plog_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
+  return pexp_complex(a);
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
@@ -432,16 +456,6 @@ EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2c
   return pdiv_complex(a, b);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  return plog_complex(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  return pexp_complex(a, b);
-}
-
 EIGEN_STRONG_INLINE Packet2cf pcplxflip /*<Packet2cf>*/ (const Packet2cf& x) {
   Packet2cf res;
   res.cd[0] = pcplxflip(x.cd[0]);
@@ -472,7 +486,7 @@ EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
-  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
 }
 template <>
 EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
@@ -486,7 +500,7 @@ EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2c
 
   // multiply a_im * b and get the conjugate result
   prod_im = a_im * b.v;
-  prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
+  prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()));
   // permute back to a proper order
   prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
 
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 32e0425..348d643 100644
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -23,6 +23,20 @@ namespace Eigen {
 
 namespace internal {
 
+EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet2d)
+
+EIGEN_FLOAT_PACKET_FUNCTION(atanh, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log2, Packet4f)
+
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4f)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4f)
+
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
 static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
 static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
@@ -170,7 +184,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(cons
   y = padd(y, p4f_1);
 
   // build 2^n
-  emm0 = (Packet4i){(int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3]};
+  emm0 = Packet4i{(int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3]};
   emm0 = emm0 + p4i_0x7f;
   emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
 
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 8ac8f77..39073ed 100644
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -180,6 +180,7 @@ struct packet_traits<float> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
 
+    HasCmp = 1,
     HasAdd = 1,
     HasSub = 1,
     HasMul = 1,
@@ -195,9 +196,6 @@ struct packet_traits<float> : default_packet_traits {
     HasRsqrt = 1,
     HasTanh = 1,
     HasErf = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
     HasNegate = 1,
     HasBlend = 1
   };
@@ -225,9 +223,6 @@ struct packet_traits<double> : default_packet_traits {
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
     HasNegate = 1,
     HasBlend = 1
   };
@@ -256,6 +251,7 @@ struct unpacket_traits<Packet4f> {
     masked_store_available = false
   };
   typedef Packet4f half;
+  typedef Packet4i integer_packet;
 };
 template <>
 struct unpacket_traits<Packet2d> {
@@ -268,6 +264,7 @@ struct unpacket_traits<Packet2d> {
     masked_store_available = false
   };
   typedef Packet2d half;
+  typedef Packet2l integer_packet;
 };
 
 /* Forward declaration */
@@ -319,38 +316,36 @@ inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
 
 template <>
 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet* vfrom;
-  vfrom = (Packet*)from;
-  return vfrom->v4i;
+  return vec_xl(0, from);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet* vfrom;
-  vfrom = (Packet*)from;
-  return vfrom->v2d;
+  return vec_xl(0, from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet* vto;
-  vto = (Packet*)to;
-  vto->v4i = from;
+  vec_xst(from, 0, to);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet* vto;
-  vto = (Packet*)to;
-  vto->v2d = from;
+  vec_xst(from, 0, to);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
 }
 
 template <>
@@ -546,7 +541,8 @@ EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d
 
 template <>
 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
-  return vec_round(a);
+  /* Uses non-default rounding for vec_round */
+  return __builtin_s390_vfidb(a, 0, 1);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
@@ -596,6 +592,45 @@ EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
   EIGEN_ZVECTOR_PREFETCH(addr);
 }
 
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
+  return Packet2l { parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]) };
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return Packet4i {
+    parithmetic_shift_right<N>(a[0]),
+    parithmetic_shift_right<N>(a[1]),
+    parithmetic_shift_right<N>(a[2]),
+    parithmetic_shift_right<N>(a[3]) };
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return Packet2l { plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]) };
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return Packet4i {
+    plogical_shift_right<N>(a[0]),
+    plogical_shift_right<N>(a[1]),
+    plogical_shift_right<N>(a[2]),
+    plogical_shift_right<N>(a[3]) };
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return Packet2l { plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]) };
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return Packet4i {
+    plogical_shift_left<N>(a[0]),
+    plogical_shift_left<N>(a[1]),
+    plogical_shift_left<N>(a[2]),
+    plogical_shift_left<N>(a[3]) };
+}
+
 template <>
 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
   EIGEN_ALIGN16 int x[4];
@@ -912,8 +947,8 @@ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f
 template <>
 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
   Packet4f res;
-  res.v4f[0] = vec_round(a.v4f[0]);
-  res.v4f[1] = vec_round(a.v4f[1]);
+  res.v4f[0] = generic_round(a.v4f[0]);
+  res.v4f[1] = generic_round(a.v4f[1]);
   return res;
 }
 
@@ -1073,20 +1108,14 @@ Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f
 #else
 template <>
 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet* vfrom;
-  vfrom = (Packet*)from;
-  return vfrom->v4f;
+  return vec_xl(0, from);
 }
 
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
-  // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet* vto;
-  vto = (Packet*)to;
-  vto->v4f = from;
+  vec_xst(from, 0, to);
 }
 
 template <>
@@ -1177,7 +1206,8 @@ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f
 }
 template <>
 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
-  return vec_round(a);
+  /* Uses non-default rounding for vec_round */
+  return __builtin_s390_vfisb(a, 0, 1);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
@@ -1268,6 +1298,28 @@ EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f&
 
 #endif
 
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors and multiply:
+  const Packet2l bias = {1023, 1023};
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4); NOTE(review): logical (not arithmetic) shift - for e < 0 only the top two bits should differ, presumably discarded by the << 52 below - confirm
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+  Packet2d out = pmul(pmul(pmul(a, c), c), c);                        // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);                                   // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));  // 2^(e - 3b)
+  out = pmul(out, c);                                                 // a * 2^e
+  return out;
+}
+
 template <>
 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
   EIGEN_ZVECTOR_PREFETCH(addr);
@@ -1285,6 +1337,75 @@ EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
   return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
 }
 
+#if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
+#pragma GCC warning \
+    "float->int and int->float conversion is simulated. compile for z15 for improved performance"
+template <>
+struct cast_impl<Packet4i, Packet4f> {
+  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
+    return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3]) };
+  }
+};
+
+template <>
+struct cast_impl<Packet4f, Packet4i> {
+  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
+    return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3]) };
+  }
+};
+
+template <>
+struct cast_impl<Packet2l, Packet2d> {
+  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
+    return Packet2d{double(a[0]), double(a[1]) };
+  }
+};
+
+template <>
+struct cast_impl<Packet2d, Packet2l> {
+  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
+    return Packet2l{(long long)(a[0]), (long long)(a[1]) };
+  }
+};
+#else
+template <>
+struct cast_impl<Packet4i, Packet4f> {
+  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
+    return vec_float(a);
+  }
+};
+
+template <>
+struct cast_impl<Packet4f, Packet4i> {
+  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
+    return vec_signed(a);
+  }
+};
+
+template <>
+struct cast_impl<Packet2l, Packet2d> {
+  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
+    return vec_double(a);
+  }
+};
+
+template <>
+struct cast_impl<Packet2d, Packet2l> {
+  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
+    return vec_signed(a);
+  }
+};
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+  return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 09d1da8..0239262 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -23,11 +23,16 @@ namespace internal {
  */
 template <typename DstScalar, typename SrcScalar>
 struct assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
 
   template <int Alignment, typename Packet>
   EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, b);
+    pstoret<DstScalar, Packet, Alignment>(a, b);
+  }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+    pstoretSegment<DstScalar, Packet, Alignment>(a, b, begin, count);
   }
 };
 
@@ -36,7 +41,7 @@ template <typename DstScalar>
 struct assign_op<DstScalar, void> {};
 
 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<assign_op<DstScalar, SrcScalar> > {
+struct functor_traits<assign_op<DstScalar, SrcScalar>> {
   enum {
     Cost = NumTraits<DstScalar>::ReadCost,
     PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::Vectorizable &&
@@ -45,88 +50,76 @@ struct functor_traits<assign_op<DstScalar, SrcScalar> > {
 };
 
 /** \internal
- * \brief Template functor for scalar/packet assignment with addition
+ * \brief Template functor for scalar/packet compound assignment
  *
  */
-template <typename DstScalar, typename SrcScalar>
-struct add_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct compound_assign_op {
+  using traits = functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const {
+    assign_op<DstScalar, DstScalar>().assignCoeff(a, Func().operator()(a, b));
+  }
 
   template <int Alignment, typename Packet>
   EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::padd(internal::ploadt<Packet, Alignment>(a), b));
+    assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
+        a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
+  }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+    assign_op<DstScalar, DstScalar>().template assignPacketSegment<Alignment, Packet>(
+        a, Func().packetOp(ploadtSegment<Packet, Alignment>(a, begin, count), b), begin, count);
   }
 };
-template <typename DstScalar, typename SrcScalar>
-struct functor_traits<add_assign_op<DstScalar, SrcScalar> > {
+
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>> {
   enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasAdd
+    Cost = int(functor_traits<assign_op<DstScalar, DstScalar>>::Cost) + int(functor_traits<Func>::Cost),
+    PacketAccess = functor_traits<assign_op<DstScalar, DstScalar>>::PacketAccess && functor_traits<Func>::PacketAccess
   };
 };
 
 /** \internal
- * \brief Template functor for scalar/packet assignment with subtraction
+ * \brief Template functor for scalar/packet assignment with addition
  *
  */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct add_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_sum_op<DstScalar, SrcScalar>> {};
+
 template <typename DstScalar, typename SrcScalar>
-struct sub_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }
+struct functor_traits<add_assign_op<DstScalar, SrcScalar>> : add_assign_op<DstScalar, SrcScalar>::traits {};
+
+/** \internal
+ * \brief Template functor for scalar/packet assignment with subtraction
+ *
+ */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct sub_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_difference_op<DstScalar, SrcScalar>> {};
 
-  template <int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::psub(internal::ploadt<Packet, Alignment>(a), b));
-  }
-};
 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<sub_assign_op<DstScalar, SrcScalar> > {
-  enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasSub
-  };
-};
+struct functor_traits<sub_assign_op<DstScalar, SrcScalar>> : sub_assign_op<DstScalar, SrcScalar>::traits {};
 
 /** \internal
  * \brief Template functor for scalar/packet assignment with multiplication
  *
  */
 template <typename DstScalar, typename SrcScalar = DstScalar>
-struct mul_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
+struct mul_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_product_op<DstScalar, SrcScalar>> {};
 
-  template <int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::pmul(internal::ploadt<Packet, Alignment>(a), b));
-  }
-};
 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<mul_assign_op<DstScalar, SrcScalar> > {
-  enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasMul
-  };
-};
+struct functor_traits<mul_assign_op<DstScalar, SrcScalar>> : mul_assign_op<DstScalar, SrcScalar>::traits {};
 
 /** \internal
- * \brief Template functor for scalar/packet assignment with diviving
+ * \brief Template functor for scalar/packet assignment with division
  *
  */
 template <typename DstScalar, typename SrcScalar = DstScalar>
-struct div_assign_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }
+struct div_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_quotient_op<DstScalar, SrcScalar>> {};
 
-  template <int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
-    internal::pstoret<DstScalar, Packet, Alignment>(a, internal::pdiv(internal::ploadt<Packet, Alignment>(a), b));
-  }
-};
 template <typename DstScalar, typename SrcScalar>
-struct functor_traits<div_assign_op<DstScalar, SrcScalar> > {
-  enum {
-    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
-    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::HasDiv
-  };
-};
+struct functor_traits<div_assign_op<DstScalar, SrcScalar>> : div_assign_op<DstScalar, SrcScalar>::traits {};
 
 /** \internal
  * \brief Template functor for scalar/packet assignment with swapping
@@ -158,7 +151,7 @@ struct swap_assign_op {
   }
 };
 template <typename Scalar>
-struct functor_traits<swap_assign_op<Scalar> > {
+struct functor_traits<swap_assign_op<Scalar>> {
   enum {
     Cost = 3 * NumTraits<Scalar>::ReadCost,
     PacketAccess =
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index c91e6bb..85e1584 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -207,20 +207,9 @@ struct functor_traits<scalar_cmp_op<LhsScalar, RhsScalar, cmp, UseTypedComparato
   };
 };
 
-template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
-struct typed_cmp_helper {
-  static constexpr bool SameType = is_same<LhsScalar, RhsScalar>::value;
-  static constexpr bool IsNumeric = is_arithmetic<typename NumTraits<LhsScalar>::Real>::value;
-  static constexpr bool UseTyped = UseTypedComparators && SameType && IsNumeric;
-  using type = typename conditional<UseTyped, LhsScalar, bool>::type;
-};
-
-template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
-using cmp_return_t = typename typed_cmp_helper<LhsScalar, RhsScalar, UseTypedComparators>::type;
-
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_EQ, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a == b ? result_type(1) : result_type(0);
   }
@@ -233,7 +222,7 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_EQ, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LT, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a < b ? result_type(1) : result_type(0);
   }
@@ -246,7 +235,7 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LT, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LE, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a <= b ? result_type(1) : result_type(0);
   }
@@ -259,7 +248,7 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LE, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GT, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a > b ? result_type(1) : result_type(0);
   }
@@ -272,7 +261,7 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GT, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GE, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a >= b ? result_type(1) : result_type(0);
   }
@@ -285,7 +274,7 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GE, UseTypedComparators> : binary
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_UNORD, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return !(a <= b || b <= a) ? result_type(1) : result_type(0);
   }
@@ -298,7 +287,7 @@ struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_UNORD, UseTypedComparators> : bin
 
 template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
 struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_NEQ, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
-  using result_type = cmp_return_t<LhsScalar, RhsScalar, UseTypedComparators>;
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
     return a != b ? result_type(1) : result_type(0);
   }
@@ -362,11 +351,7 @@ template <typename Scalar, typename Exponent>
 struct functor_traits<scalar_pow_op<Scalar, Exponent>> {
   enum {
     Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger && packet_traits<Scalar>::HasExp &&
-                    packet_traits<Scalar>::HasLog && packet_traits<Scalar>::HasRound && packet_traits<Scalar>::HasCmp &&
-                    // Temporarily disable packet access for half/bfloat16 until
-                    // accuracy is improved.
-                    !is_same<Scalar, half>::value && !is_same<Scalar, bfloat16>::value)
+    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger && packet_traits<Scalar>::HasPow)
   };
 };
 
@@ -438,7 +423,6 @@ struct scalar_quotient_op : binary_op_base<LhsScalar, RhsScalar> {
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
-    maybe_raise_div_by_zero<Packet>::run(b);
     return internal::pdiv(a, b);
   }
 };
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index c53bb90..35dc738 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -19,7 +19,6 @@ namespace internal {
 
 template <typename Scalar>
 struct scalar_constant_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) {}
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) {}
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()() const { return m_other; }
   template <typename PacketType>
@@ -29,7 +28,7 @@ struct scalar_constant_op {
   const Scalar m_other;
 };
 template <typename Scalar>
-struct functor_traits<scalar_constant_op<Scalar> > {
+struct functor_traits<scalar_constant_op<Scalar>> {
   enum {
     Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
     PacketAccess = packet_traits<Scalar>::Vectorizable,
@@ -37,6 +36,18 @@ struct functor_traits<scalar_constant_op<Scalar> > {
   };
 };
 
+template <typename Scalar>
+struct scalar_zero_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_zero_op() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()() const { return Scalar(0); }
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
+    return internal::pzero<PacketType>(PacketType());
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_zero_op<Scalar>> : functor_traits<scalar_constant_op<Scalar>> {};
+
 template <typename Scalar>
 struct scalar_identity_op {
   template <typename IndexType>
@@ -45,7 +56,7 @@ struct scalar_identity_op {
   }
 };
 template <typename Scalar>
-struct functor_traits<scalar_identity_op<Scalar> > {
+struct functor_traits<scalar_identity_op<Scalar>> {
   enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true };
 };
 
@@ -75,18 +86,19 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
     // Principle:
     // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+    Packet low = pset1<Packet>(m_low);
+    Packet high = pset1<Packet>(m_high);
+    Packet step = pset1<Packet>(m_step);
     if (m_flip) {
       Packet pi = plset<Packet>(Scalar(i - m_size1));
-      Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
-      if (EIGEN_PREDICT_TRUE(i != 0)) return res;
-      Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
-      return pselect<Packet>(mask, res, pset1<Packet>(m_low));
+      Packet res = pmadd(step, pi, high);
+      Packet mask = pcmp_lt(pzero(res), plset<Packet>(Scalar(i)));
+      return pselect<Packet>(mask, res, low);
     } else {
       Packet pi = plset<Packet>(Scalar(i));
-      Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
-      if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits<Packet>::size + 1)) return res;
-      Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size - 1));
-      return pselect<Packet>(mask, res, pset1<Packet>(m_high));
+      Packet res = pmadd(step, pi, low);
+      Packet mask = pcmp_lt(pi, pset1<Packet>(Scalar(m_size1)));
+      return pselect<Packet>(mask, res, high);
     }
   }
 
@@ -128,11 +140,10 @@ struct linspaced_op_impl<Scalar, /*IsInteger*/ true> {
 template <typename Scalar>
 struct linspaced_op;
 template <typename Scalar>
-struct functor_traits<linspaced_op<Scalar> > {
+struct functor_traits<linspaced_op<Scalar>> {
   enum {
     Cost = 1,
-    PacketAccess =
-        (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
+    PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
     /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/  // <- vectorization for integer is
                                                                                 // currently disabled
     IsRepeatable = true
@@ -182,7 +193,7 @@ struct equalspaced_op {
 };
 
 template <typename Scalar>
-struct functor_traits<equalspaced_op<Scalar> > {
+struct functor_traits<equalspaced_op<Scalar>> {
   enum {
     Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost,
     PacketAccess =
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 4447b82..ba7d97a 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -24,7 +24,7 @@ namespace internal {
  */
 template <typename Scalar>
 struct scalar_opposite_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return -a; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::negate(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
     return internal::pnegate(a);
@@ -103,6 +103,26 @@ struct functor_traits<scalar_abs2_op<Scalar>> {
   enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 };
 };
 
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct squared_norm_functor {
+  typedef Scalar result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return Scalar(numext::real(a) * numext::real(a), numext::imag(a) * numext::imag(a));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return Packet(pmul(a.v, a.v));
+  }
+};
+template <typename Scalar>
+struct squared_norm_functor<Scalar, false> : scalar_abs2_op<Scalar> {};
+
+template <typename Scalar>
+struct functor_traits<squared_norm_functor<Scalar>> {
+  using Real = typename NumTraits<Scalar>::Real;
+  enum { Cost = NumTraits<Real>::MulCost, PacketAccess = packet_traits<Real>::HasMul };
+};
+
 /** \internal
  * \brief Template functor to compute the conjugate of a complex value
  *
@@ -219,7 +239,9 @@ struct functor_traits<core_cast_op<SrcType, DstType>> {
  */
 template <typename Scalar, int N>
 struct scalar_shift_right_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return a >> N; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return numext::arithmetic_shift_right(a);
+  }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
     return internal::parithmetic_shift_right<N>(a);
@@ -237,7 +259,9 @@ struct functor_traits<scalar_shift_right_op<Scalar, N>> {
  */
 template <typename Scalar, int N>
 struct scalar_shift_left_op {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return a << N; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return numext::logical_shift_left(a);
+  }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
     return internal::plogical_shift_left<N>(a);
@@ -352,6 +376,22 @@ struct functor_traits<scalar_exp_op<Scalar>> {
   };
 };
 
+template <typename Scalar>
+struct scalar_exp2_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return internal::pexp2(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pexp2(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_exp2_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExp,
+    Cost = functor_traits<scalar_exp_op<Scalar>>::Cost  // TODO measure cost of exp2
+  };
+};
+
 /** \internal
  *
  * \brief Template functor to compute the exponential of a scalar - 1.
@@ -455,8 +495,9 @@ struct functor_traits<scalar_log10_op<Scalar>> {
  */
 template <typename Scalar>
 struct scalar_log2_op {
+  using RealScalar = typename NumTraits<Scalar>::Real;
   EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
-    return Scalar(EIGEN_LOG2E) * numext::log(a);
+    return Scalar(RealScalar(EIGEN_LOG2E)) * numext::log(a);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
@@ -517,11 +558,15 @@ struct functor_traits<scalar_sqrt_op<bool>> {
 template <typename Scalar>
 struct scalar_cbrt_op {
   EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pcbrt(a);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<scalar_cbrt_op<Scalar>> {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum { Cost = 20 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCbrt };
 };
 
 /** \internal
@@ -881,7 +926,7 @@ template <typename Scalar>
 struct functor_traits<scalar_floor_op<Scalar>> {
   enum {
     Cost = NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasFloor || NumTraits<Scalar>::IsInteger
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
   };
 };
 
@@ -901,7 +946,7 @@ template <typename Scalar>
 struct functor_traits<scalar_rint_op<Scalar>> {
   enum {
     Cost = NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasRint || NumTraits<Scalar>::IsInteger
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
   };
 };
 
@@ -921,7 +966,27 @@ template <typename Scalar>
 struct functor_traits<scalar_ceil_op<Scalar>> {
   enum {
     Cost = NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasCeil || NumTraits<Scalar>::IsInteger
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the truncation of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::trunc()
+ */
+template <typename Scalar>
+struct scalar_trunc_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::trunc(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::ptrunc(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_trunc_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
   };
 };
 
@@ -964,10 +1029,9 @@ struct functor_traits<scalar_isnan_op<Scalar, UseTypedPredicate>> {
  * \brief Template functor to check whether a scalar is +/-inf
  * \sa class CwiseUnaryOp, ArrayBase::isinf()
  */
-template <typename Scalar>
+template <typename Scalar, bool UseTypedPredicate = false>
 struct scalar_isinf_op {
-  typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return numext::isinf(a);
 #else
@@ -975,19 +1039,33 @@ struct scalar_isinf_op {
 #endif
   }
 };
+
 template <typename Scalar>
-struct functor_traits<scalar_isinf_op<Scalar>> {
-  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = false };
+struct scalar_isinf_op<Scalar, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return (numext::isinf(a) ? ptrue(a) : pzero(a));
+#else
+    return (numext::isinf EIGEN_NOT_A_MACRO(a) ? ptrue(a) : pzero(a));
+#endif
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return pisinf(a);
+  }
+};
+template <typename Scalar, bool UseTypedPredicate>
+struct functor_traits<scalar_isinf_op<Scalar, UseTypedPredicate>> {
+  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCmp && UseTypedPredicate };
 };
 
 /** \internal
  * \brief Template functor to check whether a scalar has a finite value
  * \sa class CwiseUnaryOp, ArrayBase::isfinite()
  */
-template <typename Scalar>
+template <typename Scalar, bool UseTypedPredicate = false>
 struct scalar_isfinite_op {
-  typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
 #if defined(SYCL_DEVICE_ONLY)
     return numext::isfinite(a);
 #else
@@ -995,9 +1073,25 @@ struct scalar_isfinite_op {
 #endif
   }
 };
+
 template <typename Scalar>
-struct functor_traits<scalar_isfinite_op<Scalar>> {
-  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = false };
+struct scalar_isfinite_op<Scalar, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return (numext::isfinite(a) ? ptrue(a) : pzero(a));
+#else
+    return (numext::isfinite EIGEN_NOT_A_MACRO(a) ? ptrue(a) : pzero(a));
+#endif
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    constexpr Scalar inf = NumTraits<Scalar>::infinity();
+    return pcmp_lt(pabs(a), pset1<Packet>(inf));
+  }
+};
+template <typename Scalar, bool UseTypedPredicate>
+struct functor_traits<scalar_isfinite_op<Scalar, UseTypedPredicate>> {
+  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCmp && UseTypedPredicate };
 };
 
 /** \internal
@@ -1126,7 +1220,7 @@ struct scalar_logistic_op : scalar_logistic_op_impl<T> {};
 
 // TODO(rmlarsen): Enable the following on host when integer_packet is defined
 // for the relevant packet types.
-#ifdef EIGEN_GPU_CC
+#ifndef EIGEN_GPUCC
 
 /** \internal
  * \brief Template specialization of the logistic function for float.
@@ -1202,7 +1296,7 @@ struct scalar_logistic_op<float> {
     p = pmadd(r2, p, p_low);
 
     // 4. Undo subtractive range reduction exp(m*ln(2) + r) = 2^m * exp(r).
-    Packet e = pldexp_fast_impl<Packet>::run(p, m);
+    Packet e = pldexp_fast(p, m);
 
     // 5. Undo multiplicative range reduction by using exp(r) = exp(r/2)^2.
     e = pmul(e, e);
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c4fa771..e72c6b4 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -305,7 +305,8 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) {
  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same
- * dimension.
+ *                         dimension.
+ * \param[in] num_threads Input: the number of threads used for the computation.
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
  * this function computes the blocking size parameters along the respective dimensions
@@ -718,10 +719,10 @@ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_,
     LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
     RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
     RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
-    // FIXME: should depend on NumberOfRegisters
     nr = 4,
-    mr = ResPacketSize,
+    mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
 
     LhsProgress = ResPacketSize,
     RhsProgress = 1
@@ -795,8 +796,8 @@ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_,
                                                                                          DoublePacket<ResPacketType>& c,
                                                                                          TmpType& /*tmp*/,
                                                                                          const LaneIdType&) const {
-    c.first = padd(pmul(a, b.first), c.first);
-    c.second = padd(pmul(a, b.second), c.second);
+    c.first = pmadd(a, b.first, c.first);
+    c.second = pmadd(a, b.second, c.second);
   }
 
   template <typename LaneIdType>
@@ -1117,7 +1118,7 @@ struct lhs_process_one_packet {
     // loops on each largest micro horizontal panel of lhs
     // (LhsProgress x depth)
     for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
       EIGEN_IF_CONSTEXPR(nr >= 8) {
         for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
           const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
@@ -1257,7 +1258,7 @@ struct lhs_process_one_packet {
         traits.initAcc(C3);
         // To improve instruction pipelining, let's double the accumulation registers:
         //  even k will accumulate in C*, while odd k will accumulate in D*.
-        // This trick is crutial to get good performance with FMA, otherwise it is
+        // This trick is crucial to get good performance with FMA, otherwise it is
         // actually faster to perform separated MUL+ADD because of a naturally
         // better instruction-level parallelism.
         AccPacket D0, D1, D2, D3;
@@ -1467,7 +1468,7 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
                                                 (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
     for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
       const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
       EIGEN_IF_CONSTEXPR(nr >= 8) {
         for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
           for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
@@ -1935,7 +1936,7 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
 
     for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
       Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
       EIGEN_IF_CONSTEXPR(nr >= 8) {
         for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
           for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
@@ -2326,7 +2327,7 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
   }
   //---------- Process remaining rows, 1 at once ----------
   if (peeled_mc_quarter < rows) {
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
     EIGEN_IF_CONSTEXPR(nr >= 8) {
       // loop on each panel of the rhs
       for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
@@ -2852,7 +2853,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
   Index count = 0;
   const Index peeled_k = (depth / PacketSize) * PacketSize;
 
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
   EIGEN_IF_CONSTEXPR(nr >= 8) {
     for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
       // skip what we have before
@@ -3035,7 +3036,7 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMo
     Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
     Index count = 0;
 
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
     EIGEN_IF_CONSTEXPR(nr >= 8) {
       for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
         // skip what we have before
@@ -3130,9 +3131,8 @@ inline std::ptrdiff_t l2CacheSize() {
   return l2;
 }
 
-/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
-rs.
-* \sa setCpuCacheSize */
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSize */
 inline std::ptrdiff_t l3CacheSize() {
   std::ptrdiff_t l1, l2, l3;
   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index e9d0cae..ebfac01 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -97,6 +97,7 @@ struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, Conjugat
         // Then, we set info->task_info[tid].users to the number of threads to mark that all other threads are going to
         // use it.
         while (info->task_info[tid].users != 0) {
+          std::this_thread::yield();
         }
         info->task_info[tid].users = threads;
 
@@ -115,6 +116,7 @@ struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, Conjugat
           // However, no need to wait for the B' part which has been updated by the current thread!
           if (shift > 0) {
             while (info->task_info[i].sync != k) {
+              std::this_thread::yield();
             }
           }
 
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index ac94b3f..bf27567 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -68,6 +68,10 @@ struct general_matrix_matrix_triangular_product<Index, LhsScalar, LhsStorageOrde
                                       const RhsScalar* rhs_, Index rhsStride, ResScalar* res_, Index resIncr,
                                       Index resStride, const ResScalar& alpha,
                                       level3_blocking<LhsScalar, RhsScalar>& blocking) {
+    if (size == 0) {
+      return;
+    }
+
     typedef gebp_traits<LhsScalar, RhsScalar> Traits;
 
     typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
@@ -157,7 +161,7 @@ struct tribb_kernel {
     gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
     gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
 
-    Matrix<ResScalar, BlockSize, BlockSize, ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
+    Matrix<ResScalar, BlockSize, BlockSize, ColMajor> buffer;
 
     // let's process the block per panel of actual_mc x BlockSize,
     // again, each is split into three parts, etc.
@@ -304,15 +308,19 @@ struct general_product_to_triangular_selector<MatrixType, ProductType, UpLo, fal
   }
 };
 
-template <typename MatrixType, unsigned int UpLo>
+template <typename MatrixType_, unsigned int Mode_>
 template <typename ProductType>
-EIGEN_DEVICE_FUNC TriangularView<MatrixType, UpLo>& TriangularViewImpl<MatrixType, UpLo, Dense>::_assignProduct(
-    const ProductType& prod, const Scalar& alpha, bool beta) {
-  EIGEN_STATIC_ASSERT((UpLo & UnitDiag) == 0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename TriangularViewImpl<MatrixType_, Mode_, Dense>::TriangularViewType&
+TriangularViewImpl<MatrixType_, Mode_, Dense>::_assignProduct(
+    const ProductType& prod, const typename TriangularViewImpl<MatrixType_, Mode_, Dense>::Scalar& alpha, bool beta) {
+  EIGEN_STATIC_ASSERT((Mode_ & UnitDiag) == 0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
   eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
 
-  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize == 1>::
-      run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
+  general_product_to_triangular_selector<MatrixType_, ProductType, Mode_,
+                                         internal::traits<ProductType>::InnerSize == 1>::run(derived()
+                                                                                                 .nestedExpression()
+                                                                                                 .const_cast_derived(),
+                                                                                             prod, alpha, beta);
 
   return derived();
 }
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index e138535..6817cc0 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -137,7 +137,7 @@ EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
 EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
 #endif
 
-// TODO hanlde complex cases
+// TODO handle complex cases
 // EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
 // EIGEN_BLAS_RANKUPDATE_C(scomplex, float,  float, cherk_)
 
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index 56743da..913beb6 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -55,7 +55,7 @@ namespace internal {
                                        ConjugateRhs, ColMajor, 1> {                                                 \
     typedef gebp_traits<EIGTYPE, EIGTYPE> Traits;                                                                   \
                                                                                                                     \
-    static void run(Index rows, Index cols, Index depth, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs, \
+    static void run(Index rows, Index cols, Index depth, const EIGTYPE* lhs_, Index lhsStride, const EIGTYPE* rhs_, \
                     Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha,                   \
                     level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, GemmParallelInfo<Index>* /*info = 0*/) {       \
       using std::conj;                                                                                              \
@@ -84,20 +84,20 @@ namespace internal {
                                                                                                                     \
       /* Set a, b, c */                                                                                             \
       if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {                                                        \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, m, k, OuterStride<>(lhsStride));                 \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));                 \
         a_tmp = lhs.conjugate();                                                                                    \
         a = a_tmp.data();                                                                                           \
         lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                        \
       } else                                                                                                        \
-        a = _lhs;                                                                                                   \
+        a = lhs_;                                                                                                   \
                                                                                                                     \
       if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {                                                        \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, k, n, OuterStride<>(rhsStride));                 \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));                 \
         b_tmp = rhs.conjugate();                                                                                    \
         b = b_tmp.data();                                                                                           \
         ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                        \
       } else                                                                                                        \
-        b = _rhs;                                                                                                   \
+        b = rhs_;                                                                                                   \
                                                                                                                     \
       BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda,   \
                (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc);           \
@@ -116,6 +116,88 @@ GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
 GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
 #endif
 
+// If OpenBLAS with BUILD_BFLOAT16=1 support is available,
+// use sbgemm for bfloat16.
+#if EIGEN_USE_OPENBLAS_BFLOAT16
+
+extern "C" {
+// OpenBLAS prototype.
+void sbgemm_(const char* trans_a, const char* trans_b, const int* M, const int* N, const int* K, const float* alpha,
+             const Eigen::bfloat16* A, const int* lda, const Eigen::bfloat16* B, const int* ldb, const float* beta,
+             float* C, const int* ldc);
+}  // extern "C"
+
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index, Eigen::bfloat16, LhsStorageOrder, ConjugateLhs, Eigen::bfloat16,
+                                     RhsStorageOrder, ConjugateRhs, ColMajor, 1> {
+  typedef gebp_traits<Eigen::bfloat16, Eigen::bfloat16> Traits;
+  // Computes res += alpha * op(lhs) * op(rhs) through OpenBLAS sbgemm_ (bfloat16 inputs, float output buffer).
+  static void run(Index rows, Index cols, Index depth, const Eigen::bfloat16* lhs_, Index lhsStride,
+                  const Eigen::bfloat16* rhs_, Index rhsStride, Eigen::bfloat16* res, Index resIncr, Index resStride,
+                  Eigen::bfloat16 alpha, level3_blocking<Eigen::bfloat16, Eigen::bfloat16>& /*blocking*/,
+                  GemmParallelInfo<Index>* /*info = 0*/) {
+    using std::conj;
+    if (rows == 0 || cols == 0 || depth == 0) return;
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr);
+    eigen_assert(resIncr == 1);
+    char transa, transb;
+    BlasIndex m, n, k, lda, ldb, ldc;
+    const Eigen::bfloat16 *a, *b;
+
+    float falpha = static_cast<float>(alpha);
+    float fbeta = float(1.0);  // beta == 1: sbgemm_ adds the product onto the contents of its C argument.
+
+    using MatrixXbf16 = Matrix<Eigen::bfloat16, Dynamic, Dynamic>;
+    MatrixXbf16 a_tmp, b_tmp;
+    MatrixXf r_tmp;
+
+    /* Set transpose options */
+    transa = (LhsStorageOrder == RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N';
+    transb = (RhsStorageOrder == RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N';
+
+    /* Set m, n, k */
+    m = convert_index<BlasIndex>(rows);
+    n = convert_index<BlasIndex>(cols);
+    k = convert_index<BlasIndex>(depth);
+
+    /* Set lda, ldb, ldc */
+    lda = convert_index<BlasIndex>(lhsStride);
+    ldb = convert_index<BlasIndex>(rhsStride);
+    ldc = convert_index<BlasIndex>(m);  // C is the dense m x n float temporary r_tmp, not res.
+
+    /* Set a, b, c */
+    if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {
+      Map<const MatrixXbf16, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));
+      a_tmp = lhs.conjugate();
+      a = a_tmp.data();
+      lda = convert_index<BlasIndex>(a_tmp.outerStride());
+    } else {
+      a = lhs_;
+    }
+
+    if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {
+      Map<const MatrixXbf16, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));
+      b_tmp = rhs.conjugate();
+      b = b_tmp.data();
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride());
+    } else {
+      b = rhs_;
+    }
+
+    // Seed the float accumulator with the current output so that beta == 1 never reads uninitialized memory.
+    Map<MatrixXbf16, 0, OuterStride<> > result(res, m, n, OuterStride<>(resStride));
+    r_tmp = result.cast<float>();
+
+    sbgemm_(&transa, &transb, &m, &n, &k, (const float*)&numext::real_ref(falpha), a, &lda, b, &ldb,
+            (const float*)&numext::real_ref(fbeta), r_tmp.data(), &ldc);
+
+    // Round the accumulated float result back into the bfloat16 output.
+    result = r_tmp.cast<Eigen::bfloat16>();
+  }
+};
+
+#endif  // EIGEN_USE_OPENBLAS_BFLOAT16
+
 }  // namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index afd8155..ba72a8a 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -64,7 +64,7 @@ class gemv_traits {
 
 /* Optimized col-major matrix * vector product:
  * This algorithm processes the matrix per vertical panels,
- * which are then processed horizontally per chunck of 8*PacketSize x 1 vertical segments.
+ * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
  *
  * Mixing type logic: C += alpha * A * B
  *  |  A  |  B  |alpha| comments
@@ -112,7 +112,7 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLh
   eigen_internal_assert(resIncr == 1);
 
   // The following copy tells the compiler that lhs's attributes are not modified outside this function
-  // This helps GCC to generate propoer code.
+  // This helps GCC to generate proper code.
   LhsMapper lhs(alhs);
 
   conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
@@ -302,7 +302,7 @@ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLh
                               Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
                                             ResScalar* res, Index resIncr, ResScalar alpha) {
   // The following copy tells the compiler that lhs's attributes are not modified outside this function
-  // This helps GCC to generate propoer code.
+  // This helps GCC to generate proper code.
   LhsMapper lhs(alhs);
 
   eigen_internal_assert(rhs.stride() == 1);
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 667fea2..b1b89ef 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -47,7 +47,7 @@ inline void manage_multi_threading(Action action, int* v);
 // Public APIs.
 
 /** Must be call first when calling Eigen from multiple threads */
-EIGEN_DEPRECATED inline void initParallel() {}
+EIGEN_DEPRECATED_WITH_REASON("Initialization is no longer needed.") inline void initParallel() {}
 
 /** \returns the max number of threads reserved for Eigen
  * \sa setNbThreads */
@@ -71,7 +71,7 @@ inline void setNbThreads(int v) { internal::manage_multi_threading(SetAction, &v
 // TODO(rmlarsen): Make the device API available instead of
 // storing a local static pointer variable to avoid this issue.
 inline ThreadPool* setGemmThreadPool(ThreadPool* new_pool) {
-  static ThreadPool* pool;
+  static ThreadPool* pool = nullptr;
   if (new_pool != nullptr) {
     // This will wait for work in all threads in *pool to finish,
     // then destroy the old ThreadPool, and then replace it with new_pool.
@@ -153,7 +153,14 @@ inline void manage_multi_threading(Action action, int* v) {
 #endif
   } else if (action == GetAction) {
     eigen_internal_assert(v != nullptr);
+#if defined(EIGEN_HAS_OPENMP)
+    if (m_maxThreads > 0)
+      *v = m_maxThreads;
+    else
+      *v = omp_get_max_threads();
+#else
     *v = m_maxThreads;
+#endif
   } else {
     eigen_internal_assert(false);
   }
@@ -210,7 +217,7 @@ EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index
     // Note that the actual number of threads might be lower than the number of
     // requested ones
     Index actual_threads = omp_get_num_threads();
-    GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
+    GemmParallelInfo<Index> info(static_cast<int>(i), static_cast<int>(actual_threads), task_info);
 
     Index blockCols = (cols / actual_threads) & ~Index(0x3);
     Index blockRows = (rows / actual_threads);
@@ -232,7 +239,6 @@ EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index
   }
 
 #elif defined(EIGEN_GEMM_THREADPOOL)
-  ei_declare_aligned_stack_constructed_variable(GemmParallelTaskInfo<Index>, meta_info, threads, 0);
   Barrier barrier(threads);
   auto task = [=, &func, &barrier, &task_info](int i) {
     Index actual_threads = threads;
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 9333d16..580f6a8 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -164,6 +164,11 @@ struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
 
   enum { LhsUpLo = LhsMode & (Upper | Lower) };
 
+  // Verify that the Rhs is a vector in the correct orientation.
+  // Otherwise, we break the assumption that we are multiplying
+  // MxN * Nx1.
+  static_assert(Rhs::ColsAtCompileTime == 1, "The RHS must be a column vector.");
+
   template <typename Dest>
   static EIGEN_DEVICE_FUNC void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
     typedef typename Dest::Scalar ResScalar;
@@ -197,6 +202,7 @@ struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
 
     if (!EvalToDest) {
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = Dest::SizeAtCompileTime;
       Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
 #endif
@@ -205,6 +211,7 @@ struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
 
     if (!UseRhs) {
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = ActualRhsTypeCleaned::SizeAtCompileTime;
       Index size = rhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
 #endif
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index c541909..a0d05ef 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -113,13 +113,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<
   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
-  // To work around an "error: member reference base type 'Matrix<...>
-  // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
-  // not a structure or union" compilation error in nvcc (tested V8.0.61),
-  // create a dummy internal::constructor_without_unaligned_array_assert
-  // object to pass to the Matrix constructor.
-  internal::constructor_without_unaligned_array_assert a;
-  Matrix<Scalar, SmallPanelWidth, SmallPanelWidth, LhsStorageOrder> triangularBuffer(a);
+  Matrix<Scalar, SmallPanelWidth, SmallPanelWidth, LhsStorageOrder> triangularBuffer;
   triangularBuffer.setZero();
   if ((Mode & ZeroDiag) == ZeroDiag)
     triangularBuffer.diagonal().setZero();
@@ -245,8 +239,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<
   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
-  internal::constructor_without_unaligned_array_assert a;
-  Matrix<Scalar, SmallPanelWidth, SmallPanelWidth, RhsStorageOrder> triangularBuffer(a);
+  Matrix<Scalar, SmallPanelWidth, SmallPanelWidth, RhsStorageOrder> triangularBuffer;
   triangularBuffer.setZero();
   if ((Mode & ZeroDiag) == ZeroDiag)
     triangularBuffer.diagonal().setZero();
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index 05a5827..bef4cba 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -230,6 +230,7 @@ struct trmv_selector<Mode, ColMajor> {
 
     if (!evalToDest) {
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = Dest::SizeAtCompileTime;
       Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
 #endif
@@ -310,6 +311,7 @@ struct trmv_selector<Mode, RowMajor> {
 #endif
       }
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = ActualRhsTypeCleaned::SizeAtCompileTime;
       Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
 #endif
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 2122af9..8244758 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -57,7 +57,7 @@ EIGEN_STRONG_INLINE void trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageO
     Index rs = size - k - 1;  // remaining size
     Index s = TriStorageOrder == RowMajor ? (IsLower ? 0 : i + 1) : IsLower ? i + 1 : i - rs;
 
-    Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(Scalar(1)/conj(tri(i,i)));
+    Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(Scalar(1) / conj(tri(i, i)));
     for (Index j = 0; j < otherSize; ++j) {
       if (TriStorageOrder == RowMajor) {
         Scalar b(0);
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index c2994b2..19d9917 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -241,7 +241,7 @@ class blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, 1> {
 
   EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
   EIGEN_DEVICE_FUNC const Index incr() const { return 1; }
-  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_data; }
 
   EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
     if (std::uintptr_t(m_data) % sizeof(Scalar)) {
@@ -430,7 +430,7 @@ class blas_data_mapper {
 
   EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
   EIGEN_DEVICE_FUNC const Index incr() const { return m_incr.value(); }
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() const { return m_data; }
 
  protected:
   Scalar* EIGEN_RESTRICT m_data;
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 1c72173..c2546a0 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -38,6 +38,19 @@
 #define EIGEN_ALIGNOF(x) alignof(x)
 #endif
 
+// Align to the boundary that avoids false sharing.
+//   https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
+// There is a bug in android NDK < r26 where the macro is defined but std::hardware_destructive_interference_size
+// still does not exist.
+#if defined(__cpp_lib_hardware_interference_size) && __cpp_lib_hardware_interference_size >= 201603 && \
+    (!EIGEN_OS_ANDROID || __NDK_MAJOR__ + 0 >= 26)
+#include <new>
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(std::hardware_destructive_interference_size)
+#else
+// Overalign for the cache line size of 128 bytes (Apple M1)
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(128)
+#endif
+
 // If the user explicitly disable vectorization, then we also disable alignment
 #if defined(EIGEN_DONT_VECTORIZE)
 #if defined(EIGEN_GPUCC)
@@ -90,8 +103,8 @@
 // certain common platform (compiler+architecture combinations) to avoid these problems.
 // Only static alignment is really problematic (relies on nonstandard compiler extensions),
 // try to keep heap alignment even when we have to disable static alignment.
-#if EIGEN_COMP_GNUC && \
-    !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
+#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \
+                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64)
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #else
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
@@ -215,7 +228,7 @@
 #define EIGEN_VECTORIZE_SSE4_2
 #endif
 #ifdef __AVX__
-#ifndef EIGEN_USE_SYCL
+#if !defined(EIGEN_USE_SYCL) && !EIGEN_COMP_EMSCRIPTEN
 #define EIGEN_VECTORIZE_AVX
 #endif
 #define EIGEN_VECTORIZE_SSE3
@@ -272,6 +285,8 @@
 #ifdef __AVX512FP16__
 #ifdef __AVX512VL__
 #define EIGEN_VECTORIZE_AVX512FP16
+// Built-in _Float16.
+#define EIGEN_HAS_BUILTIN_FLOAT16 1
 #else
 #if EIGEN_COMP_GNUC
 #error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported.
@@ -420,6 +435,12 @@ extern "C" {
 #include <msa.h>
 #endif
 
+#elif (defined __loongarch64 && defined __loongarch_sx)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_LSX
+#include <lsxintrin.h>
+
 #elif defined __HVX__ && (__HVX_LENGTH__ == 128)
 
 #define EIGEN_VECTORIZE
@@ -510,6 +531,8 @@ inline static const char *SimdInstructionSetsInUse(void) {
   return "S390X ZVECTOR";
 #elif defined(EIGEN_VECTORIZE_MSA)
   return "MIPS MSA";
+#elif defined(EIGEN_VECTORIZE_LSX)
+  return "LOONGARCH64 LSX";
 #else
   return "None";
 #endif
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 8b06c67..fcc2db8 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -29,9 +29,9 @@ const int Dynamic = -1;
  */
 const int DynamicIndex = 0xffffff;
 
-/** This value means that the increment to go from one value to another in a sequence is not constant for each step.
+/** This value means that the requested value is not defined.
  */
-const int UndefinedIncr = 0xfffffe;
+const int Undefined = 0xfffffe;
 
 /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
  * The value Infinity there means the L-infinity norm.
@@ -152,7 +152,7 @@ const unsigned int LvalueBit = 0x20;
  * Means that the underlying array of coefficients can be directly accessed as a plain strided array. The memory layout
  * of the array of coefficients must be exactly the natural one suggested by rows(), cols(),
  * outerStride(), innerStride(), and the RowMajorBit. This rules out expressions such as Diagonal, whose coefficients,
- * though referencable, do not have such a regular memory layout.
+ * though referenceable, do not have such a regular memory layout.
  *
  * See the comment on LvalueBit for an explanation of how LvalueBit and DirectAccessBit are mutually orthogonal.
  */
@@ -474,6 +474,7 @@ enum Type {
   MSA = 0x5,
   SVE = 0x6,
   HVX = 0x7,
+  LSX = 0x8,
 #if defined EIGEN_VECTORIZE_SSE
   Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -488,6 +489,8 @@ enum Type {
   Target = MSA
 #elif defined EIGEN_VECTORIZE_HVX
   Target = HVX
+#elif defined EIGEN_VECTORIZE_LSX
+  Target = LSX
 #else
   Target = Generic
 #endif
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 32a427d..ab0c542 100644
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -83,7 +83,7 @@
 #endif
 #endif
 
-#if defined __NVCC__
+#if defined __NVCC__ && defined __CUDACC__
 // MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
 // we instead use Microsoft's __pragma extension.
 #if defined _MSC_VER
diff --git a/Eigen/src/Core/util/EmulateArray.h b/Eigen/src/Core/util/EmulateArray.h
index f2fd10b..6c4c22d 100644
--- a/Eigen/src/Core/util/EmulateArray.h
+++ b/Eigen/src/Core/util/EmulateArray.h
@@ -248,15 +248,15 @@ namespace internal {
 #endif
 
 template <std::size_t I_, class T, std::size_t N>
-constexpr inline T& array_get(std::array<T, N>& a) {
+constexpr T& array_get(std::array<T, N>& a) {
   return (T&)STD_GET_ARR_HACK;
 }
 template <std::size_t I_, class T, std::size_t N>
-constexpr inline T&& array_get(std::array<T, N>&& a) {
+constexpr T&& array_get(std::array<T, N>&& a) {
   return (T&&)STD_GET_ARR_HACK;
 }
 template <std::size_t I_, class T, std::size_t N>
-constexpr inline T const& array_get(std::array<T, N> const& a) {
+constexpr T const& array_get(std::array<T, N> const& a) {
   return (T const&)STD_GET_ARR_HACK;
 }
 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index c312939..e0bc57e 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -91,6 +91,8 @@ template <typename XprType, typename RowIndices, typename ColIndices>
 class IndexedView;
 template <typename XprType, int Rows = Dynamic, int Cols = Dynamic, int Order = 0>
 class Reshaped;
+template <typename FirstType, typename SizeType, typename IncrType>
+class ArithmeticSequence;
 
 template <typename MatrixType, int Size = Dynamic>
 class VectorBlock;
@@ -169,6 +171,8 @@ template <typename MatrixType, unsigned int Mode>
 class TriangularView;
 template <typename MatrixType, unsigned int Mode>
 class SelfAdjointView;
+template <typename Derived>
+class RealView;
 template <typename MatrixType>
 class SparseView;
 template <typename ExpressionType>
@@ -395,8 +399,6 @@ template <typename Scalar_, int Rows_, int Cols_,
                                                                    : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION),
           int MaxRows_ = Rows_, int MaxCols_ = Cols_>
 class Array;
-template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select;
 template <typename MatrixType, typename BinaryOp, int Direction>
 class PartialReduxExpr;
 template <typename ExpressionType, int Direction>
@@ -495,11 +497,28 @@ class MatrixComplexPowerReturnValue;
 namespace internal {
 template <typename Scalar>
 struct stem_function {
-  typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
   typedef ComplexScalar type(ComplexScalar, int);
 };
 }  // namespace internal
 
+template <typename XprType, typename Device>
+struct DeviceWrapper;
+
+namespace internal {
+template <typename Xpr>
+struct eigen_fill_helper;
+template <typename Xpr, bool use_fill = eigen_fill_helper<Xpr>::value>
+struct eigen_fill_impl;
+template <typename Xpr>
+struct eigen_memset_helper;
+template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value>
+struct eigen_zero_impl;
+
+template <typename Packet>
+struct has_packet_segment : std::false_type {};
+}  // namespace internal
+
 }  // end namespace Eigen
 
 #endif  // EIGEN_FORWARDDECLARATIONS_H
diff --git a/Eigen/src/Core/util/GpuHipCudaDefines.inc b/Eigen/src/Core/util/GpuHipCudaDefines.inc
new file mode 100644
index 0000000..4e10500
--- /dev/null
+++ b/Eigen/src/Core/util/GpuHipCudaDefines.inc
@@ -0,0 +1,101 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
+#define EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H  // same macro that GpuHipCudaUndefines.inc tests and #undef's
+
+// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
+// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU,  but
+// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
+// When compiling such files, gcc will end up trying to pick up the CUDA headers by
+// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
+// To work around this issue for HIP systems (and leave the default behaviour intact), the
+// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
+// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
+// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
+
+#if defined(EIGEN_USE_HIP)
+
+#define gpuStream_t hipStream_t
+#define gpuDeviceProp_t hipDeviceProp_t
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+#define gpuErrorNotReady hipErrorNotReady
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
+#define gpuGetErrorString hipGetErrorString
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuStreamDefault hipStreamDefault
+#define gpuGetDevice hipGetDevice
+#define gpuSetDevice hipSetDevice
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemsetAsync hipMemsetAsync
+#define gpuMemset2DAsync hipMemset2DAsync
+#define gpuMemcpyAsync hipMemcpyAsync
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStreamQuery hipStreamQuery
+#define gpuSharedMemConfig hipSharedMemConfig
+#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
+#define gpuStreamSynchronize hipStreamSynchronize
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+
+#else
+
+#define gpuStream_t cudaStream_t
+#define gpuDeviceProp_t cudaDeviceProp
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+#define gpuErrorNotReady cudaErrorNotReady
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
+#define gpuGetErrorString cudaGetErrorString
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuStreamDefault cudaStreamDefault
+#define gpuGetDevice cudaGetDevice
+#define gpuSetDevice cudaSetDevice
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemset2DAsync cudaMemset2DAsync
+#define gpuMemcpyAsync cudaMemcpyAsync
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStreamQuery cudaStreamQuery
+#define gpuSharedMemConfig cudaSharedMemConfig
+#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
+#define gpuStreamSynchronize cudaStreamSynchronize
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+
+#endif
+
+// gpu_assert can be overridden
+#ifndef gpu_assert
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC does not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) eigen_assert(COND)
+#endif
+
+#endif  // gpu_assert
+
+#endif  // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
diff --git a/Eigen/src/Core/util/GpuHipCudaUndefines.inc b/Eigen/src/Core/util/GpuHipCudaUndefines.inc
new file mode 100644
index 0000000..342a323
--- /dev/null
+++ b/Eigen/src/Core/util/GpuHipCudaUndefines.inc
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
+
+#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef gpuStream_t
+#undef gpuDeviceProp_t
+#undef gpuError_t
+#undef gpuSuccess
+#undef gpuErrorNotReady
+#undef gpuGetDeviceCount
+#undef gpuGetLastError
+#undef gpuPeekAtLastError
+#undef gpuGetErrorName
+#undef gpuGetErrorString
+#undef gpuGetDeviceProperties
+#undef gpuStreamDefault
+#undef gpuGetDevice
+#undef gpuSetDevice
+#undef gpuMalloc
+#undef gpuFree
+#undef gpuMemsetAsync
+#undef gpuMemset2DAsync
+#undef gpuMemcpyAsync
+#undef gpuMemcpyDeviceToDevice
+#undef gpuMemcpyDeviceToHost
+#undef gpuMemcpyHostToDevice
+#undef gpuStreamQuery
+#undef gpuSharedMemConfig
+#undef gpuDeviceSetSharedMemConfig
+#undef gpuStreamSynchronize
+#undef gpuDeviceSynchronize
+#undef gpuMemcpy
+
+#endif  // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
+
+#endif  // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
index 9d1b348..abf4b19 100644
--- a/Eigen/src/Core/util/IndexedViewHelper.h
+++ b/Eigen/src/Core/util/IndexedViewHelper.h
@@ -17,6 +17,9 @@ namespace Eigen {
 
 namespace internal {
 struct symbolic_last_tag {};
+
+struct all_t {};
+
 }  // namespace internal
 
 namespace placeholders {
@@ -42,171 +45,442 @@ typedef symbolic::SymbolExpr<internal::symbolic_last_tag> last_t;
  *
  * \sa end
  */
-static const last_t last;
+static constexpr const last_t last;
+
+typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,
+                          symbolic::ValueExpr<Eigen::internal::FixedInt<1>>>
+    lastp1_t;
+typedef Eigen::internal::all_t all_t;
+
+/** \var lastp1
+ * \ingroup Core_Module
+ *
+ * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
+ * reference the last+1 element/row/columns of the underlying vector or matrix once
+ * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+ *
+ * This symbolic placeholder supports standard arithmetic operations.
+ * It is essentially an alias to last+fix<1>.
+ *
+ * \sa last
+ */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+static constexpr auto lastp1 = last + fix<1>;
+#else
+// Using a FixedExpr<1> expression is important here to make sure the compiler
+// can fully optimize the computation starting indices with zero overhead.
+static constexpr lastp1_t lastp1 = lastp1_t{};
+#endif
+
+/** \var end
+ * \ingroup Core_Module
+ * \sa lastp1
+ */
+static constexpr lastp1_t end = lastp1;
+
+/** \var all
+ * \ingroup Core_Module
+ * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or
+ * columns
+ */
+static constexpr Eigen::internal::all_t all;
 
 }  // namespace placeholders
 
 namespace internal {
 
-// Replace symbolic last/end "keywords" by their true runtime value
-inline Index eval_expr_given_size(Index x, Index /* size */) { return x; }
-
-template <int N>
-FixedInt<N> eval_expr_given_size(FixedInt<N> x, Index /*size*/) {
-  return x;
-}
+// Evaluate a symbolic expression or constant given the "size" of an object, allowing
+// any symbols like `last` to be evaluated.  The default here assumes a dynamic constant.
+template <typename Expr, int SizeAtCompileTime, typename EnableIf = void>
+struct SymbolicExpressionEvaluator {
+  static constexpr Index ValueAtCompileTime = Undefined;
+  static Index eval(const Expr& expr, Index /*size*/) { return static_cast<Index>(expr); }
+};
 
-template <typename Derived>
-Index eval_expr_given_size(const symbolic::BaseExpr<Derived>& x, Index size) {
-  return x.derived().eval(Eigen::placeholders::last = size - 1);
-}
+// Symbolic expression with size known at compile-time.
+template <typename Expr, int SizeAtCompileTime>
+struct SymbolicExpressionEvaluator<Expr, SizeAtCompileTime, std::enable_if_t<symbolic::is_symbolic<Expr>::value>> {
+  static constexpr Index ValueAtCompileTime =
+      Expr::Derived::eval_at_compile_time(Eigen::placeholders::last = fix<SizeAtCompileTime - 1>);
+  static Index eval(const Expr& expr, Index /*size*/) {
+    return expr.eval(Eigen::placeholders::last = fix<SizeAtCompileTime - 1>);
+  }
+};
 
-// Extract increment/step at compile time
-template <typename T, typename EnableIf = void>
-struct get_compile_time_incr {
-  enum { value = UndefinedIncr };
+// Symbolic expression with dynamic size.
+template <typename Expr>
+struct SymbolicExpressionEvaluator<Expr, Dynamic, std::enable_if_t<symbolic::is_symbolic<Expr>::value>> {
+  static constexpr Index ValueAtCompileTime = Undefined;
+  static Index eval(const Expr& expr, Index size) { return expr.eval(Eigen::placeholders::last = size - 1); }
 };
 
-template <typename T>
-constexpr Index get_runtime_incr(const T&) EIGEN_NOEXCEPT {
-  return Index(1);
-}
+// Fixed int.
+template <int N, int SizeAtCompileTime>
+struct SymbolicExpressionEvaluator<FixedInt<N>, SizeAtCompileTime, void> {
+  static constexpr Index ValueAtCompileTime = static_cast<Index>(N);
+  static Index eval(const FixedInt<N>& /*expr*/, Index /*size*/) { return ValueAtCompileTime; }
+};
 
-// Analogue of std::get<0>(x), but tailored for our needs.
-template <typename T>
-EIGEN_CONSTEXPR Index first(const T& x) EIGEN_NOEXCEPT {
-  return x.first();
-}
+//--------------------------------------------------------------------------------
+// Handling of generic indices (e.g. array)
+//--------------------------------------------------------------------------------
 
-// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by
-// MatrixSlice The generic implementation is a no-op
-template <typename T, int XprSize, typename EnableIf = void>
-struct IndexedViewCompatibleType {
-  typedef T type;
+// Potentially wrap indices in a type that is better-suited for IndexedView evaluation.
+template <typename Indices, int NestedSizeAtCompileTime, typename EnableIf = void>
+struct IndexedViewHelperIndicesWrapper {
+  using type = Indices;
+  static const type& CreateIndexSequence(const Indices& indices, Index /*nested_size*/) { return indices; }
 };
 
-template <typename T, typename Q>
-const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) {
-  return x;
-}
+// Extract compile-time and runtime first, size, increments.
+template <typename Indices, typename EnableIf = void>
+struct IndexedViewHelper {
+  static constexpr Index FirstAtCompileTime = Undefined;
+  static constexpr Index SizeAtCompileTime = array_size<Indices>::value;
+  static constexpr Index IncrAtCompileTime = Undefined;
+
+  static constexpr Index first(const Indices& indices) { return static_cast<Index>(indices[0]); }
+  static constexpr Index size(const Indices& indices) { return index_list_size(indices); }
+  static constexpr Index incr(const Indices& /*indices*/) { return Undefined; }
+};
 
 //--------------------------------------------------------------------------------
-// Handling of a single Index
+// Handling of ArithmeticSequence
 //--------------------------------------------------------------------------------
 
-struct SingleRange {
-  enum { SizeAtCompileTime = 1 };
-  SingleRange(Index val) : m_value(val) {}
-  Index operator[](Index) const { return m_value; }
-  static EIGEN_CONSTEXPR Index size() EIGEN_NOEXCEPT { return 1; }
-  Index first() const EIGEN_NOEXCEPT { return m_value; }
-  Index m_value;
+template <Index FirstAtCompileTime_, Index SizeAtCompileTime_, Index IncrAtCompileTime_>
+class ArithmeticSequenceRange {
+ public:
+  static constexpr Index FirstAtCompileTime = FirstAtCompileTime_;
+  static constexpr Index SizeAtCompileTime = SizeAtCompileTime_;
+  static constexpr Index IncrAtCompileTime = IncrAtCompileTime_;
+
+  constexpr ArithmeticSequenceRange(Index first, Index size, Index incr) : first_{first}, size_{size}, incr_{incr} {}
+  constexpr Index operator[](Index i) const { return first() + i * incr(); }
+  constexpr Index first() const noexcept { return first_.value(); }
+  constexpr Index size() const noexcept { return size_.value(); }
+  constexpr Index incr() const noexcept { return incr_.value(); }
+
+ private:
+  variable_if_dynamicindex<Index, int(FirstAtCompileTime)> first_;
+  variable_if_dynamic<Index, int(SizeAtCompileTime)> size_;
+  variable_if_dynamicindex<Index, int(IncrAtCompileTime)> incr_;
 };
 
-template <>
-struct get_compile_time_incr<SingleRange> {
-  enum { value = 1 };  // 1 or 0 ??
+template <typename FirstType, typename SizeType, typename IncrType, int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<ArithmeticSequence<FirstType, SizeType, IncrType>, NestedSizeAtCompileTime,
+                                       void> {
+  static constexpr Index EvalFirstAtCompileTime =
+      SymbolicExpressionEvaluator<FirstType, NestedSizeAtCompileTime>::ValueAtCompileTime;
+  static constexpr Index EvalSizeAtCompileTime =
+      SymbolicExpressionEvaluator<SizeType, NestedSizeAtCompileTime>::ValueAtCompileTime;
+  static constexpr Index EvalIncrAtCompileTime =
+      SymbolicExpressionEvaluator<IncrType, NestedSizeAtCompileTime>::ValueAtCompileTime;
+
+  static constexpr Index FirstAtCompileTime =
+      (int(EvalFirstAtCompileTime) == Undefined) ? Index(DynamicIndex) : EvalFirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime =
+      (int(EvalSizeAtCompileTime) == Undefined) ? Index(Dynamic) : EvalSizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime =
+      (int(EvalIncrAtCompileTime) == Undefined) ? Index(DynamicIndex) : EvalIncrAtCompileTime;
+
+  using Indices = ArithmeticSequence<FirstType, SizeType, IncrType>;
+  using type = ArithmeticSequenceRange<FirstAtCompileTime, SizeAtCompileTime, IncrAtCompileTime>;
+
+  static type CreateIndexSequence(const Indices& indices, Index nested_size) {
+    Index first =
+        SymbolicExpressionEvaluator<FirstType, NestedSizeAtCompileTime>::eval(indices.firstObject(), nested_size);
+    Index size =
+        SymbolicExpressionEvaluator<SizeType, NestedSizeAtCompileTime>::eval(indices.sizeObject(), nested_size);
+    Index incr =
+        SymbolicExpressionEvaluator<IncrType, NestedSizeAtCompileTime>::eval(indices.incrObject(), nested_size);
+    return type(first, size, incr);
+  }
 };
 
-// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int)
-// methods)
-template <typename T, int XprSize>
-struct IndexedViewCompatibleType<T, XprSize, std::enable_if_t<internal::is_integral<T>::value>> {
-  // Here we could simply use Array, but maybe it's less work for the compiler to use
-  // a simpler wrapper as SingleRange
-  // typedef Eigen::Array<Index,1,1> type;
-  typedef SingleRange type;
+template <Index FirstAtCompileTime_, Index SizeAtCompileTime_, Index IncrAtCompileTime_>
+struct IndexedViewHelper<ArithmeticSequenceRange<FirstAtCompileTime_, SizeAtCompileTime_, IncrAtCompileTime_>, void> {
+ public:
+  using Indices = ArithmeticSequenceRange<FirstAtCompileTime_, SizeAtCompileTime_, IncrAtCompileTime_>;
+  static constexpr Index FirstAtCompileTime = Indices::FirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Indices::SizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime = Indices::IncrAtCompileTime;
+  static Index first(const Indices& indices) { return indices.first(); }
+  static Index size(const Indices& indices) { return indices.size(); }
+  static Index incr(const Indices& indices) { return indices.incr(); }
 };
 
-template <typename T, int XprSize>
-struct IndexedViewCompatibleType<T, XprSize, std::enable_if_t<symbolic::is_symbolic<T>::value>> {
-  typedef SingleRange type;
+//--------------------------------------------------------------------------------
+// Handling of a single index.
+//--------------------------------------------------------------------------------
+
+template <Index ValueAtCompileTime>
+class SingleRange {
+ public:
+  static constexpr Index FirstAtCompileTime = ValueAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Index(1);
+  static constexpr Index IncrAtCompileTime = Index(1);  // Needs to be 1 to be treated as block-like.
+
+  constexpr SingleRange(Index v) noexcept : value_(v) {}
+  constexpr Index operator[](Index) const noexcept { return first(); }
+  constexpr Index first() const noexcept { return value_.value(); }
+  constexpr Index size() const noexcept { return SizeAtCompileTime; }
+  constexpr Index incr() const noexcept { return IncrAtCompileTime; }
+
+ private:
+  variable_if_dynamicindex<Index, int(ValueAtCompileTime)> value_;
 };
 
 template <typename T>
-std::enable_if_t<symbolic::is_symbolic<T>::value, SingleRange> makeIndexedViewCompatible(const T& id, Index size,
-                                                                                         SpecializedType) {
-  return eval_expr_given_size(id, size);
-}
+struct is_single_range : public std::false_type {};
+
+template <Index ValueAtCompileTime>
+struct is_single_range<SingleRange<ValueAtCompileTime>> : public std::true_type {};
+
+template <typename SingleIndex, int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<
+    SingleIndex, NestedSizeAtCompileTime,
+    std::enable_if_t<std::is_integral<SingleIndex>::value || symbolic::is_symbolic<SingleIndex>::value>> {
+  static constexpr Index EvalValueAtCompileTime =
+      SymbolicExpressionEvaluator<SingleIndex, NestedSizeAtCompileTime>::ValueAtCompileTime;
+  static constexpr Index ValueAtCompileTime =
+      (int(EvalValueAtCompileTime) == Undefined) ? Index(DynamicIndex) : EvalValueAtCompileTime;
+  using type = SingleRange<ValueAtCompileTime>;
+  static type CreateIndexSequence(const SingleIndex& index, Index nested_size) {
+    return type(SymbolicExpressionEvaluator<SingleIndex, NestedSizeAtCompileTime>::eval(index, nested_size));
+  }
+};
+
+template <int N, int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<FixedInt<N>, NestedSizeAtCompileTime, void> {
+  using type = SingleRange<Index(N)>;  // a FixedInt<N> index maps to a single-element range fixed at N
+  static type CreateIndexSequence(const FixedInt<N>& /*index*/, Index /*nested_size*/ = 0) { return type(Index(N)); }
+};
+
+template <Index ValueAtCompileTime>
+struct IndexedViewHelper<SingleRange<ValueAtCompileTime>, void> {
+  using Indices = SingleRange<ValueAtCompileTime>;
+  static constexpr Index FirstAtCompileTime = Indices::FirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Indices::SizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime = Indices::IncrAtCompileTime;
+
+  static constexpr Index first(const Indices& indices) { return indices.first(); }
+  static constexpr Index size(const Indices& /*indices*/) { return SizeAtCompileTime; }
+  static constexpr Index incr(const Indices& /*indices*/) { return IncrAtCompileTime; }
+};
 
 //--------------------------------------------------------------------------------
 // Handling of all
 //--------------------------------------------------------------------------------
 
-struct all_t {
-  all_t() {}
+// Convert a symbolic 'all' into a usable range type
+template <Index SizeAtCompileTime_>
+class AllRange {
+ public:
+  static constexpr Index FirstAtCompileTime = Index(0);
+  static constexpr Index SizeAtCompileTime = SizeAtCompileTime_;
+  static constexpr Index IncrAtCompileTime = Index(1);
+  constexpr AllRange(Index size) : size_(size) {}
+  constexpr Index operator[](Index i) const noexcept { return i; }
+  constexpr Index first() const noexcept { return FirstAtCompileTime; }
+  constexpr Index size() const noexcept { return size_.value(); }
+  constexpr Index incr() const noexcept { return IncrAtCompileTime; }
+
+ private:
+  variable_if_dynamic<Index, int(SizeAtCompileTime)> size_;
 };
 
-// Convert a symbolic 'all' into a usable range type
-template <int XprSize>
-struct AllRange {
-  enum { SizeAtCompileTime = XprSize };
-  AllRange(Index size = XprSize) : m_size(size) {}
-  EIGEN_CONSTEXPR Index operator[](Index i) const EIGEN_NOEXCEPT { return i; }
-  EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_size.value(); }
-  EIGEN_CONSTEXPR Index first() const EIGEN_NOEXCEPT { return 0; }
-  variable_if_dynamic<Index, XprSize> m_size;
+template <int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<all_t, NestedSizeAtCompileTime, void> {
+  using type = AllRange<Index(NestedSizeAtCompileTime)>;
+  static type CreateIndexSequence(const all_t& /*indices*/, Index nested_size) { return type(nested_size); }
 };
 
-template <int XprSize>
-struct IndexedViewCompatibleType<all_t, XprSize> {
-  typedef AllRange<XprSize> type;
+template <Index SizeAtCompileTime_>
+struct IndexedViewHelper<AllRange<SizeAtCompileTime_>, void> {
+  using Indices = AllRange<SizeAtCompileTime_>;
+  static constexpr Index FirstAtCompileTime = Indices::FirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Indices::SizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime = Indices::IncrAtCompileTime;
+
+  static Index first(const Indices& indices) { return indices.first(); }
+  static Index size(const Indices& indices) { return indices.size(); }
+  static Index incr(const Indices& indices) { return indices.incr(); }
 };
 
-template <typename XprSizeType>
-inline AllRange<get_fixed_value<XprSizeType>::value> makeIndexedViewCompatible(all_t, XprSizeType size,
-                                                                               SpecializedType) {
-  return AllRange<get_fixed_value<XprSizeType>::value>(size);
+// this helper class assumes internal::valid_indexed_view_overload<RowIndices, ColIndices>::value == true
+template <typename Derived, typename RowIndices, typename ColIndices, typename EnableIf = void>
+struct IndexedViewSelector;
+
+template <typename Indices, int SizeAtCompileTime>
+using IvcType = typename internal::IndexedViewHelperIndicesWrapper<Indices, SizeAtCompileTime>::type;
+
+template <int SizeAtCompileTime, typename Indices>
+inline IvcType<Indices, SizeAtCompileTime> CreateIndexSequence(Index size, const Indices& indices) {
+  return internal::IndexedViewHelperIndicesWrapper<Indices, SizeAtCompileTime>::CreateIndexSequence(indices, size);
+}
 
-template <int Size>
-struct get_compile_time_incr<AllRange<Size>> {
-  enum { value = 1 };
+// Generic
+template <typename Derived, typename RowIndices, typename ColIndices>
+struct IndexedViewSelector<Derived, RowIndices, ColIndices,
+                           std::enable_if_t<internal::traits<
+                               IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                           IvcType<ColIndices, Derived::ColsAtCompileTime>>>::ReturnAsIndexedView>> {
+  using ReturnType = IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                 IvcType<ColIndices, Derived::ColsAtCompileTime>>;
+  using ConstReturnType = IndexedView<const Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                      IvcType<ColIndices, Derived::ColsAtCompileTime>>;
+
+  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
+    return ReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices),
+                      CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
+                                    const ColIndices& colIndices) {
+    return ConstReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices),
+                           CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices));
+  }
 };
 
-}  // end namespace internal
-
-namespace placeholders {
+// Block
+template <typename Derived, typename RowIndices, typename ColIndices>
+struct IndexedViewSelector<
+    Derived, RowIndices, ColIndices,
+    std::enable_if_t<internal::traits<IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                                  IvcType<ColIndices, Derived::ColsAtCompileTime>>>::ReturnAsBlock>> {
+  using ActualRowIndices = IvcType<RowIndices, Derived::RowsAtCompileTime>;
+  using ActualColIndices = IvcType<ColIndices, Derived::ColsAtCompileTime>;
+  using IndexedViewType = IndexedView<Derived, ActualRowIndices, ActualColIndices>;
+  using ConstIndexedViewType = IndexedView<const Derived, ActualRowIndices, ActualColIndices>;
+  using ReturnType = typename internal::traits<IndexedViewType>::BlockType;
+  using ConstReturnType = typename internal::traits<ConstIndexedViewType>::BlockType;
+  using RowHelper = internal::IndexedViewHelper<ActualRowIndices>;
+  using ColHelper = internal::IndexedViewHelper<ActualColIndices>;
+
+  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return ReturnType(derived, RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices),
+                      RowHelper::size(actualRowIndices), ColHelper::size(actualColIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
+                                    const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return ConstReturnType(derived, RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices),
+                           RowHelper::size(actualRowIndices), ColHelper::size(actualColIndices));
+  }
+};
 
-typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,
-                          symbolic::ValueExpr<Eigen::internal::FixedInt<1>>>
-    lastp1_t;
-typedef Eigen::internal::all_t all_t;
+// Scalar
+template <typename Derived, typename RowIndices, typename ColIndices>
+struct IndexedViewSelector<
+    Derived, RowIndices, ColIndices,
+    std::enable_if_t<internal::traits<IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                                  IvcType<ColIndices, Derived::ColsAtCompileTime>>>::ReturnAsScalar>> {
+  using ReturnType = typename DenseBase<Derived>::Scalar&;
+  using ConstReturnType = typename DenseBase<Derived>::CoeffReturnType;
+  using ActualRowIndices = IvcType<RowIndices, Derived::RowsAtCompileTime>;
+  using ActualColIndices = IvcType<ColIndices, Derived::ColsAtCompileTime>;
+  using RowHelper = internal::IndexedViewHelper<ActualRowIndices>;
+  using ColHelper = internal::IndexedViewHelper<ActualColIndices>;
+  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return derived(RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
+                                    const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return derived(RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices));
+  }
+};
 
-/** \var lastp1
- * \ingroup Core_Module
- *
- * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
- * reference the last+1 element/row/columns of the underlying vector or matrix once
- * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
- *
- * This symbolic placeholder supports standard arithmetic operations.
- * It is essentially an alias to last+fix<1>.
- *
- * \sa last
- */
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-static const auto lastp1 = last + fix<1>;
-#else
-// Using a FixedExpr<1> expression is important here to make sure the compiler
-// can fully optimize the computation starting indices with zero overhead.
-static const lastp1_t lastp1(last + fix<1>());
-#endif
+// this helper class assumes internal::is_valid_index_type<Indices>::value == false
+template <typename Derived, typename Indices, typename EnableIf = void>
+struct VectorIndexedViewSelector;
+
+// Generic
+template <typename Derived, typename Indices>
+struct VectorIndexedViewSelector<
+    Derived, Indices,
+    std::enable_if_t<!internal::is_single_range<IvcType<Indices, Derived::SizeAtCompileTime>>::value &&
+                     internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>::IncrAtCompileTime !=
+                         1>> {
+  static constexpr bool IsRowMajor = DenseBase<Derived>::IsRowMajor;
+  using ZeroIndex = internal::SingleRange<Index(0)>;
+  using RowMajorReturnType = IndexedView<Derived, ZeroIndex, IvcType<Indices, Derived::SizeAtCompileTime>>;
+  using ConstRowMajorReturnType = IndexedView<const Derived, ZeroIndex, IvcType<Indices, Derived::SizeAtCompileTime>>;
+
+  using ColMajorReturnType = IndexedView<Derived, IvcType<Indices, Derived::SizeAtCompileTime>, ZeroIndex>;
+  using ConstColMajorReturnType = IndexedView<const Derived, IvcType<Indices, Derived::SizeAtCompileTime>, ZeroIndex>;
+
+  using ReturnType = typename internal::conditional<IsRowMajor, RowMajorReturnType, ColMajorReturnType>::type;
+  using ConstReturnType =
+      typename internal::conditional<IsRowMajor, ConstRowMajorReturnType, ConstColMajorReturnType>::type;
+
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
+  static inline RowMajorReturnType run(Derived& derived, const Indices& indices) {
+    return RowMajorReturnType(derived, ZeroIndex(0),
+                              CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), indices));
+  }
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
+  static inline ConstRowMajorReturnType run(const Derived& derived, const Indices& indices) {
+    return ConstRowMajorReturnType(derived, ZeroIndex(0),
+                                   CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), indices));
+  }
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
+  static inline ColMajorReturnType run(Derived& derived, const Indices& indices) {
+    return ColMajorReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), indices),
+                              ZeroIndex(0));
+  }
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
+  static inline ConstColMajorReturnType run(const Derived& derived, const Indices& indices) {
+    return ConstColMajorReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), indices),
+                                   ZeroIndex(0));
+  }
+};
 
-/** \var end
- * \ingroup Core_Module
- * \sa lastp1
- */
-static const lastp1_t end = lastp1;
+// Block
+template <typename Derived, typename Indices>
+struct VectorIndexedViewSelector<
+    Derived, Indices,
+    std::enable_if_t<!internal::is_single_range<IvcType<Indices, Derived::SizeAtCompileTime>>::value &&
+                     internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>::IncrAtCompileTime ==
+                         1>> {
+  using Helper = internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>;
+  using ReturnType = VectorBlock<Derived, Helper::SizeAtCompileTime>;
+  using ConstReturnType = VectorBlock<const Derived, Helper::SizeAtCompileTime>;
+  static inline ReturnType run(Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return ReturnType(derived, Helper::first(actualIndices), Helper::size(actualIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return ConstReturnType(derived, Helper::first(actualIndices), Helper::size(actualIndices));
+  }
+};
 
-/** \var all
- * \ingroup Core_Module
- * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or
- * columns
- */
-static const Eigen::internal::all_t all;
+// Symbolic
+template <typename Derived, typename Indices>
+struct VectorIndexedViewSelector<
+    Derived, Indices,
+    std::enable_if_t<internal::is_single_range<IvcType<Indices, Derived::SizeAtCompileTime>>::value>> {
+  using ReturnType = typename DenseBase<Derived>::Scalar&;
+  using ConstReturnType = typename DenseBase<Derived>::CoeffReturnType;
+  using Helper = internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>;
+  static inline ReturnType run(Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return derived(Helper::first(actualIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return derived(Helper::first(actualIndices));
+  }
+};
 
-}  // namespace placeholders
+}  // end namespace internal
 
 }  // end namespace Eigen
 
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index 279d553..53fabd5 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -54,65 +54,60 @@ class VariableAndFixedInt;
 template <int N>
 class FixedInt {
  public:
-  static const int value = N;
-  EIGEN_CONSTEXPR operator int() const { return value; }
-
-  EIGEN_CONSTEXPR
-  FixedInt() = default;
+  static constexpr int value = N;
+  constexpr operator int() const { return N; }
 
-  EIGEN_CONSTEXPR
-  FixedInt(std::integral_constant<int, N>) {}
+  constexpr FixedInt() = default;
+  constexpr FixedInt(std::integral_constant<int, N>) {}
 
-  EIGEN_CONSTEXPR
-  FixedInt(VariableAndFixedInt<N> other) {
+  constexpr FixedInt(VariableAndFixedInt<N> other) {
 #ifndef EIGEN_INTERNAL_DEBUGGING
     EIGEN_UNUSED_VARIABLE(other);
 #endif
     eigen_internal_assert(int(other) == N);
   }
 
-  EIGEN_CONSTEXPR
-  FixedInt<-N> operator-() const { return FixedInt<-N>(); }
+  constexpr FixedInt<-N> operator-() const { return FixedInt<-N>(); }
 
   template <int M>
-  EIGEN_CONSTEXPR FixedInt<N + M> operator+(FixedInt<M>) const {
+  constexpr FixedInt<N + M> operator+(FixedInt<M>) const {
     return FixedInt<N + M>();
   }
 
   template <int M>
-  EIGEN_CONSTEXPR FixedInt<N - M> operator-(FixedInt<M>) const {
+  constexpr FixedInt<N - M> operator-(FixedInt<M>) const {
     return FixedInt<N - M>();
   }
 
   template <int M>
-  EIGEN_CONSTEXPR FixedInt<N * M> operator*(FixedInt<M>) const {
+  constexpr FixedInt<N * M> operator*(FixedInt<M>) const {
     return FixedInt<N * M>();
   }
 
   template <int M>
-  EIGEN_CONSTEXPR FixedInt<N / M> operator/(FixedInt<M>) const {
+  constexpr FixedInt<N / M> operator/(FixedInt<M>) const {
     return FixedInt<N / M>();
   }
 
   template <int M>
-  EIGEN_CONSTEXPR FixedInt<N % M> operator%(FixedInt<M>) const {
+  constexpr FixedInt<N % M> operator%(FixedInt<M>) const {
     return FixedInt<N % M>();
   }
 
   template <int M>
-  EIGEN_CONSTEXPR FixedInt<N | M> operator|(FixedInt<M>) const {
+  constexpr FixedInt<N | M> operator|(FixedInt<M>) const {
     return FixedInt<N | M>();
   }
 
   template <int M>
-  EIGEN_CONSTEXPR FixedInt<N & M> operator&(FixedInt<M>) const {
+  constexpr FixedInt<N & M> operator&(FixedInt<M>) const {
     return FixedInt<N & M>();
   }
 
   // Needed in C++14 to allow fix<N>():
-  EIGEN_CONSTEXPR FixedInt operator()() const { return *this; }
+  constexpr FixedInt operator()() const { return *this; }
 
-  VariableAndFixedInt<N> operator()(int val) const { return VariableAndFixedInt<N>(val); }
+  constexpr VariableAndFixedInt<N> operator()(int val) const { return VariableAndFixedInt<N>(val); }
 };
 
 /** \internal
@@ -268,8 +263,8 @@ static const auto fix();
  * }
  * \endcode
  * In this example, the function Eigen::seqN knows that the second argument is expected to be a size.
- * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed,
- * and converted to an Eigen::Index of value \c n. Otherwise, the runtime-value \c n will be dissmissed, and the
+ * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dismissed,
+ * and converted to an Eigen::Index of value \c n. Otherwise, the runtime-value \c n will be dismissed, and the
  * returned ArithmeticSequence will be of the exact same type as <tt> seqN(0,fix<N>) </tt>.
  *
  * \sa fix, seqN, class ArithmeticSequence
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 5709257..db4a630 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -17,13 +17,9 @@
 // Eigen version and basic defaults
 //------------------------------------------------------------------------------------------
 
-#define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 4
-#define EIGEN_MINOR_VERSION 90
-
 #define EIGEN_VERSION_AT_LEAST(x, y, z) \
-  (EIGEN_WORLD_VERSION > x ||           \
-   (EIGEN_WORLD_VERSION >= x && (EIGEN_MAJOR_VERSION > y || (EIGEN_MAJOR_VERSION >= y && EIGEN_MINOR_VERSION >= z))))
+  (EIGEN_MAJOR_VERSION > x ||           \
+   (EIGEN_MAJOR_VERSION >= x && (EIGEN_MINOR_VERSION > y || (EIGEN_MINOR_VERSION >= y && EIGEN_PATCH_VERSION >= z))))
 
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
@@ -196,6 +192,13 @@
 #define EIGEN_COMP_PGI 0
 #endif
 
+/// \internal EIGEN_COMP_NVHPC set to NVHPC version if the compiler is nvc++
+#if defined(__NVCOMPILER)
+#define EIGEN_COMP_NVHPC (__NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__)
+#else
+#define EIGEN_COMP_NVHPC 0
+#endif
+
 /// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
 #if defined(__CC_ARM) || defined(__ARMCC_VERSION)
 #define EIGEN_COMP_ARM 1
@@ -212,7 +215,7 @@
 
 /// \internal EIGEN_COMP_FCC set to FCC version if the compiler is Fujitsu Compiler (traditional mode)
 /// \note The Fujitsu C/C++ compiler uses the traditional mode based
-/// on EDG g++ 6.1 by default or if envoked with the -Nnoclang flag
+/// on EDG g++ 6.1 by default or if invoked with the -Nnoclang flag
 #if defined(__FUJITSU)
 #define EIGEN_COMP_FCC (__FCC_major__ * 100 + __FCC_minor__ * 10 + __FCC_patchlevel__)
 #else
@@ -221,7 +224,7 @@
 
 /// \internal EIGEN_COMP_CLANGFCC set to FCC version if the compiler is Fujitsu Compiler (Clang mode)
 /// \note The Fujitsu C/C++ compiler uses the non-traditional mode
-/// based on Clang 7.1.0 if envoked with the -Nclang flag
+/// based on Clang 7.1.0 if invoked with the -Nclang flag
 #if defined(__CLANG_FUJITSU)
 #define EIGEN_COMP_CLANGFCC (__FCC_major__ * 100 + __FCC_minor__ * 10 + __FCC_patchlevel__)
 #else
@@ -369,6 +372,13 @@
 #define EIGEN_ARCH_MIPS 0
 #endif
 
+/// \internal EIGEN_ARCH_LOONGARCH64 set to 1 if the architecture is LOONGARCH64
+#if defined(__loongarch64)
+#define EIGEN_ARCH_LOONGARCH64 1
+#else
+#define EIGEN_ARCH_LOONGARCH64 0
+#endif
+
 /// \internal EIGEN_ARCH_SPARC set to 1 if the architecture is SPARC
 #if defined(__sparc__) || defined(__sparc)
 #define EIGEN_ARCH_SPARC 1
@@ -412,6 +422,16 @@
 // note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
 #if defined(__ANDROID__) || defined(ANDROID)
 #define EIGEN_OS_ANDROID 1
+
+// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
+// <android/ndk-version.h>. For NDK < r16, users should define these macros,
+// e.g. `-D__NDK_MAJOR__=11 -D__NDK_MINOR__=0` for NDK r11.
+#if defined __has_include
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif
+#endif
+
 #else
 #define EIGEN_OS_ANDROID 0
 #endif
@@ -920,6 +940,18 @@
 #define EIGEN_DEPRECATED
 #endif
 
+#ifndef EIGEN_NO_DEPRECATED_WARNING
+#if EIGEN_COMP_GNUC
+#define EIGEN_DEPRECATED_WITH_REASON(message) __attribute__((deprecated(message)))
+#elif EIGEN_COMP_MSVC
+#define EIGEN_DEPRECATED_WITH_REASON(message) __declspec(deprecated(message))
+#else
+#define EIGEN_DEPRECATED_WITH_REASON(message)
+#endif
+#else
+#define EIGEN_DEPRECATED_WITH_REASON(message)
+#endif
+
 #if EIGEN_COMP_GNUC
 #define EIGEN_UNUSED __attribute__((unused))
 #else
@@ -964,9 +996,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 // added then subtracted, which is otherwise compiled away with -ffast-math.
 //
 // See bug 1674
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_OPTIMIZATION_BARRIER(X)
+#endif
+
 #if !defined(EIGEN_OPTIMIZATION_BARRIER)
-#if EIGEN_COMP_GNUC
-   // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
+// Implement the barrier on GNUC compilers or clang-cl.
+#if EIGEN_COMP_GNUC || (defined(__clang__) && defined(_MSC_VER))
+// According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
 //   X: Any operand whatsoever.
 //   r: A register operand is allowed provided that it is in a general
 //      register.
@@ -999,37 +1036,37 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
 // you will need to apply to the underlying POD type.
 #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
-   // This seems to be broken on clang. Packet4f is loaded into a single
+// This seems to be broken on clang. Packet4f is loaded into a single
 //   register rather than a vector, zeroing out some entries. Integer
 //   types also generate a compile error.
 #if EIGEN_OS_MAC
-   // General, Altivec for Apple (VSX were added in ISA v2.06):
+// General, Altivec for Apple (VSX were added in ISA v2.06):
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v"(X));
 #else
-   // General, Altivec, VSX otherwise:
+// General, Altivec, VSX otherwise:
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v,wa"(X));
 #endif
 #elif EIGEN_ARCH_ARM_OR_ARM64
 #ifdef __ARM_FP
-   // General, VFP or NEON.
+// General, VFP or NEON.
 // Clang doesn't like "r",
 //    error: non-trivial scalar-to-vector conversion, possible invalid
 //           constraint for vector typ
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,w"(X));
 #else
-   // Arm without VFP or NEON.
+// Arm without VFP or NEON.
 // "w" constraint will not compile.
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g"(X));
 #endif
 #elif EIGEN_ARCH_i386_OR_x86_64
-   // General, SSE.
+// General, SSE.
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,x"(X));
 #else
-   // Not implemented for other architectures.
+// Not implemented for other architectures.
 #define EIGEN_OPTIMIZATION_BARRIER(X)
 #endif
 #else
-   // Not implemented for other compilers.
+// Not implemented for other compilers.
 #define EIGEN_OPTIMIZATION_BARRIER(X)
 #endif
 #endif
@@ -1072,14 +1109,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 #define EIGEN_USING_STD(FUNC) using std::FUNC;
 #endif
 
-#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_NVCC
-// Wwhen compiling with NVCC, using the base operator is necessary,
-//   otherwise we get duplicate definition errors
-// For later MSVC versions, we require explicit operator= definition, otherwise we get
-//   use of implicitly deleted operator errors.
-// (cf Bugs 920, 1000, 1324, 2291)
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) using Base::operator=;
-#elif EIGEN_COMP_CLANG  // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+#if EIGEN_COMP_CLANG  // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)                                           \
   using Base::operator=;                                                                           \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) {                 \
@@ -1256,11 +1286,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 #define EIGEN_CATCH(X) else
 #endif
 
-#define EIGEN_NOEXCEPT noexcept
-#define EIGEN_NOEXCEPT_IF(x) noexcept(x)
-#define EIGEN_NO_THROW noexcept(true)
-#define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
-
 // The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
 namespace Eigen {
 namespace internal {
diff --git a/Eigen/src/Core/util/MaxSizeVector.h b/Eigen/src/Core/util/MaxSizeVector.h
index 2f1e3d3..db5bb89 100644
--- a/Eigen/src/Core/util/MaxSizeVector.h
+++ b/Eigen/src/Core/util/MaxSizeVector.h
@@ -13,7 +13,7 @@
 namespace Eigen {
 
 /** \class MaxSizeVector
- * \ingroup Core
+ * \ingroup Core_Module
  *
  * \brief The MaxSizeVector class.
  *
@@ -116,17 +116,17 @@ class MaxSizeVector {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool empty() const { return size_ == 0; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return data_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return data_; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return data_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return data_; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* begin() { return data_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* begin() { return data_; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* end() { return data_ + size_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* end() { return data_ + size_; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* begin() const { return data_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* begin() const { return data_; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* end() const { return data_ + size_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* end() const { return data_ + size_; }
 
  private:
   size_t reserve_;
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 6253454..d6c09a3 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -141,23 +141,24 @@ EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc() {
  */
 EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size,
                                                        std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) {
-  eigen_assert(alignment >= sizeof(void*) && alignment <= 128 && (alignment & (alignment - 1)) == 0 &&
-               "Alignment must be at least sizeof(void*), less than or equal to 128, and a power of 2");
+  eigen_assert(alignment >= sizeof(void*) && alignment <= 256 && (alignment & (alignment - 1)) == 0 &&
+               "Alignment must be at least sizeof(void*), less than or equal to 256, and a power of 2");
 
   check_that_malloc_is_allowed();
   EIGEN_USING_STD(malloc)
   void* original = malloc(size + alignment);
-  if (original == 0) return 0;
-  uint8_t offset = static_cast<uint8_t>(alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1)));
+  if (original == nullptr) return nullptr;
+  std::size_t offset = alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1));
   void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
-  *(static_cast<uint8_t*>(aligned) - 1) = offset;
+  // Store offset - 1: offset is in [1, alignment] and alignment <= 256 (asserted above), so it fits in one byte.
+  *(static_cast<uint8_t*>(aligned) - 1) = static_cast<uint8_t>(offset - 1);
   return aligned;
 }
 
 /** \internal Frees memory allocated with handmade_aligned_malloc */
 EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void* ptr) {
   if (ptr != nullptr) {
-    uint8_t offset = static_cast<uint8_t>(*(static_cast<uint8_t*>(ptr) - 1));
+    std::size_t offset = static_cast<std::size_t>(*(static_cast<uint8_t*>(ptr) - 1)) + 1;
     void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset);
 
     check_that_malloc_is_allowed();
@@ -174,7 +175,7 @@ EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void* ptr) {
 EIGEN_DEVICE_FUNC inline void* handmade_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size,
                                                         std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) {
   if (ptr == nullptr) return handmade_aligned_malloc(new_size, alignment);
-  uint8_t old_offset = *(static_cast<uint8_t*>(ptr) - 1);
+  std::size_t old_offset = static_cast<std::size_t>(*(static_cast<uint8_t*>(ptr) - 1)) + 1;
   void* old_original = static_cast<uint8_t*>(ptr) - old_offset;
 
   check_that_malloc_is_allowed();
@@ -182,14 +183,15 @@ EIGEN_DEVICE_FUNC inline void* handmade_aligned_realloc(void* ptr, std::size_t n
   void* original = realloc(old_original, new_size + alignment);
   if (original == nullptr) return nullptr;
   if (original == old_original) return ptr;
-  uint8_t offset = static_cast<uint8_t>(alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1)));
+  std::size_t offset = alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1));
   void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
   if (offset != old_offset) {
     const void* src = static_cast<const void*>(static_cast<uint8_t*>(original) + old_offset);
     std::size_t count = (std::min)(new_size, old_size);
     std::memmove(aligned, src, count);
   }
-  *(static_cast<uint8_t*>(aligned) - 1) = offset;
+  // Store offset - 1 (offset is in [1, alignment], so never 0); readers add 1 back when decoding the header byte.
+  *(static_cast<uint8_t*>(aligned) - 1) = static_cast<uint8_t>(offset - 1);
   return aligned;
 }
 
@@ -391,7 +393,8 @@ EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T* ptr, T* src, std
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size) {
-  if (size > std::size_t(-1) / sizeof(T)) throw_std_bad_alloc();
+  constexpr std::size_t max_elements = (std::numeric_limits<std::ptrdiff_t>::max)() / sizeof(T);
+  if (size > max_elements) throw_std_bad_alloc();
 }
 
 /** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment.
@@ -473,7 +476,7 @@ EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t
 
 template <typename T, bool Align>
 EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size) {
-  if (size == 0) return 0;  // short-cut. Also fixes Bug 884
+  if (size == 0) return nullptr;  // short-cut. Also fixes Bug 884
   check_size_for_overflow<T>(size);
   T* result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T) * size));
   if (NumTraits<T>::RequireInitialization) {
@@ -759,27 +762,37 @@ void swap(scoped_array<T>& a, scoped_array<T>& b) {
  * This is accomplished through alloca if this later is supported and if the required number of bytes
  * is below EIGEN_STACK_ALLOCATION_LIMIT.
  */
-#ifdef EIGEN_ALLOCA
+#if defined(EIGEN_ALLOCA) && !defined(EIGEN_NO_ALLOCA)
 
 #if EIGEN_DEFAULT_ALIGN_BYTES > 0
-   // We always manually re-align the result of EIGEN_ALLOCA.
+// We always manually re-align the result of EIGEN_ALLOCA.
 // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
-#define EIGEN_ALIGNED_ALLOCA(SIZE)                                                                           \
-  reinterpret_cast<void*>(                                                                                   \
-      (std::uintptr_t(EIGEN_ALLOCA(SIZE + EIGEN_DEFAULT_ALIGN_BYTES - 1)) + EIGEN_DEFAULT_ALIGN_BYTES - 1) & \
-      ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES - 1)))
+
+#if ((EIGEN_COMP_GNUC || EIGEN_COMP_CLANG) && !EIGEN_COMP_NVHPC)
+#define EIGEN_ALIGNED_ALLOCA(SIZE) __builtin_alloca_with_align(SIZE, CHAR_BIT* EIGEN_DEFAULT_ALIGN_BYTES)
+#else
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* eigen_aligned_alloca_helper(void* ptr) {
+  constexpr std::uintptr_t mask = EIGEN_DEFAULT_ALIGN_BYTES - 1;
+  std::uintptr_t ptr_int = std::uintptr_t(ptr);
+  std::uintptr_t aligned_ptr_int = (ptr_int + mask) & ~mask;
+  std::uintptr_t offset = aligned_ptr_int - ptr_int;
+  return static_cast<void*>(static_cast<uint8_t*>(ptr) + offset);
+}
+#define EIGEN_ALIGNED_ALLOCA(SIZE) eigen_aligned_alloca_helper(EIGEN_ALLOCA(SIZE + EIGEN_DEFAULT_ALIGN_BYTES - 1))
+#endif
+
 #else
 #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
 #endif
 
-#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                     \
-  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                             \
-  TYPE* NAME = (BUFFER) != 0 ? (BUFFER)                                                                             \
-                             : reinterpret_cast<TYPE*>((sizeof(TYPE) * SIZE <= EIGEN_STACK_ALLOCATION_LIMIT)        \
-                                                           ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * SIZE)              \
-                                                           : Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
-  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                    \
-      (BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * SIZE > EIGEN_STACK_ALLOCATION_LIMIT)
+#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                       \
+  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                               \
+  TYPE* NAME = (BUFFER) != 0 ? (BUFFER)                                                                               \
+                             : reinterpret_cast<TYPE*>((sizeof(TYPE) * (SIZE) <= EIGEN_STACK_ALLOCATION_LIMIT)        \
+                                                           ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * (SIZE))              \
+                                                           : Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
+  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                      \
+      (BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * (SIZE) > EIGEN_STACK_ALLOCATION_LIMIT)
 
 #define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME)                                        \
   Eigen::internal::local_nested_eval_wrapper<XPR_T, N> EIGEN_CAT(NAME, _wrapper)(                \
@@ -792,10 +805,11 @@ void swap(scoped_array<T>& a, scoped_array<T>& b) {
 
 #else
 
-#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                        \
-  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                                \
-  TYPE* NAME = (BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * SIZE)); \
-  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                       \
+#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                 \
+  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                         \
+  TYPE* NAME =                                                                                                  \
+      (BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
+  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                \
       (BUFFER) == 0 ? NAME : 0, SIZE, true)
 
 #define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME) \
@@ -820,46 +834,44 @@ void swap(scoped_array<T>& a, scoped_array<T>& b) {
 
 // HIP does not support new/delete on device.
 #if EIGEN_MAX_ALIGN_BYTES != 0 && !defined(EIGEN_HIP_DEVICE_COMPILE)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                                    \
-  EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
-    EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); }        \
-    EIGEN_CATCH(...) { return 0; }                                                               \
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                              \
+  EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) noexcept { \
+    EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); }  \
+    EIGEN_CATCH(...) { return 0; }                                                         \
   }
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)                                                             \
-  EIGEN_DEVICE_FUNC void* operator new(std::size_t size) {                                                           \
-    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                          \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) {                                                         \
-    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                          \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete(void* ptr) EIGEN_NO_THROW {                                                 \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete[](void* ptr) EIGEN_NO_THROW {                                               \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) EIGEN_NO_THROW {                           \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) EIGEN_NO_THROW {                         \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
-  /* in-place new and delete. since (at least afaik) there is no actual   */                                         \
-  /* memory allocated we can safely let the default implementation handle */                                         \
-  /* this particular case. */                                                                                        \
-  EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); }     \
-  EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); } \
-  EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) EIGEN_NO_THROW {                                   \
-    return ::operator delete(memory, ptr);                                                                           \
-  }                                                                                                                  \
-  EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) EIGEN_NO_THROW {                                 \
-    return ::operator delete[](memory, ptr);                                                                         \
-  }                                                                                                                  \
-  /* nothrow-new (returns zero instead of std::bad_alloc) */                                                         \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                                                              \
-  EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) EIGEN_NO_THROW {                          \
-    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                    \
-  }                                                                                                                  \
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)                                                              \
+  EIGEN_DEVICE_FUNC void* operator new(std::size_t size) {                                                            \
+    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                           \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) {                                                          \
+    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                           \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr) noexcept {                                                        \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete[](void* ptr) noexcept {                                                      \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) noexcept {                                  \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) noexcept {                                \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  /* in-place new and delete. since (at least afaik) there is no actual   */                                          \
+  /* memory allocated we can safely let the default implementation handle */                                          \
+  /* this particular case. */                                                                                         \
+  EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); }      \
+  EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); }  \
+  EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) noexcept { return ::operator delete(memory, ptr); } \
+  EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) noexcept {                                        \
+    return ::operator delete[](memory, ptr);                                                                          \
+  }                                                                                                                   \
+  /* nothrow-new (returns zero instead of std::bad_alloc) */                                                          \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                                                               \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) noexcept {                                 \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
   typedef void eigen_aligned_operator_new_marker_type;
 #else
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
@@ -902,7 +914,7 @@ void swap(scoped_array<T>& a, scoped_array<T>& b) {
  * \sa \blank \ref TopicStlContainers.
  */
 template <class T>
-class aligned_allocator : public std::allocator<T> {
+class aligned_allocator {
  public:
   typedef std::size_t size_type;
   typedef std::ptrdiff_t difference_type;
@@ -917,14 +929,21 @@ class aligned_allocator : public std::allocator<T> {
     typedef aligned_allocator<U> other;
   };
 
-  aligned_allocator() : std::allocator<T>() {}
+  aligned_allocator() = default;
 
-  aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
+  aligned_allocator(const aligned_allocator&) = default;
 
   template <class U>
-  aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
+  aligned_allocator(const aligned_allocator<U>&) {}
 
-  ~aligned_allocator() {}
+  template <class U>
+  constexpr bool operator==(const aligned_allocator<U>&) const noexcept {
+    return true;
+  }
+  template <class U>
+  constexpr bool operator!=(const aligned_allocator<U>&) const noexcept {
+    return false;
+  }
 
 #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_STRICT_AT_LEAST(7, 0, 0)
   // In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
@@ -1296,7 +1315,8 @@ inline int queryTopLevelCacheSize() {
  * This wraps C++20's std::construct_at, using placement new instead if it is not available.
  */
 
-#if EIGEN_COMP_CXXVER >= 20
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_constexpr_dynamic_alloc) && \
+    __cpp_lib_constexpr_dynamic_alloc >= 201907L
 using std::construct_at;
 #else
 template <class T, class... Args>
@@ -1319,6 +1339,21 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) {
 }
 #endif
 
+/** \internal
+ * This informs the implementation that PTR is aligned to at least ALIGN_BYTES (a byte count, which is
+ * what both std::assume_aligned<N> and __builtin_assume_aligned expect; do not convert it to bits). */
+#ifndef EIGEN_ASSUME_ALIGNED
+#if defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L)
+#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \
+  { PTR = std::assume_aligned<(ALIGN_BYTES)>(PTR); }
+#elif EIGEN_HAS_BUILTIN(__builtin_assume_aligned)
+#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \
+  { PTR = static_cast<decltype(PTR)>(__builtin_assume_aligned(PTR, (ALIGN_BYTES))); }
+#else
+#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) /* do nothing */
+#endif
+#endif
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index d2336ce..ddbc898 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -79,7 +79,6 @@ typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
  * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
  * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
  */
-
 typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
 
 namespace internal {
@@ -91,12 +90,8 @@ namespace internal {
  * we however don't want to add a dependency to Boost.
  */
 
-struct true_type {
-  enum { value = 1 };
-};
-struct false_type {
-  enum { value = 0 };
-};
+using std::false_type;
+using std::true_type;
 
 template <bool Condition>
 struct bool_constant;
@@ -221,7 +216,7 @@ struct is_void : is_same<void, std::remove_const_t<T>> {};
  *
  * Post C++17: Uses std::void_t
  */
-#if EIGEN_COMP_CXXVER >= 17
+#if EIGEN_COMP_CXXVER >= 17 && defined(__cpp_lib_void_t) && __cpp_lib_void_t >= 201411L
 using std::void_t;
 #else
 template <typename...>
@@ -339,24 +334,27 @@ struct array_size<std::array<T, N>> {
  *
  * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function.
  */
-#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_STRICT_LESS_THAN(10, 0, 0)
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L
+
+template <typename T>
+constexpr auto index_list_size(T&& x) {
+  using std::ssize;
+  return ssize(std::forward<T>(x));
+}
+
+#else
+
 template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(const T& x) {
+constexpr auto index_list_size(const T& x) {
   using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
   return static_cast<R>(x.size());
 }
 
 template <typename T, std::ptrdiff_t N>
-EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) {
+constexpr std::ptrdiff_t index_list_size(const T (&)[N]) {
   return N;
 }
-#else
-template <typename T>
-EIGEN_CONSTEXPR auto index_list_size(T&& x) {
-  using std::ssize;
-  return ssize(std::forward<T>(x));
-}
-#endif  // EIGEN_COMP_CXXVER
+#endif
 
 /** \internal
  * Convenient struct to get the result type of a nullary, unary, binary, or
@@ -638,19 +636,23 @@ EIGEN_STRONG_INLINE bool is_identically_zero(const Scalar& s) {
 template <typename A>
 constexpr bool is_int_or_enum_v = std::is_enum<A>::value || std::is_integral<A>::value;
 
-/// \internal Gets the minimum of two values which may be integers or enums
 template <typename A, typename B>
-inline constexpr int plain_enum_min(A a, B b) {
+constexpr void plain_enum_asserts(A, B) {
   static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
   static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
+}
+
+/// \internal Gets the minimum of two values which may be integers or enums
+template <typename A, typename B>
+constexpr int plain_enum_min(A a, B b) {
+  plain_enum_asserts(a, b);
   return ((int)a <= (int)b) ? (int)a : (int)b;
 }
 
 /// \internal Gets the maximum of two values which may be integers or enums
 template <typename A, typename B>
-inline constexpr int plain_enum_max(A a, B b) {
-  static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
-  static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
+constexpr int plain_enum_max(A a, B b) {
+  plain_enum_asserts(a, b);
   return ((int)a >= (int)b) ? (int)a : (int)b;
 }
 
@@ -661,9 +663,8 @@ inline constexpr int plain_enum_max(A a, B b) {
  *  finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
  */
 template <typename A, typename B>
-inline constexpr int min_size_prefer_dynamic(A a, B b) {
-  static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
-  static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
+constexpr int min_size_prefer_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
   if ((int)a == 0 || (int)b == 0) return 0;
   if ((int)a == 1 || (int)b == 1) return 1;
   if ((int)a == Dynamic || (int)b == Dynamic) return Dynamic;
@@ -677,9 +678,8 @@ inline constexpr int min_size_prefer_dynamic(A a, B b) {
  * 0 and 3), it is not more than 3.
  */
 template <typename A, typename B>
-inline constexpr int min_size_prefer_fixed(A a, B b) {
-  static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
-  static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
+constexpr int min_size_prefer_fixed(A a, B b) {
+  plain_enum_asserts(a, b);
   if ((int)a == 0 || (int)b == 0) return 0;
   if ((int)a == 1 || (int)b == 1) return 1;
   if ((int)a == Dynamic && (int)b == Dynamic) return Dynamic;
@@ -690,26 +690,69 @@ inline constexpr int min_size_prefer_fixed(A a, B b) {
 
 /// \internal see `min_size_prefer_fixed`. No need for a separate variant for MaxSizes here.
 template <typename A, typename B>
-inline constexpr int max_size_prefer_dynamic(A a, B b) {
-  static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
-  static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
+constexpr int max_size_prefer_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
   if ((int)a == Dynamic || (int)b == Dynamic) return Dynamic;
   return plain_enum_max(a, b);
 }
 
+template <typename A, typename B>
+inline constexpr int size_prefer_fixed(A a, B b) {
+  plain_enum_asserts(a, b);
+  return int(a) == Dynamic ? int(b) : int(a);
+}
+
+template <typename A, typename B>
+inline constexpr bool enum_eq_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a == (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_lt_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a < (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_le_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a <= (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_gt_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a > (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_ge_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a >= (int)b;
+}
+
 /// \internal Calculate logical XOR at compile time
-inline constexpr bool logical_xor(bool a, bool b) { return a != b; }
+constexpr bool logical_xor(bool a, bool b) { return a != b; }
 
 /// \internal Calculate logical IMPLIES at compile time
-inline constexpr bool check_implication(bool a, bool b) { return !a || b; }
+constexpr bool check_implication(bool a, bool b) { return !a || b; }
 
 /// \internal Provide fallback for std::is_constant_evaluated for pre-C++20.
-#if EIGEN_COMP_CXXVER >= 20
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
 using std::is_constant_evaluated;
 #else
 constexpr bool is_constant_evaluated() { return false; }
 #endif
 
+template <typename Scalar>
+using make_complex_t = std::conditional_t<NumTraits<Scalar>::IsComplex, Scalar, std::complex<Scalar>>;
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/util/MoreMeta.h b/Eigen/src/Core/util/MoreMeta.h
index 2d4aeee..6823bca 100644
--- a/Eigen/src/Core/util/MoreMeta.h
+++ b/Eigen/src/Core/util/MoreMeta.h
@@ -40,6 +40,7 @@ struct numeric_list<T, n, nn...> {
   static constexpr T first_value = n;
 };
 
+// Doxygen doesn't like the recursive definition of gen_numeric_list.
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 /* numeric list constructors
  *
@@ -53,6 +54,7 @@ struct numeric_list<T, n, nn...> {
 
 template <typename T, std::size_t n, T start = 0, T... ii>
 struct gen_numeric_list : gen_numeric_list<T, n - 1, start, start + n - 1, ii...> {};
+
 template <typename T, T start, T... ii>
 struct gen_numeric_list<T, 0, start, ii...> {
   typedef numeric_list<T, ii...> type;
@@ -80,6 +82,10 @@ template <typename T, T V, T... nn>
 struct gen_numeric_list_repeated<T, 0, V, nn...> {
   typedef numeric_list<T, nn...> type;
 };
+#else
+template <typename T, std::size_t n, T start = 0, T... ii>
+struct gen_numeric_list;
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
 /* list manipulation: concatenate */
 
@@ -110,16 +116,20 @@ struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
 
 template <int n, typename x>
 struct take;
+
 template <int n, typename a, typename... as>
 struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n - 1, type_list<as...>>::type> {};
+
 template <int n>
 struct take<n, type_list<>> {
   typedef type_list<> type;
 };
+
 template <typename a, typename... as>
 struct take<0, type_list<a, as...>> {
   typedef type_list<> type;
 };
+
 template <>
 struct take<0, type_list<>> {
   typedef type_list<> type;
@@ -128,13 +138,12 @@ struct take<0, type_list<>> {
 template <typename T, int n, T a, T... as>
 struct take<n, numeric_list<T, a, as...>>
     : concat<numeric_list<T, a>, typename take<n - 1, numeric_list<T, as...>>::type> {};
-// XXX The following breaks in gcc-11, and is invalid anyways.
-// template<typename T, int n>               struct take<n, numeric_list<T>>           { typedef numeric_list<T> type;
-// };
+
 template <typename T, T a, T... as>
 struct take<0, numeric_list<T, a, as...>> {
   typedef numeric_list<T> type;
 };
+
 template <typename T>
 struct take<0, numeric_list<T>> {
   typedef numeric_list<T> type;
@@ -173,7 +182,6 @@ template <>
 struct h_skip_helper_type<0> {
   typedef type_list<> type;
 };
-#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
 template <int n>
 struct h_skip {
diff --git a/Eigen/src/Core/util/ReshapedHelper.h b/Eigen/src/Core/util/ReshapedHelper.h
index e569408..1747950 100644
--- a/Eigen/src/Core/util/ReshapedHelper.h
+++ b/Eigen/src/Core/util/ReshapedHelper.h
@@ -40,7 +40,7 @@ struct get_compiletime_reshape_size<AutoSize_t, OtherSize, TotalSize> {
 
 inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { return total / other; }
 
-constexpr inline int get_compiletime_reshape_order(int flags, int order) {
+constexpr int get_compiletime_reshape_order(int flags, int order) {
   return order == AutoOrder ? flags & RowMajorBit : order;
 }
 
diff --git a/Eigen/src/Core/util/Serializer.h b/Eigen/src/Core/util/Serializer.h
index 1e12820..dc3bd13 100644
--- a/Eigen/src/Core/util/Serializer.h
+++ b/Eigen/src/Core/util/Serializer.h
@@ -28,7 +28,8 @@ class Serializer;
 
 // Specialization for POD types.
 template <typename T>
-class Serializer<T, typename std::enable_if_t<std::is_trivial<T>::value && std::is_standard_layout<T>::value>> {
+class Serializer<T,
+                 typename std::enable_if_t<std::is_trivially_copyable<T>::value && std::is_standard_layout<T>::value>> {
  public:
   /**
    * Determines the required size of the serialization buffer for a value.
diff --git a/Eigen/src/Core/util/SymbolicIndex.h b/Eigen/src/Core/util/SymbolicIndex.h
index 136942c..dc204af 100644
--- a/Eigen/src/Core/util/SymbolicIndex.h
+++ b/Eigen/src/Core/util/SymbolicIndex.h
@@ -44,6 +44,8 @@ namespace symbolic {
 
 template <typename Tag>
 class Symbol;
+template <typename Tag, typename Type>
+class SymbolValue;
 template <typename Arg0>
 class NegateExpr;
 template <typename Arg1, typename Arg2>
@@ -52,136 +54,123 @@ template <typename Arg1, typename Arg2>
 class ProductExpr;
 template <typename Arg1, typename Arg2>
 class QuotientExpr;
-
-// A simple wrapper around an integral value to provide the eval method.
-// We could also use a free-function symbolic_eval...
 template <typename IndexType = Index>
-class ValueExpr {
- public:
-  ValueExpr(IndexType val) : m_value(val) {}
-  template <typename T>
-  IndexType eval_impl(const T&) const {
-    return m_value;
-  }
-
- protected:
-  IndexType m_value;
-};
-
-// Specialization for compile-time value,
-// It is similar to ValueExpr(N) but this version helps the compiler to generate better code.
-template <int N>
-class ValueExpr<internal::FixedInt<N> > {
- public:
-  ValueExpr() {}
-  template <typename T>
-  EIGEN_CONSTEXPR Index eval_impl(const T&) const {
-    return N;
-  }
-};
+class ValueExpr;
 
 /** \class BaseExpr
  * \ingroup Core_Module
  * Common base class of any symbolic expressions
  */
-template <typename Derived>
+template <typename Derived_>
 class BaseExpr {
  public:
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  using Derived = Derived_;
+  constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
   /** Evaluate the expression given the \a values of the symbols.
    *
-   * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue
-   *               as constructed by SymbolExpr::operator= operator.
+   * \param values defines the values of the symbols, as constructed by SymbolExpr::operator= operator.
    *
    */
-  template <typename T>
-  Index eval(const T& values) const {
-    return derived().eval_impl(values);
+  template <typename... Tags, typename... Types>
+  constexpr Index eval(const SymbolValue<Tags, Types>&... values) const {
+    return derived().eval_impl(values...);
   }
 
-  template <typename... Types>
-  Index eval(Types&&... values) const {
-    return derived().eval_impl(std::make_tuple(values...));
+  /** Evaluate the expression at compile time given the \a values of the symbols.
+   *
+   * If a value is not known at compile-time, returns Eigen::Undefined.
+   *
+   */
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time(const SymbolValue<Tags, Types>&...) {
+    return Derived::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
   }
 
-  NegateExpr<Derived> operator-() const { return NegateExpr<Derived>(derived()); }
+  constexpr NegateExpr<Derived> operator-() const { return NegateExpr<Derived>(derived()); }
 
-  AddExpr<Derived, ValueExpr<> > operator+(Index b) const { return AddExpr<Derived, ValueExpr<> >(derived(), b); }
-  AddExpr<Derived, ValueExpr<> > operator-(Index a) const { return AddExpr<Derived, ValueExpr<> >(derived(), -a); }
-  ProductExpr<Derived, ValueExpr<> > operator*(Index a) const {
-    return ProductExpr<Derived, ValueExpr<> >(derived(), a);
+  constexpr AddExpr<Derived, ValueExpr<>> operator+(Index b) const {
+    return AddExpr<Derived, ValueExpr<>>(derived(), b);
+  }
+  constexpr AddExpr<Derived, ValueExpr<>> operator-(Index a) const {
+    return AddExpr<Derived, ValueExpr<>>(derived(), -a);
   }
-  QuotientExpr<Derived, ValueExpr<> > operator/(Index a) const {
-    return QuotientExpr<Derived, ValueExpr<> >(derived(), a);
+  constexpr ProductExpr<Derived, ValueExpr<>> operator*(Index a) const {
+    return ProductExpr<Derived, ValueExpr<>>(derived(), a);
+  }
+  constexpr QuotientExpr<Derived, ValueExpr<>> operator/(Index a) const {
+    return QuotientExpr<Derived, ValueExpr<>>(derived(), a);
   }
 
-  friend AddExpr<Derived, ValueExpr<> > operator+(Index a, const BaseExpr& b) {
-    return AddExpr<Derived, ValueExpr<> >(b.derived(), a);
+  friend constexpr AddExpr<Derived, ValueExpr<>> operator+(Index a, const BaseExpr& b) {
+    return AddExpr<Derived, ValueExpr<>>(b.derived(), a);
   }
-  friend AddExpr<NegateExpr<Derived>, ValueExpr<> > operator-(Index a, const BaseExpr& b) {
-    return AddExpr<NegateExpr<Derived>, ValueExpr<> >(-b.derived(), a);
+  friend constexpr AddExpr<NegateExpr<Derived>, ValueExpr<>> operator-(Index a, const BaseExpr& b) {
+    return AddExpr<NegateExpr<Derived>, ValueExpr<>>(-b.derived(), a);
   }
-  friend ProductExpr<ValueExpr<>, Derived> operator*(Index a, const BaseExpr& b) {
+  friend constexpr ProductExpr<ValueExpr<>, Derived> operator*(Index a, const BaseExpr& b) {
     return ProductExpr<ValueExpr<>, Derived>(a, b.derived());
   }
-  friend QuotientExpr<ValueExpr<>, Derived> operator/(Index a, const BaseExpr& b) {
+  friend constexpr QuotientExpr<ValueExpr<>, Derived> operator/(Index a, const BaseExpr& b) {
     return QuotientExpr<ValueExpr<>, Derived>(a, b.derived());
   }
 
   template <int N>
-  AddExpr<Derived, ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>) const {
-    return AddExpr<Derived, ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >());
+  constexpr AddExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator+(internal::FixedInt<N>) const {
+    return AddExpr<Derived, ValueExpr<internal::FixedInt<N>>>(derived(), ValueExpr<internal::FixedInt<N>>());
   }
   template <int N>
-  AddExpr<Derived, ValueExpr<internal::FixedInt<-N> > > operator-(internal::FixedInt<N>) const {
-    return AddExpr<Derived, ValueExpr<internal::FixedInt<-N> > >(derived(), ValueExpr<internal::FixedInt<-N> >());
+  constexpr AddExpr<Derived, ValueExpr<internal::FixedInt<-N>>> operator-(internal::FixedInt<N>) const {
+    return AddExpr<Derived, ValueExpr<internal::FixedInt<-N>>>(derived(), ValueExpr<internal::FixedInt<-N>>());
   }
   template <int N>
-  ProductExpr<Derived, ValueExpr<internal::FixedInt<N> > > operator*(internal::FixedInt<N>) const {
-    return ProductExpr<Derived, ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >());
+  constexpr ProductExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator*(internal::FixedInt<N>) const {
+    return ProductExpr<Derived, ValueExpr<internal::FixedInt<N>>>(derived(), ValueExpr<internal::FixedInt<N>>());
   }
   template <int N>
-  QuotientExpr<Derived, ValueExpr<internal::FixedInt<N> > > operator/(internal::FixedInt<N>) const {
-    return QuotientExpr<Derived, ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >());
+  constexpr QuotientExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator/(internal::FixedInt<N>) const {
+    return QuotientExpr<Derived, ValueExpr<internal::FixedInt<N>>>(derived(), ValueExpr<internal::FixedInt<N>>());
   }
 
   template <int N>
-  friend AddExpr<Derived, ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>, const BaseExpr& b) {
-    return AddExpr<Derived, ValueExpr<internal::FixedInt<N> > >(b.derived(), ValueExpr<internal::FixedInt<N> >());
+  friend constexpr AddExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator+(internal::FixedInt<N>,
+                                                                                const BaseExpr& b) {
+    return AddExpr<Derived, ValueExpr<internal::FixedInt<N>>>(b.derived(), ValueExpr<internal::FixedInt<N>>());
   }
   template <int N>
-  friend AddExpr<NegateExpr<Derived>, ValueExpr<internal::FixedInt<N> > > operator-(internal::FixedInt<N>,
-                                                                                    const BaseExpr& b) {
-    return AddExpr<NegateExpr<Derived>, ValueExpr<internal::FixedInt<N> > >(-b.derived(),
-                                                                            ValueExpr<internal::FixedInt<N> >());
+  friend constexpr AddExpr<NegateExpr<Derived>, ValueExpr<internal::FixedInt<N>>> operator-(internal::FixedInt<N>,
+                                                                                            const BaseExpr& b) {
+    return AddExpr<NegateExpr<Derived>, ValueExpr<internal::FixedInt<N>>>(-b.derived(),
+                                                                          ValueExpr<internal::FixedInt<N>>());
   }
   template <int N>
-  friend ProductExpr<ValueExpr<internal::FixedInt<N> >, Derived> operator*(internal::FixedInt<N>, const BaseExpr& b) {
-    return ProductExpr<ValueExpr<internal::FixedInt<N> >, Derived>(ValueExpr<internal::FixedInt<N> >(), b.derived());
+  friend constexpr ProductExpr<ValueExpr<internal::FixedInt<N>>, Derived> operator*(internal::FixedInt<N>,
+                                                                                    const BaseExpr& b) {
+    return ProductExpr<ValueExpr<internal::FixedInt<N>>, Derived>(ValueExpr<internal::FixedInt<N>>(), b.derived());
   }
   template <int N>
-  friend QuotientExpr<ValueExpr<internal::FixedInt<N> >, Derived> operator/(internal::FixedInt<N>, const BaseExpr& b) {
-    return QuotientExpr<ValueExpr<internal::FixedInt<N> >, Derived>(ValueExpr<internal::FixedInt<N> >(), b.derived());
+  friend constexpr QuotientExpr<ValueExpr<internal::FixedInt<N>>, Derived> operator/(internal::FixedInt<N>,
+                                                                                     const BaseExpr& b) {
+    return QuotientExpr<ValueExpr<internal::FixedInt<N>>, Derived>(ValueExpr<internal::FixedInt<N>>(), b.derived());
   }
 
   template <typename OtherDerived>
-  AddExpr<Derived, OtherDerived> operator+(const BaseExpr<OtherDerived>& b) const {
+  constexpr AddExpr<Derived, OtherDerived> operator+(const BaseExpr<OtherDerived>& b) const {
     return AddExpr<Derived, OtherDerived>(derived(), b.derived());
   }
 
   template <typename OtherDerived>
-  AddExpr<Derived, NegateExpr<OtherDerived> > operator-(const BaseExpr<OtherDerived>& b) const {
-    return AddExpr<Derived, NegateExpr<OtherDerived> >(derived(), -b.derived());
+  constexpr AddExpr<Derived, NegateExpr<OtherDerived>> operator-(const BaseExpr<OtherDerived>& b) const {
+    return AddExpr<Derived, NegateExpr<OtherDerived>>(derived(), -b.derived());
   }
 
   template <typename OtherDerived>
-  ProductExpr<Derived, OtherDerived> operator*(const BaseExpr<OtherDerived>& b) const {
+  constexpr ProductExpr<Derived, OtherDerived> operator*(const BaseExpr<OtherDerived>& b) const {
     return ProductExpr<Derived, OtherDerived>(derived(), b.derived());
   }
 
   template <typename OtherDerived>
-  QuotientExpr<Derived, OtherDerived> operator/(const BaseExpr<OtherDerived>& b) const {
+  constexpr QuotientExpr<Derived, OtherDerived> operator/(const BaseExpr<OtherDerived>& b) const {
     return QuotientExpr<Derived, OtherDerived>(derived(), b.derived());
   }
 };
@@ -190,59 +179,190 @@ template <typename T>
 struct is_symbolic {
   // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class
   // BaseExpr<T>.
-  enum { value = internal::is_convertible<T, BaseExpr<T> >::value };
+  enum { value = internal::is_convertible<T, BaseExpr<T>>::value };
+};
+
+// A simple wrapper around an integral value to provide the eval method.
+// We could also use a free-function symbolic_eval...
+template <typename IndexType>
+class ValueExpr : BaseExpr<ValueExpr<IndexType>> {
+ public:
+  constexpr ValueExpr() = default;
+  constexpr ValueExpr(IndexType val) : value_(val) {}
+  template <typename... Tags, typename... Types>
+  constexpr IndexType eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return value_;
+  }
+  template <typename... Tags, typename... Types>
+  static constexpr IndexType eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return IndexType(Undefined);
+  }
+
+ protected:
+  IndexType value_;
+};
+
+// Specialization for compile-time value,
+// It is similar to ValueExpr(N) but this version helps the compiler to generate better code.
+template <int N>
+class ValueExpr<internal::FixedInt<N>> : public BaseExpr<ValueExpr<internal::FixedInt<N>>> {
+ public:
+  constexpr ValueExpr() = default;
+  constexpr ValueExpr(internal::FixedInt<N>) {}
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return Index(N);
+  }
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return Index(N);
+  }
 };
 
 /** Represents the actual value of a symbol identified by its tag
  *
  * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used.
  */
+template <typename Tag, typename Type>
+class SymbolValue : public BaseExpr<SymbolValue<Tag, Type>> {};
+
 template <typename Tag>
-class SymbolValue {
+class SymbolValue<Tag, Index> : public BaseExpr<SymbolValue<Tag, Index>> {
  public:
+  constexpr SymbolValue() = default;
+
   /** Default constructor from the value \a val */
-  SymbolValue(Index val) : m_value(val) {}
+  constexpr SymbolValue(Index val) : value_(val) {}
 
   /** \returns the stored value of the symbol */
-  Index value() const { return m_value; }
+  constexpr Index value() const { return value_; }
+
+  /** \returns the stored value of the symbol at compile time, or Undefined if not known. */
+  static constexpr Index value_at_compile_time() { return Index(Undefined); }
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return value();
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return value_at_compile_time();
+  }
 
  protected:
-  Index m_value;
+  Index value_;
+};
+
+template <typename Tag, int N>
+class SymbolValue<Tag, internal::FixedInt<N>> : public BaseExpr<SymbolValue<Tag, internal::FixedInt<N>>> {
+ public:
+  constexpr SymbolValue() = default;
+
+  /** Constructor from a compile-time value; the FixedInt<N> argument only selects N and stores nothing. */
+  constexpr SymbolValue(internal::FixedInt<N>) {}
+
+  /** \returns the stored value of the symbol */
+  constexpr Index value() const { return static_cast<Index>(N); }
+
+  /** \returns the stored value of the symbol at compile time, or Undefined if not known. */
+  static constexpr Index value_at_compile_time() { return static_cast<Index>(N); }
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return value();
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return value_at_compile_time();
+  }
+};
+
+// Find and return a symbol value based on the tag.
+template <typename Tag, typename... Types>
+struct EvalSymbolValueHelper;
+
+// Empty base case, symbol not found.
+template <typename Tag>
+struct EvalSymbolValueHelper<Tag> {
+  static constexpr Index eval_impl() {
+    eigen_assert(false && "Symbol not found.");
+    return Index(Undefined);
+  }
+  static constexpr Index eval_at_compile_time_impl() { return Index(Undefined); }
+};
+
+// We found a symbol value matching the provided Tag!
+template <typename Tag, typename Type, typename... OtherTypes>
+struct EvalSymbolValueHelper<Tag, SymbolValue<Tag, Type>, OtherTypes...> {
+  static constexpr Index eval_impl(const SymbolValue<Tag, Type>& symbol, const OtherTypes&...) {
+    return symbol.value();
+  }
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tag, Type>& symbol, const OtherTypes&...) {
+    return symbol.value_at_compile_time();
+  }
+};
+
+// No symbol value in first value, recursive search starting with next.
+template <typename Tag, typename T1, typename... OtherTypes>
+struct EvalSymbolValueHelper<Tag, T1, OtherTypes...> {
+  static constexpr Index eval_impl(const T1&, const OtherTypes&... values) {
+    return EvalSymbolValueHelper<Tag, OtherTypes...>::eval_impl(values...);
+  }
+  static constexpr Index eval_at_compile_time_impl(const T1&, const OtherTypes&...) {
+    return EvalSymbolValueHelper<Tag, OtherTypes...>::eval_at_compile_time_impl(OtherTypes{}...);
+  }
 };
 
 /** Expression of a symbol uniquely identified by the template parameter type \c tag */
 template <typename tag>
-class SymbolExpr : public BaseExpr<SymbolExpr<tag> > {
+class SymbolExpr : public BaseExpr<SymbolExpr<tag>> {
  public:
   /** Alias to the template parameter \c tag */
   typedef tag Tag;
 
-  SymbolExpr() {}
+  constexpr SymbolExpr() = default;
 
   /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag.
    *
    * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified
    * runtime-time value.
    */
-  SymbolValue<Tag> operator=(Index val) const { return SymbolValue<Tag>(val); }
+  constexpr SymbolValue<Tag, Index> operator=(Index val) const { return SymbolValue<Tag, Index>(val); }
+
+  template <int N>
+  constexpr SymbolValue<Tag, internal::FixedInt<N>> operator=(internal::FixedInt<N>) const {
+    return SymbolValue<Tag, internal::FixedInt<N>>{internal::FixedInt<N>{}};
+  }
 
-  Index eval_impl(const SymbolValue<Tag>& values) const { return values.value(); }
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return EvalSymbolValueHelper<Tag, SymbolValue<Tags, Types>...>::eval_impl(values...);
+  }
 
-  // C++14 versions suitable for multiple symbols
-  template <typename... Types>
-  Index eval_impl(const std::tuple<Types...>& values) const {
-    return std::get<SymbolValue<Tag> >(values).value();
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return EvalSymbolValueHelper<Tag, SymbolValue<Tags, Types>...>::eval_at_compile_time_impl(
+        SymbolValue<Tags, Types>{}...);
   }
 };
 
 template <typename Arg0>
-class NegateExpr : public BaseExpr<NegateExpr<Arg0> > {
+class NegateExpr : public BaseExpr<NegateExpr<Arg0>> {
  public:
-  NegateExpr(const Arg0& arg0) : m_arg0(arg0) {}
+  constexpr NegateExpr() = default;
+  constexpr NegateExpr(const Arg0& arg0) : m_arg0(arg0) {}
 
-  template <typename T>
-  Index eval_impl(const T& values) const {
-    return -m_arg0.eval_impl(values);
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return -m_arg0.eval_impl(values...);
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v == Undefined) ? Undefined : -v;
   }
 
  protected:
@@ -250,13 +370,21 @@ class NegateExpr : public BaseExpr<NegateExpr<Arg0> > {
 };
 
 template <typename Arg0, typename Arg1>
-class AddExpr : public BaseExpr<AddExpr<Arg0, Arg1> > {
+class AddExpr : public BaseExpr<AddExpr<Arg0, Arg1>> {
  public:
-  AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+  constexpr AddExpr() = default;
+  constexpr AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return m_arg0.eval_impl(values...) + m_arg1.eval_impl(values...);
+  }
 
-  template <typename T>
-  Index eval_impl(const T& values) const {
-    return m_arg0.eval_impl(values) + m_arg1.eval_impl(values);
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v0 = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    constexpr Index v1 = Arg1::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v0 == Undefined || v1 == Undefined) ? Undefined : v0 + v1;
   }
 
  protected:
@@ -265,13 +393,21 @@ class AddExpr : public BaseExpr<AddExpr<Arg0, Arg1> > {
 };
 
 template <typename Arg0, typename Arg1>
-class ProductExpr : public BaseExpr<ProductExpr<Arg0, Arg1> > {
+class ProductExpr : public BaseExpr<ProductExpr<Arg0, Arg1>> {
  public:
-  ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+  constexpr ProductExpr() = default;
+  constexpr ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return m_arg0.eval_impl(values...) * m_arg1.eval_impl(values...);
+  }
 
-  template <typename T>
-  Index eval_impl(const T& values) const {
-    return m_arg0.eval_impl(values) * m_arg1.eval_impl(values);
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v0 = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    constexpr Index v1 = Arg1::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v0 == Undefined || v1 == Undefined) ? Undefined : v0 * v1;
   }
 
  protected:
@@ -280,13 +416,21 @@ class ProductExpr : public BaseExpr<ProductExpr<Arg0, Arg1> > {
 };
 
 template <typename Arg0, typename Arg1>
-class QuotientExpr : public BaseExpr<QuotientExpr<Arg0, Arg1> > {
+class QuotientExpr : public BaseExpr<QuotientExpr<Arg0, Arg1>> {
  public:
-  QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+  constexpr QuotientExpr() = default;
+  constexpr QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return m_arg0.eval_impl(values...) / m_arg1.eval_impl(values...);
+  }
 
-  template <typename T>
-  Index eval_impl(const T& values) const {
-    return m_arg0.eval_impl(values) / m_arg1.eval_impl(values);
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v0 = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    constexpr Index v1 = Arg1::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v0 == Undefined || v1 == Undefined) ? Undefined : v0 / v1;
   }
 
  protected:
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 555faa1..a0e160e 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -158,8 +158,8 @@ class variable_if_dynamic {
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
   }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR operator T() const { return T(Value); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr operator T() const { return T(Value); }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T v) const {
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
@@ -171,7 +171,7 @@ class variable_if_dynamic<T, Dynamic> {
   T m_value;
 
  public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) noexcept : m_value(value) {}
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
@@ -186,7 +186,7 @@ class variable_if_dynamicindex {
     EIGEN_ONLY_USED_FOR_DEBUG(v);
     eigen_assert(v == T(Value));
   }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
@@ -206,6 +206,64 @@ struct functor_traits {
   enum { Cost = 10, PacketAccess = false, IsRepeatable = false };
 };
 
+// estimates the cost of lazily evaluating a generic functor by unwinding the expression
+template <typename Xpr>
+struct nested_functor_cost {
+  static constexpr Index Cost = static_cast<Index>(functor_traits<Xpr>::Cost);
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
+};
+
+// TODO: assign a cost to the stride type?
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct nested_functor_cost<Map<PlainObjectType, MapOptions, StrideType>> : nested_functor_cost<PlainObjectType> {};
+
+template <typename Func, typename Xpr>
+struct nested_functor_cost<CwiseUnaryOp<Func, Xpr>> {
+  using XprCleaned = remove_all_t<Xpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
+
+template <typename Func, typename Xpr>
+struct nested_functor_cost<CwiseNullaryOp<Func, Xpr>> {
+  using XprCleaned = remove_all_t<Xpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
+
+template <typename Func, typename LhsXpr, typename RhsXpr>
+struct nested_functor_cost<CwiseBinaryOp<Func, LhsXpr, RhsXpr>> {
+  using LhsXprCleaned = remove_all_t<LhsXpr>;
+  using RhsXprCleaned = remove_all_t<RhsXpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<RhsXprCleaned>::Cost;
+};
+
+template <typename Func, typename LhsXpr, typename MidXpr, typename RhsXpr>
+struct nested_functor_cost<CwiseTernaryOp<Func, LhsXpr, MidXpr, RhsXpr>> {
+  using LhsXprCleaned = remove_all_t<LhsXpr>;
+  using MidXprCleaned = remove_all_t<MidXpr>;
+  using RhsXprCleaned = remove_all_t<RhsXpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<MidXprCleaned>::Cost + nested_functor_cost<RhsXprCleaned>::Cost;
+};
+
+template <typename Xpr>
+struct functor_cost {
+  static constexpr Index Cost = plain_enum_max(nested_functor_cost<Xpr>::Cost, 1);
+};
+
 template <typename T>
 struct packet_traits;
 
@@ -257,7 +315,7 @@ struct find_packet_by_size<T, 1> {
 };
 
 #if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
-constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
   if ((ArrayBytes % AlignmentBytes) == 0) {
     return AlignmentBytes;
   } else if (EIGEN_MIN_ALIGN_BYTES < AlignmentBytes) {
@@ -269,7 +327,7 @@ constexpr inline int compute_default_alignment_helper(int ArrayBytes, int Alignm
 #else
 // If static alignment is disabled, no need to bother.
 // This also avoids a division by zero
-constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
   EIGEN_UNUSED_VARIABLE(ArrayBytes);
   EIGEN_UNUSED_VARIABLE(AlignmentBytes);
   return 0;
@@ -304,7 +362,7 @@ class make_proper_matrix_type {
   typedef Matrix<Scalar_, Rows_, Cols_, Options, MaxRows_, MaxCols_> type;
 };
 
-constexpr inline unsigned compute_matrix_flags(int Options) {
+constexpr unsigned compute_matrix_flags(int Options) {
   unsigned row_major_bit = Options & RowMajor ? RowMajorBit : 0;
   // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
   // and then propagate this information to the evaluator's flags.
@@ -312,7 +370,7 @@ constexpr inline unsigned compute_matrix_flags(int Options) {
   return DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit;
 }
 
-constexpr inline int size_at_compile_time(int rows, int cols) {
+constexpr int size_at_compile_time(int rows, int cols) {
   if (rows == 0 || cols == 0) return 0;
   if (rows == Dynamic || cols == Dynamic) return Dynamic;
   return rows * cols;
@@ -827,8 +885,12 @@ struct scalar_div_cost {
 };
 
 template <typename T, bool Vectorized>
-struct scalar_div_cost<std::complex<T>, Vectorized> {
-  enum { value = 2 * scalar_div_cost<T>::value + 6 * NumTraits<T>::MulCost + 3 * NumTraits<T>::AddCost };
+struct scalar_div_cost<T, Vectorized, std::enable_if_t<NumTraits<T>::IsComplex>> {
+  using RealScalar = typename NumTraits<T>::Real;
+  enum {
+    value =
+        2 * scalar_div_cost<RealScalar>::value + 6 * NumTraits<RealScalar>::MulCost + 3 * NumTraits<RealScalar>::AddCost
+  };
 };
 
 template <bool Vectorized>
@@ -928,6 +990,12 @@ template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
 struct block_xpr_helper<const Block<XprType, BlockRows, BlockCols, InnerPanel>>
     : block_xpr_helper<Block<XprType, BlockRows, BlockCols, InnerPanel>> {};
 
+template <typename XprType>
+struct is_matrix_base_xpr : std::is_base_of<MatrixBase<remove_all_t<XprType>>, remove_all_t<XprType>> {};
+
+template <typename XprType>
+struct is_permutation_base_xpr : std::is_base_of<PermutationBase<remove_all_t<XprType>>, remove_all_t<XprType>> {};
+
 }  // end namespace internal
 
 /** \class ScalarBinaryOpTraits
diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index 60a24a8..50fa3b8 100644
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -70,7 +70,7 @@ class ComplexEigenSolver {
    * \c float or \c double) and just \c Scalar if #Scalar is
    * complex.
    */
-  typedef std::complex<RealScalar> ComplexScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
 
   /** \brief Type for vector of eigenvalues as returned by eigenvalues().
    *
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index a33e46e..22433f2 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -75,7 +75,7 @@ class ComplexSchur {
    * \c float or \c double) and just \c Scalar if #Scalar is
    * complex.
    */
-  typedef std::complex<RealScalar> ComplexScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
 
   /** \brief Type for the matrices in the Schur decomposition.
    *
diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index 40830fb..9dba7bd 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -89,7 +89,7 @@ class EigenSolver {
    * \c float or \c double) and just \c Scalar if #Scalar is
    * complex.
    */
-  typedef std::complex<RealScalar> ComplexScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
 
   /** \brief Type for vector of eigenvalues as returned by eigenvalues().
    *
@@ -319,17 +319,24 @@ template <typename MatrixType>
 MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
   const RealScalar precision = RealScalar(2) * NumTraits<RealScalar>::epsilon();
-  Index n = m_eivalues.rows();
+  const Index n = m_eivalues.rows();
   MatrixType matD = MatrixType::Zero(n, n);
-  for (Index i = 0; i < n; ++i) {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i)), precision))
-      matD.coeffRef(i, i) = numext::real(m_eivalues.coeff(i));
-    else {
-      matD.template block<2, 2>(i, i) << numext::real(m_eivalues.coeff(i)), numext::imag(m_eivalues.coeff(i)),
-          -numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i));
+  Index i = 0;
+  for (; i < n - 1; ++i) {
+    RealScalar real = numext::real(m_eivalues.coeff(i));
+    RealScalar imag = numext::imag(m_eivalues.coeff(i));
+    matD.coeffRef(i, i) = real;
+    if (!internal::isMuchSmallerThan(imag, real, precision)) {
+      matD.coeffRef(i, i + 1) = imag;
+      matD.coeffRef(i + 1, i) = -imag;
+      matD.coeffRef(i + 1, i + 1) = real;
       ++i;
     }
   }
+  if (i == n - 1) {
+    matD.coeffRef(i, i) = numext::real(m_eivalues.coeff(i));
+  }
+
   return matD;
 }
 
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index 08f1e34..c0a61dc 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -83,7 +83,7 @@ class GeneralizedEigenSolver {
    * \c float or \c double) and just \c Scalar if #Scalar is
    * complex.
    */
-  typedef std::complex<RealScalar> ComplexScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
 
   /** \brief Type for vector of real scalar values eigenvalues as returned by betas().
    *
@@ -161,7 +161,7 @@ class GeneralizedEigenSolver {
     compute(A, B, computeEigenvectors);
   }
 
-  /* \brief Returns the computed generalized eigenvectors.
+  /** \brief Returns the computed generalized eigenvectors.
    *
    * \returns  %Matrix whose columns are the (possibly complex) right eigenvectors.
    * i.e. the eigenvectors that solve (A - l*B)x = 0. The ordering matches the eigenvalues.
@@ -361,7 +361,7 @@ GeneralizedEigenSolver<MatrixType>& GeneralizedEigenSolver<MatrixType>::compute(
           // Compute eigenvector in position (i+1) and then position (i) is just the conjugate
           cv.setZero();
           cv.coeffRef(i + 1) = Scalar(1.0);
-          // here, the "static_cast" workaound expression template issues.
+          // here, the "static_cast" works around expression template issues.
           cv.coeffRef(i) = -(static_cast<Scalar>(beta * mS.coeffRef(i, i + 1)) - alpha * mT.coeffRef(i, i + 1)) /
                            (static_cast<Scalar>(beta * mS.coeffRef(i, i)) - alpha * mT.coeffRef(i, i));
           for (Index j = i - 1; j >= 0; j--) {
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index 3466f51..a54d82d 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -69,7 +69,7 @@ class RealQZ {
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
   typedef typename MatrixType::Scalar Scalar;
-  typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
   typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
 
   typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
@@ -449,23 +449,23 @@ inline void RealQZ<MatrixType>::step(Index f, Index l, Index iter) {
       Index lr = (std::min)(k + 4, dim);  // last row to update
       Map<Matrix<Scalar, Dynamic, 1> > tmp(m_workspace.data(), lr);
       // S
-      tmp = m_S.template middleCols<2>(k).topRows(lr) * essential2;
+      tmp.noalias() = m_S.template middleCols<2>(k).topRows(lr) * essential2;
       tmp += m_S.col(k + 2).head(lr);
       m_S.col(k + 2).head(lr) -= tau * tmp;
-      m_S.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint();
+      m_S.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
       // T
       tmp = m_T.template middleCols<2>(k).topRows(lr) * essential2;
       tmp += m_T.col(k + 2).head(lr);
       m_T.col(k + 2).head(lr) -= tau * tmp;
-      m_T.template middleCols<2>(k).topRows(lr) -= (tau * tmp) * essential2.adjoint();
+      m_T.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
     }
     if (m_computeQZ) {
       // Z
       Map<Matrix<Scalar, 1, Dynamic> > tmp(m_workspace.data(), dim);
-      tmp = essential2.adjoint() * (m_Z.template middleRows<2>(k));
+      tmp.noalias() = essential2.adjoint() * (m_Z.template middleRows<2>(k));
       tmp += m_Z.row(k + 2);
       m_Z.row(k + 2) -= tau * tmp;
-      m_Z.template middleRows<2>(k) -= essential2 * (tau * tmp);
+      m_Z.template middleRows<2>(k).noalias() -= essential2 * (tau * tmp);
     }
     m_T.coeffRef(k + 2, k) = m_T.coeffRef(k + 2, k + 1) = Scalar(0.0);
 
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index 5cef658..94bc34d 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -66,7 +66,7 @@ class RealSchur {
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
   typedef typename MatrixType::Scalar Scalar;
-  typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
   typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
 
   typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
@@ -409,7 +409,7 @@ inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& ex
   shiftInfo.coeffRef(2) = m_matT.coeff(iu, iu - 1) * m_matT.coeff(iu - 1, iu);
 
   // Alternate exceptional shifting strategy every 16 iterations.
-  if (iter % 16 == 0) {
+  if (iter > 0 && iter % 16 == 0) {
     // Wilkinson's original ad hoc shift
     if (iter % 32 != 0) {
       exshift += shiftInfo.coeff(0);
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 9bb791d..9cc9201 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -345,7 +345,7 @@ EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorT
 
     // Apply similarity transformation to remaining columns,
     // i.e., A = H A H' where H = I - h v v' and v = matA.col(i).tail(n-i-1)
-    matA.col(i).coeffRef(i + 1) = 1;
+    matA.col(i).coeffRef(i + 1) = Scalar(1);
 
     hCoeffs.tail(n - i - 1).noalias() =
         (matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>() *
@@ -379,6 +379,8 @@ struct tridiagonalization_inplace_selector;
  *    decomposition.
  * \param[out]  subdiag  The subdiagonal of the tridiagonal matrix T in
  *    the decomposition.
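+ * \param[out]  hcoeffs  Storage for the Householder coefficients; some fixed-size specializations ignore it.
+ * \param[out]  workspace  Temporary storage used while computing Q; some fixed-size specializations ignore it.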
  * \param[in]  extractQ  If true, the orthogonal matrix Q in the
  *    decomposition is computed and stored in \p mat.
  *
@@ -445,8 +447,8 @@ struct tridiagonalization_inplace_selector<MatrixType, 3, false> {
   typedef typename MatrixType::RealScalar RealScalar;
 
   template <typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType, typename WorkSpaceType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, WorkSpaceType&,
-                  bool extractQ) {
+  static EIGEN_DEVICE_FUNC void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&,
+                                    WorkSpaceType&, bool extractQ) {
     using std::sqrt;
     const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
     diag[0] = mat(0, 0);
@@ -513,8 +515,8 @@ struct TridiagonalizationMatrixTReturnType : public ReturnByValue<Tridiagonaliza
     result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
   }
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
  protected:
   typename MatrixType::Nested m_matrix;
diff --git a/Eigen/src/Geometry/EulerAngles.h b/Eigen/src/Geometry/EulerAngles.h
index ad6b821..366a32c 100644
--- a/Eigen/src/Geometry/EulerAngles.h
+++ b/Eigen/src/Geometry/EulerAngles.h
@@ -133,8 +133,8 @@ EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> Matr
  * \sa class AngleAxis
  */
 template <typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>
-MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const {
+EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> MatrixBase<Derived>::eulerAngles(
+    Index a0, Index a1, Index a2) const {
   /* Implemented from Graphics Gems IV */
   EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
 
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 64c1b65..4159dc6 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -69,10 +69,10 @@ class Homogeneous : public MatrixBase<Homogeneous<MatrixType, Direction_> >, int
 
   EIGEN_DEVICE_FUNC explicit inline Homogeneous(const MatrixType& matrix) : m_matrix(matrix) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
     return m_matrix.rows() + (int(Direction) == Vertical ? 1 : 0);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept {
     return m_matrix.cols() + (int(Direction) == Horizontal ? 1 : 0);
   }
 
@@ -80,14 +80,12 @@ class Homogeneous : public MatrixBase<Homogeneous<MatrixType, Direction_> >, int
 
   template <typename Rhs>
   EIGEN_DEVICE_FUNC inline const Product<Homogeneous, Rhs> operator*(const MatrixBase<Rhs>& rhs) const {
-    eigen_assert(int(Direction) == Horizontal);
     return Product<Homogeneous, Rhs>(*this, rhs.derived());
   }
 
   template <typename Lhs>
   friend EIGEN_DEVICE_FUNC inline const Product<Lhs, Homogeneous> operator*(const MatrixBase<Lhs>& lhs,
                                                                             const Homogeneous& rhs) {
-    eigen_assert(int(Direction) == Vertical);
     return Product<Lhs, Homogeneous>(lhs.derived(), rhs);
   }
 
@@ -244,8 +242,8 @@ struct homogeneous_left_product_impl<Homogeneous<MatrixType, Vertical>, Lhs>
   EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
       : m_lhs(take_matrix_for_product<Lhs>::run(lhs)), m_rhs(rhs) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   template <typename Dest>
   EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
@@ -275,8 +273,8 @@ struct homogeneous_right_product_impl<Homogeneous<MatrixType, Horizontal>, Rhs>
   typedef remove_all_t<typename Rhs::Nested> RhsNested;
   EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   template <typename Dest>
   EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h
index a8e0502..fc708ee 100644
--- a/Eigen/src/Geometry/OrthoMethods.h
+++ b/Eigen/src/Geometry/OrthoMethods.h
@@ -78,27 +78,25 @@ struct cross_impl<Derived, OtherDerived, 2> {
  * spanned by the two vectors.
  *
  * \note With complex numbers, the cross product is implemented as
- * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times
- * \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} + \mathbf{b} \times \mathbf{c})\f$
+ * \f[ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times
+ * \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} + \mathbf{b} \times \mathbf{c}).\f]
+ * This definition preserves the orthogonality condition that \f$\mathbf{u} \cdot (\mathbf{u} \times \mathbf{v}) =
+ * \mathbf{v} \cdot (\mathbf{u} \times \mathbf{v}) = 0\f$.
  *
  * \sa MatrixBase::cross3()
  */
 template <typename Derived>
 template <typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    typename internal::cross_impl<Derived, OtherDerived>::return_type
-#else
-    inline std::conditional_t<SizeAtCompileTime == 2, Scalar, PlainObject>
-#endif
-    MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::cross_impl<Derived, OtherDerived>::return_type
+MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const {
   return internal::cross_impl<Derived, OtherDerived>::run(*this, other);
 }
 
 namespace internal {
 
 template <int Arch, typename VectorLhs, typename VectorRhs, typename Scalar = typename VectorLhs::Scalar,
-          bool Vectorizable = bool((evaluator<VectorLhs>::Flags & evaluator<VectorRhs>::Flags) & PacketAccessBit)>
+          bool Vectorizable =
+              bool((int(evaluator<VectorLhs>::Flags) & int(evaluator<VectorRhs>::Flags)) & PacketAccessBit)>
 struct cross3_impl {
   EIGEN_DEVICE_FUNC static inline typename internal::plain_matrix_type<VectorLhs>::type run(const VectorLhs& lhs,
                                                                                             const VectorRhs& rhs) {
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 8931c4a..f2d2d05 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -57,22 +57,22 @@ class QuaternionBase : public RotationBase<Derived, 3> {
   typedef AngleAxis<Scalar> AngleAxisType;
 
   /** \returns the \c x coefficient */
-  EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
   /** \returns the \c y coefficient */
-  EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
   /** \returns the \c z coefficient */
-  EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
   /** \returns the \c w coefficient */
-  EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
 
   /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
   /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
   /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
   /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
-  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
 
   /** \returns a read-only vector expression of the imaginary part (x,y,z) */
   EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients, 3> vec() const { return coeffs().template head<3>(); }
@@ -85,6 +85,29 @@ class QuaternionBase : public RotationBase<Derived, 3> {
     return derived().coeffs();
   }
 
+  /** \returns a vector containing the coefficients, rearranged into the order [\c w, \c x, \c y, \c z].
+   *
+   * This is the order expected by the \code Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar&
+   * z) \endcode constructor, but not the order of the internal vector representation. Therefore, it returns a newly
+   * constructed vector.
+   *
+   * \sa QuaternionBase::coeffsScalarLast()
+   * */
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarFirst() const {
+    return derived().coeffsScalarFirst();
+  }
+
+  /** \returns a vector containing the coefficients in their original order [\c x, \c y, \c z, \c w].
+   *
+   * This is equivalent to \code coeffs() \endcode, but returns a newly constructed vector for uniformity with \code
+   * coeffsScalarFirst() \endcode.
+   *
+   * \sa QuaternionBase::coeffsScalarFirst()
+   * */
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarLast() const {
+    return derived().coeffsScalarLast();
+  }
+
   /** \returns a vector expression of the coefficients (x,y,z,w) */
   EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
 
@@ -346,25 +369,34 @@ class Quaternion : public QuaternionBase<Quaternion<Scalar_, Options_> > {
 
   // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
   /** Default move constructor */
-  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other)
-      EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) noexcept(std::is_nothrow_move_constructible<Scalar>::value)
       : m_coeffs(std::move(other.coeffs())) {}
 
   /** Default move assignment operator */
-  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other)
-      EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) {
+  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
     m_coeffs = std::move(other.coeffs());
     return *this;
   }
 
   EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
 
+  EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarLast(const Scalar& x, const Scalar& y, const Scalar& z,
+                                                           const Scalar& w);
+
+  EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarFirst(const Scalar& w, const Scalar& x, const Scalar& y,
+                                                            const Scalar& z);
+
   template <typename Derived1, typename Derived2>
   EIGEN_DEVICE_FUNC static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
   EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
   EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
+
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
+    return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
+  }
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment))
 
 #ifdef EIGEN_QUATERNION_PLUGIN
@@ -439,6 +471,12 @@ class Map<const Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<const
 
   EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
+
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
+    return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
+  }
+
  protected:
   const Coefficients m_coeffs;
 };
@@ -475,6 +513,12 @@ class Map<Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<Quaternion<
   EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
   EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
+
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
+    return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
+  }
+
  protected:
   Coefficients m_coeffs;
 };
@@ -696,6 +740,35 @@ EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::UnitR
   return Quaternion(a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
 }
 
+/** Constructs a quaternion from its coefficients in the order [\c x, \c y, \c z, \c w], i.e. vector part [\c x, \c y,
+ * \c z] first, scalar part \a w LAST.
+ *
+ * This factory accepts the parameters in the same order as the underlying coefficient vector. Consider using this
+ * factory function to make the parameter ordering explicit.
+ */
+template <typename Scalar, int Options>
+EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarLast(const Scalar& x,
+                                                                                                const Scalar& y,
+                                                                                                const Scalar& z,
+                                                                                                const Scalar& w) {
+  return Quaternion(w, x, y, z);
+}
+
+/** Constructs a quaternion from its coefficients in the order [\c w, \c x, \c y, \c z], i.e. scalar part \a w FIRST,
+ * vector part [\c x, \c y, \c z] last.
+ *
+ * This factory accepts the parameters in the same order as the constructor \code Quaternion(const Scalar& w, const
+ * Scalar& x, const Scalar& y, const Scalar& z) \endcode. Consider using this factory function to make the parameter
+ * ordering explicit.
+ */
+template <typename Scalar, int Options>
+EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarFirst(const Scalar& w,
+                                                                                                 const Scalar& x,
+                                                                                                 const Scalar& y,
+                                                                                                 const Scalar& z) {
+  return Quaternion(w, x, y, z);
+}
+
 /** Returns a quaternion representing a rotation between
  * the two arbitrary vectors \a a and \a b. In other words, the built
  * rotation represent a rotation sending the line of direction \a a
@@ -793,7 +866,7 @@ EIGEN_DEVICE_FUNC Quaternion<typename internal::traits<Derived>::Scalar> Quatern
   } else {
     // theta is the angle between the 2 quaternions
     Scalar theta = acos(absD);
-    Scalar sinTheta = sin(theta);
+    Scalar sinTheta = numext::sqrt(Scalar(1) - absD * absD);
 
     scale0 = sin((Scalar(1) - t) * theta) / sinTheta;
     scale1 = sin((t * theta)) / sinTheta;
diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h
index eb94b52..5918025 100644
--- a/Eigen/src/Geometry/Rotation2D.h
+++ b/Eigen/src/Geometry/Rotation2D.h
@@ -60,7 +60,7 @@ class Rotation2D : public RotationBase<Rotation2D<Scalar_>, 2> {
   /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
   EIGEN_DEVICE_FUNC explicit inline Rotation2D(const Scalar& a) : m_angle(a) {}
 
-  /** Default constructor wihtout initialization. The represented rotation is undefined. */
+  /** Default constructor without initialization. The represented rotation is undefined. */
   EIGEN_DEVICE_FUNC Rotation2D() {}
 
   /** Construct a 2D rotation from a 2x2 rotation matrix \a mat.
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index fd3fc58..a5d7b60 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -353,10 +353,10 @@ class Transform {
   inline QTransform toQTransform(void) const;
 #endif
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
     return int(Mode) == int(Projective) ? m_matrix.cols() : (m_matrix.cols() - 1);
   }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** shortcut for m_matrix(row,col);
    * \sa MatrixBase::operator(Index,Index) const */
@@ -588,9 +588,9 @@ class Transform {
   EIGEN_DEVICE_FUNC inline Transform inverse(TransformTraits traits = (TransformTraits)Mode) const;
 
   /** \returns a const pointer to the column major internal matrix */
-  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_matrix.data(); }
   /** \returns a non-const pointer to the column major internal matrix */
-  EIGEN_DEVICE_FUNC Scalar* data() { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() { return m_matrix.data(); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
    *
@@ -1059,11 +1059,11 @@ EIGEN_DEVICE_FUNC void Transform<Scalar, Dim, Mode, Options>::computeRotationSca
                  : Scalar(1);  // so x has absolute value 1
   VectorType sv(svd.singularValues());
   sv.coeffRef(Dim - 1) *= x;
-  if (scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
+  if (scaling) (*scaling).noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
   if (rotation) {
     LinearMatrixType m(svd.matrixU());
     m.col(Dim - 1) *= x;
-    *rotation = m * svd.matrixV().adjoint();
+    (*rotation).noalias() = m * svd.matrixV().adjoint();
   }
 }
 
@@ -1182,7 +1182,8 @@ EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options> Transform<Scalar, Dim, M
       eigen_assert(false && "Invalid transform traits in Transform::Inverse");
     }
     // translation and remaining parts
-    res.matrix().template topRightCorner<Dim, 1>() = -res.matrix().template topLeftCorner<Dim, Dim>() * translation();
+    res.matrix().template topRightCorner<Dim, 1>().noalias() =
+        -res.matrix().template topLeftCorner<Dim, Dim>() * translation();
     res.makeAffine();  // we do need this, because in the beginning res is uninitialized
   }
   return res;
@@ -1432,7 +1433,7 @@ struct transform_transform_product_impl<Transform<Scalar, Dim, LhsMode, LhsOptio
   typedef Transform<Scalar, Dim, ResultMode, LhsOptions> ResultType;
   static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) {
     ResultType res;
-    res.linear() = lhs.linear() * rhs.linear();
+    res.linear().noalias() = lhs.linear() * rhs.linear();
     res.translation() = lhs.linear() * rhs.translation() + lhs.translation();
     res.makeAffine();
     return res;
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 956ef56..d942ac8 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -69,18 +69,18 @@ class Translation {
   EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
 
   /** \brief Returns the x-translation by value. **/
-  EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar x() const { return m_coeffs.x(); }
   /** \brief Returns the y-translation by value. **/
-  EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar y() const { return m_coeffs.y(); }
   /** \brief Returns the z-translation by value. **/
-  EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar z() const { return m_coeffs.z(); }
 
   /** \brief Returns the x-translation as a reference. **/
-  EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar& x() { return m_coeffs.x(); }
   /** \brief Returns the y-translation as a reference. **/
-  EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar& y() { return m_coeffs.y(); }
   /** \brief Returns the z-translation as a reference. **/
-  EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar& z() { return m_coeffs.z(); }
 
   EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
   EIGEN_DEVICE_FUNC VectorType& vector() { return m_coeffs; }
diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h
index f8138b9..8ed6344 100644
--- a/Eigen/src/Geometry/Umeyama.h
+++ b/Eigen/src/Geometry/Umeyama.h
@@ -21,8 +21,6 @@
 
 namespace Eigen {
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
 // These helpers are required since it allows to use mixed types as parameters
 // for the Umeyama. The problem with mixed parameters is that the return type
 // cannot trivially be deduced when float and double types are mixed.
@@ -50,8 +48,6 @@ struct umeyama_transform_matrix_type {
 
 }  // namespace internal
 
-#endif
-
 /**
  * \geometry_module \ingroup Geometry_Module
  *
diff --git a/Eigen/src/Geometry/arch/Geometry_SIMD.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h
index ce3cfea..5601a47 100644
--- a/Eigen/src/Geometry/arch/Geometry_SIMD.h
+++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h
@@ -62,16 +62,21 @@ struct quat_conj<Architecture::Target, Derived, float> {
 
 template <typename VectorLhs, typename VectorRhs>
 struct cross3_impl<Architecture::Target, VectorLhs, VectorRhs, float, true> {
-  enum { ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment };
-  static inline typename plain_matrix_type<VectorLhs>::type run(const VectorLhs& lhs, const VectorRhs& rhs) {
+  using DstPlainType = typename plain_matrix_type<VectorLhs>::type;
+  static constexpr int DstAlignment = evaluator<DstPlainType>::Alignment;
+  static constexpr int LhsAlignment = evaluator<VectorLhs>::Alignment;
+  static constexpr int RhsAlignment = evaluator<VectorRhs>::Alignment;
+  static inline DstPlainType run(const VectorLhs& lhs, const VectorRhs& rhs) {
     evaluator<VectorLhs> lhs_eval(lhs);
     evaluator<VectorRhs> rhs_eval(rhs);
-    Packet4f a = lhs_eval.template packet<traits<VectorLhs>::Alignment, Packet4f>(0);
-    Packet4f b = rhs_eval.template packet<traits<VectorRhs>::Alignment, Packet4f>(0);
+    Packet4f a = lhs_eval.template packet<LhsAlignment, Packet4f>(0);
+    Packet4f b = rhs_eval.template packet<RhsAlignment, Packet4f>(0);
     Packet4f mul1 = pmul(vec4f_swizzle1(a, 1, 2, 0, 3), vec4f_swizzle1(b, 2, 0, 1, 3));
     Packet4f mul2 = pmul(vec4f_swizzle1(a, 2, 0, 1, 3), vec4f_swizzle1(b, 1, 2, 0, 3));
-    typename plain_matrix_type<VectorLhs>::type res;
-    pstoret<float, Packet4f, ResAlignment>(&res.x(), psub(mul1, mul2));
+    DstPlainType res;
+    pstoret<float, Packet4f, DstAlignment>(res.data(), psub(mul1, mul2));
+    // Ensure last component is 0 in case original a or b contain inf/nan.
+    res[3] = 0.0f;
     return res;
   }
 };
diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 1d6cc1c..8b92304 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -35,7 +35,7 @@ namespace internal {
 //     // Warning, note that hCoeffs may alias with vectors.
 //     // It is then necessary to copy it before modifying vectors(i,i).
 //     typename CoeffsType::Scalar h = hCoeffs(i);
-//     // This hack permits to pass trough nested Block<> and Transpose<> expressions.
+//     // This hack permits to pass through nested Block<> and Transpose<> expressions.
 //     Scalar *Vii_ptr = const_cast<Scalar*>(vectors.data() + vectors.outerStride()*i + vectors.innerStride()*i);
 //     Scalar Vii = *Vii_ptr;
 //     *Vii_ptr = Scalar(1);
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 024c4a4..d49c961 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -183,7 +183,7 @@ class HouseholderSequence : public EigenBase<HouseholderSequence<VectorsType, Co
    * \returns Number of rows
    * \details This equals the dimension of the space that the transformation acts on.
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
     return Side == OnTheLeft ? m_vectors.rows() : m_vectors.cols();
   }
 
@@ -191,7 +191,7 @@ class HouseholderSequence : public EigenBase<HouseholderSequence<VectorsType, Co
    * \returns Number of columns
    * \details This equals the dimension of the space that the transformation acts on.
    */
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return rows(); }
 
   /** \brief Essential part of a Householder vector.
    * \param[in]  k  Index of Householder reflection
@@ -476,7 +476,8 @@ typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar, O
   return res;
 }
 
-/** \ingroup Householder_Module \householder_module
+/** \ingroup Householder_Module
+ * \householder_module
  * \brief Convenience function for constructing a Householder sequence.
  * \returns A HouseholderSequence constructed from the specified arguments.
  */
@@ -485,7 +486,8 @@ HouseholderSequence<VectorsType, CoeffsType> householderSequence(const VectorsTy
   return HouseholderSequence<VectorsType, CoeffsType, OnTheLeft>(v, h);
 }
 
-/** \ingroup Householder_Module \householder_module
+/** \ingroup Householder_Module
+ * \householder_module
  * \brief Convenience function for constructing a Householder sequence.
  * \returns A HouseholderSequence constructed from the specified arguments.
  * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 0beef60..904d853 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -51,8 +51,8 @@ class DiagonalPreconditioner {
     compute(mat);
   }
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+  constexpr Index rows() const noexcept { return m_invdiag.size(); }
+  constexpr Index cols() const noexcept { return m_invdiag.size(); }
 
   template <typename MatType>
   DiagonalPreconditioner& analyzePattern(const MatType&) {
diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index e3154b4..8fdeb84 100644
--- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -31,8 +31,6 @@ namespace internal {
 template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond, Index& iters,
               typename Dest::RealScalar& tol_error) {
-  using std::abs;
-  using std::sqrt;
   typedef typename Dest::RealScalar RealScalar;
   typedef typename Dest::Scalar Scalar;
   typedef Matrix<Scalar, Dynamic, 1> VectorType;
@@ -43,14 +41,15 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, const Precondition
   VectorType r = rhs - mat * x;
   VectorType r0 = r;
 
-  RealScalar r0_sqnorm = r0.squaredNorm();
-  RealScalar rhs_sqnorm = rhs.squaredNorm();
-  if (rhs_sqnorm == 0) {
+  RealScalar r0_norm = r0.stableNorm();
+  RealScalar r_norm = r0_norm;
+  RealScalar rhs_norm = rhs.stableNorm();
+  if (rhs_norm == 0) {
     x.setZero();
     return true;
   }
   Scalar rho(1);
-  Scalar alpha(1);
+  Scalar alpha(0);
   Scalar w(1);
 
   VectorType v = VectorType::Zero(n), p = VectorType::Zero(n);
@@ -59,21 +58,22 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, const Precondition
 
   VectorType s(n), t(n);
 
-  RealScalar tol2 = tol * tol * rhs_sqnorm;
-  RealScalar eps2 = NumTraits<Scalar>::epsilon() * NumTraits<Scalar>::epsilon();
+  RealScalar eps = NumTraits<Scalar>::epsilon();
   Index i = 0;
   Index restarts = 0;
 
-  while (r.squaredNorm() > tol2 && i < maxIters) {
+  while (r_norm > tol && i < maxIters) {
     Scalar rho_old = rho;
-
     rho = r0.dot(r);
-    if (abs(rho) < eps2 * r0_sqnorm) {
+    if (Eigen::numext::abs(rho) / Eigen::numext::maxi(r0_norm, r_norm) < eps * Eigen::numext::mini(r0_norm, r_norm)) {
       // The new residual vector became too orthogonal to the arbitrarily chosen direction r0
       // Let's restart with a new r0:
       r = rhs - mat * x;
       r0 = r;
-      rho = r0_sqnorm = r.squaredNorm();
+      rho = r.squaredNorm();
+      r0_norm = r.stableNorm();
+      alpha = Scalar(0);
+      w = Scalar(1);
       if (restarts++ == 0) i = 0;
     }
     Scalar beta = (rho / rho_old) * (alpha / w);
@@ -82,23 +82,38 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, const Precondition
     y = precond.solve(p);
 
     v.noalias() = mat * y;
-
-    alpha = rho / r0.dot(v);
+    Scalar theta = r0.dot(v);
+    // If r0 and v are numerically orthogonal, i.e. |cos ∠(r0, v)| < eps, restart with a random r0.
+    RealScalar v_norm = v.stableNorm();
+    if (Eigen::numext::abs(theta) / Eigen::numext::maxi(r0_norm, v_norm) < eps * Eigen::numext::mini(r0_norm, v_norm)) {
+      r = rhs - mat * x;
+      r0.setRandom();
+      r0_norm = r0.stableNorm();
+      rho = Scalar(1);
+      alpha = Scalar(0);
+      w = Scalar(1);
+      if (restarts++ == 0) i = 0;
+      continue;
+    }
+    alpha = rho / theta;
     s = r - alpha * v;
 
     z = precond.solve(s);
     t.noalias() = mat * z;
 
     RealScalar tmp = t.squaredNorm();
-    if (tmp > RealScalar(0))
+    if (tmp > RealScalar(0)) {
       w = t.dot(s) / tmp;
-    else
+    } else {
       w = Scalar(0);
+    }
     x += alpha * y + w * z;
     r = s - w * t;
+    r_norm = r.stableNorm();
     ++i;
   }
-  tol_error = sqrt(r.squaredNorm() / rhs_sqnorm);
+
+  tol_error = r_norm / rhs_norm;
   iters = i;
   return true;
 }
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index a97b905..dd40058 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -39,9 +39,9 @@ namespace Eigen {
  *
  * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$  be the scaled matrix on which the factorization is carried out,
  * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then, the factorization is directly
- * performed on the matrix B, and \sigma = 0. Otherwise, the factorization is performed on the shifted matrix \f$ B +
- * \sigma I \f$ for a shifting factor  \f$ \sigma \f$.  We start with \f$ \sigma = \sigma_0 - \beta \f$, where \f$
- * \sigma_0 \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$
+ * performed on the matrix B, and \f$ \sigma = 0 \f$. Otherwise, the factorization is performed on the shifted matrix
+ * \f$ B + \sigma I \f$ for a shifting factor  \f$ \sigma \f$.  We start with \f$ \sigma = \sigma_0 - \beta \f$, where
+ * \f$ \sigma_0 \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$
  * \sigma_0 = 10^{-3} \f$. If the factorization fails, then the shift in doubled until it succeed or a maximum of ten
  * attempts. If it still fails, as returned by the info() method, then you can either increase the initial shift, or
  * better use another preconditioning technique.
@@ -84,10 +84,10 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar, Up
   }
 
   /** \returns number of rows of the factored matrix */
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); }
+  constexpr Index rows() const noexcept { return m_L.rows(); }
 
   /** \returns number of columns of the factored matrix */
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); }
+  constexpr Index cols() const noexcept { return m_L.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 575a7b2..11ce5e5 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -129,9 +129,15 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<Scalar_, StorageInde
     compute(mat);
   }
 
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+  /** \brief Extraction Method for L-Factor */
+  const FactorType matrixL() const;
 
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+  /** \brief Extraction Method for U-Factor */
+  const FactorType matrixU() const;
+
+  constexpr Index rows() const noexcept { return m_lu.rows(); }
+
+  constexpr Index cols() const noexcept { return m_lu.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
@@ -207,6 +213,28 @@ void IncompleteLUT<Scalar, StorageIndex>::setFillfactor(int fillfactor) {
   this->m_fillfactor = fillfactor;
 }
 
+/**
+ * Extracts the L-Factor from the stored factorization; asserts that the factorization has been computed.
+ * \return L-Factor is a matrix containing the lower triangular part of the sparse matrix. All elements of the matrix
+ * above the main diagonal are zero. It is built from a \c UnitLower triangular view of \c m_lu.
+ **/
+template <typename Scalar, typename StorageIndex>
+const typename IncompleteLUT<Scalar, StorageIndex>::FactorType IncompleteLUT<Scalar, StorageIndex>::matrixL() const {
+  eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+  return m_lu.template triangularView<UnitLower>();
+}
+
+/**
+ * Extracts the U-Factor from the stored factorization; asserts that the factorization has been computed.
+ * \return U-Factor is a matrix containing the upper triangular part of the sparse matrix. All elements of the matrix
+ * below the main diagonal are zero. It is built from an \c Upper triangular view of \c m_lu.
+ **/
+template <typename Scalar, typename StorageIndex>
+const typename IncompleteLUT<Scalar, StorageIndex>::FactorType IncompleteLUT<Scalar, StorageIndex>::matrixU() const {
+  eigen_assert(m_factorizationIsOk && "Factorization must be computed first.");
+  return m_lu.template triangularView<Upper>();
+}
+
 template <typename Scalar, typename StorageIndex>
 template <typename MatrixType_>
 void IncompleteLUT<Scalar, StorageIndex>::analyzePattern(const MatrixType_& amat) {
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index cf85f2e..5caa396 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -218,10 +218,10 @@ class IterativeSolverBase : public SparseSolverBase<Derived> {
   }
 
   /** \internal */
-  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); }
+  constexpr Index rows() const noexcept { return matrix().rows(); }
 
   /** \internal */
-  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); }
+  constexpr Index cols() const noexcept { return matrix().cols(); }
 
   /** \returns the tolerance threshold used by the stopping criteria.
    * \sa setTolerance()
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 020241b..271679f 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -23,7 +23,7 @@ class SolveWithGuess;
  *
  * \brief Pseudo expression representing a solving operation
  *
- * \tparam Decomposition the type of the matrix or decomposion object
+ * \tparam Decomposition the type of the matrix or decomposition object
  * \tparam Rhstype the type of the right-hand side
  *
  * This class represents an expression of A.solve(B)
@@ -50,8 +50,8 @@ class SolveWithGuess : public internal::generic_xpr_base<SolveWithGuess<Decompos
   SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
       : m_dec(dec), m_rhs(rhs), m_guess(guess) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
index 574021d..9196022 100644
--- a/Eigen/src/KLUSupport/KLUSupport.h
+++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -100,8 +100,8 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
     if (m_numeric) klu_free_numeric(&m_numeric, &m_common);
   }
 
-  EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); }
-  EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); }
+  constexpr Index rows() const noexcept { return mp_matrix.rows(); }
+  constexpr Index cols() const noexcept { return mp_matrix.cols(); }
 
   /** \brief Reports whether previous computation was successful.
    *
@@ -150,7 +150,7 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
     factorize_impl();
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -182,7 +182,7 @@ class KLU : public SparseSolverBase<KLU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the pattern analysis has been performed.
    *
    * \sa analyzePattern(), compute()
    */
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 466834a..786cd76 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -78,6 +78,17 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
   typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> PermutationPType;
   typedef typename MatrixType::PlainObject PlainObject;
 
+  /** \brief Reports whether the LU factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
+    return Success;
+  }
+
   /**
    * \brief Default Constructor.
    *
@@ -243,7 +254,10 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
       the LU decomposition.
     */
   inline RealScalar rcond() const {
-    eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+    eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
+    if (!isInvertible()) {
+      return RealScalar(0);
+    }
     return internal::rcond_estimate_helper(m_l1_norm, *this);
   }
 
@@ -388,8 +402,8 @@ class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> >
 
   MatrixType reconstructedMatrix() const;
 
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
-  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lu.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_lu.cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
@@ -714,7 +728,7 @@ void FullPivLU<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType& rhs,
 
   // Step 2
   m_lu.topLeftCorner(smalldim, smalldim).template triangularView<UnitLower>().solveInPlace(c.topRows(smalldim));
-  if (rows > cols) c.bottomRows(rows - cols) -= m_lu.bottomRows(rows - cols) * c.topRows(cols);
+  if (rows > cols) c.bottomRows(rows - cols).noalias() -= m_lu.bottomRows(rows - cols) * c.topRows(cols);
 
   // Step 3
   m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index 57fd677..fe8859e 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -268,7 +268,7 @@ struct Assignment<DstXprType, Inverse<XprType>,
  * \note This matrix must be invertible, otherwise the result is undefined. If you need an
  * invertibility check, do the following:
  * \li for fixed sizes up to 4x4, use computeInverseAndDetWithCheck().
- * \li for the general case, use class FullPivLU.
+ * \li for the general case, use class PartialPivLU.
  *
  * Example: \include MatrixBase_inverse.cpp
  * Output: \verbinclude MatrixBase_inverse.out
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 1edd6b8..7ea14f5 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -90,6 +90,17 @@ class PartialPivLU : public SolverBase<PartialPivLU<MatrixType_, PermutationInde
   typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> TranspositionType;
   typedef typename MatrixType::PlainObject PlainObject;
 
+  /** \brief Reports whether the LU factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+    return Success;
+  }
+
   /**
    * \brief Default Constructor.
    *
@@ -210,8 +221,8 @@ class PartialPivLU : public SolverBase<PartialPivLU<MatrixType_, PermutationInde
 
   MatrixType reconstructedMatrix() const;
 
-  EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
-  EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
+  constexpr Index rows() const noexcept { return m_lu.rows(); }
+  constexpr Index cols() const noexcept { return m_lu.cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
index f0ddb2f..29c9b03 100644
--- a/Eigen/src/LU/arch/InverseSize4.h
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -24,7 +24,7 @@
 //
 //   Copyright (c) 2001 Intel Corporation.
 //
-// Permition is granted to use, copy, distribute and prepare derivative works
+// Permission is granted to use, copy, distribute and prepare derivative works
 // of this library for any purpose and without fee, provided, that the above
 // copyright notice and this statement appear in all copies.
 // Intel makes no representations about the suitability of this software for
diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 7bce3d5..f1ea2ee 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -46,8 +46,8 @@
 #ifndef EIGEN_COLAMD_H
 #define EIGEN_COLAMD_H
 
+namespace Eigen {
 namespace internal {
-
 namespace Colamd {
 
 /* Ensure that debugging is turned off: */
@@ -318,7 +318,7 @@ static inline void set_defaults(double knobs[NKnobs]) {
  *
  * \param n_row number of rows in A
  * \param n_col number of columns in A
- * \param Alen, size of the array A
+ * \param Alen size of the array A
  * \param A row indices of the matrix, of size ALen
  * \param p column pointers of A, of size n_col+1
  * \param knobs parameter settings for colamd
@@ -1374,7 +1374,7 @@ static inline void order_children(
 
         /* order this column */
         Col[c].shared2.order = order++;
-        /* collaps tree */
+        /* collapse tree */
         Col[c].shared1.parent = parent;
 
         /* get immediate parent of this column */
@@ -1685,6 +1685,6 @@ static inline IndexType clear_mark /* return the new value for tag_mark */
 }
 
 }  // namespace Colamd
-
 }  // namespace internal
+}  // namespace Eigen
 #endif
diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h
index 9a1c535..1a65007 100644
--- a/Eigen/src/OrderingMethods/Ordering.h
+++ b/Eigen/src/OrderingMethods/Ordering.h
@@ -13,11 +13,9 @@
 
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"
-
-namespace Eigen {
-
 #include "Eigen_Colamd.h"
 
+namespace Eigen {
 namespace internal {
 
 /** \internal
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 67c1167..c4ca6d3 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -147,7 +147,7 @@ class PardisoImpl : public SparseSolverBase<Derived> {
    * See the PARDISO manual to know how to use it. */
   ParameterType& pardisoParameterArray() { return m_iparm; }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -157,7 +157,7 @@ class PardisoImpl : public SparseSolverBase<Derived> {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index cae9ae4..d173444 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -82,6 +82,17 @@ class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_,
   typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
   typedef typename MatrixType::PlainObject PlainObject;
 
+  /** \brief Reports whether the QR factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return Success;
+  }
+
   /** \brief Default Constructor.
    *
    * The default constructor is useful in cases in which the user intends to
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index e297372..497085d 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -75,6 +75,17 @@ class HouseholderQR : public SolverBase<HouseholderQR<MatrixType_>> {
   typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>>
       HouseholderSequenceType;
 
+  /** \brief Reports whether the QR factorization was successful.
+   *
+   * \note This function always returns \c Success; it is provided only for interface compatibility
+   * with other factorization routines. It asserts (eigen_assert) that the object is initialized.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+    return Success;
+  }
+
   /**
    * \brief Default Constructor.
    *
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index f80ddc0..db1e4a2 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -50,26 +50,6 @@ struct traits<BDCSVD<MatrixType_, Options> > : svd_traits<MatrixType_, Options>
   typedef MatrixType_ MatrixType;
 };
 
-template <typename MatrixType, int Options>
-struct allocate_small_svd {
-  static void run(JacobiSVD<MatrixType, Options>& smallSvd, Index rows, Index cols, unsigned int computationOptions) {
-    (void)computationOptions;
-    smallSvd = JacobiSVD<MatrixType, Options>(rows, cols);
-  }
-};
-
-EIGEN_DIAGNOSTICS(push)
-EIGEN_DISABLE_DEPRECATED_WARNING
-
-template <typename MatrixType>
-struct allocate_small_svd<MatrixType, 0> {
-  static void run(JacobiSVD<MatrixType>& smallSvd, Index rows, Index cols, unsigned int computationOptions) {
-    smallSvd = JacobiSVD<MatrixType>(rows, cols, computationOptions);
-  }
-};
-
-EIGEN_DIAGNOSTICS(pop)
-
 }  // end namespace internal
 
 /** \ingroup SVD_Module
@@ -164,42 +144,48 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * Like the default constructor but with preallocation of the internal data
    * according to the specified problem size and the \a computationOptions.
    *
-   * One \b cannot request unitiaries using both the \a Options template parameter
+   * One \b cannot request unitaries using both the \a Options template parameter
    * and the constructor. If possible, prefer using the \a Options template parameter.
    *
-   * \param computationOptions specifification for computing Thin/Full unitaries U/V
+   * \param rows number of rows for the input matrix
+   * \param cols number of columns for the input matrix
+   * \param computationOptions specification for computing Thin/Full unitaries U/V
    * \sa BDCSVD()
    *
    * \deprecated Will be removed in the next major Eigen version. Options should
    * be specified in the \a Options template parameter.
    */
-  EIGEN_DEPRECATED BDCSVD(Index rows, Index cols, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD(Index rows, Index cols, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, rows, cols);
     allocate(rows, cols, computationOptions);
   }
 
   /** \brief Constructor performing the decomposition of given matrix, using the custom options specified
-   *         with the \a Options template paramter.
+   *         with the \a Options template parameter.
    *
    * \param matrix the matrix to decompose
    */
-  BDCSVD(const MatrixType& matrix) : m_algoswap(16), m_numIters(0) {
+  template <typename Derived>
+  BDCSVD(const MatrixBase<Derived>& matrix) : m_algoswap(16), m_numIters(0) {
     compute_impl(matrix, internal::get_computation_options(Options));
   }
 
   /** \brief Constructor performing the decomposition of given matrix using specified options
    *         for computing unitaries.
    *
-   *  One \b cannot request unitiaries using both the \a Options template parameter
+   *  One \b cannot request unitaries using both the \a Options template parameter
    *  and the constructor. If possible, prefer using the \a Options template parameter.
    *
    * \param matrix the matrix to decompose
-   * \param computationOptions specifification for computing Thin/Full unitaries U/V
+   * \param computationOptions specification for computing Thin/Full unitaries U/V
    *
    * \deprecated Will be removed in the next major Eigen version. Options should
    * be specified in the \a Options template parameter.
    */
-  EIGEN_DEPRECATED BDCSVD(const MatrixType& matrix, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
+  template <typename Derived>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
     compute_impl(matrix, computationOptions);
   }
@@ -211,7 +197,10 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    *
    * \param matrix the matrix to decompose
    */
-  BDCSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); }
+  template <typename Derived>
+  BDCSVD& compute(const MatrixBase<Derived>& matrix) {
+    return compute_impl(matrix, m_computationOptions);
+  }
 
   /** \brief Method performing the decomposition of given matrix, as specified by
    *         the `computationOptions` parameter.
@@ -222,7 +211,9 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
    * \deprecated Will be removed in the next major Eigen version. Options should
    * be specified in the \a Options template parameter.
    */
-  EIGEN_DEPRECATED BDCSVD& compute(const MatrixType& matrix, unsigned int computationOptions) {
+  template <typename Derived>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
     return compute_impl(matrix, computationOptions);
   }
@@ -233,7 +224,8 @@ class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
   }
 
  private:
-  BDCSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions);
+  template <typename Derived>
+  BDCSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
   void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
   void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
   void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals,
@@ -289,7 +281,7 @@ void BDCSVD<MatrixType, Options>::allocate(Index rows, Index cols, unsigned int
   if (Base::allocate(rows, cols, computationOptions)) return;
 
   if (cols < m_algoswap)
-    internal::allocate_small_svd<MatrixType, ComputationOptions>::run(smallSvd, rows, cols, computationOptions);
+    smallSvd.allocate(rows, cols, Options == 0 ? computationOptions : internal::get_computation_options(Options));
 
   m_computed = MatrixXr::Zero(diagSize() + 1, diagSize());
   m_compU = computeV();
@@ -325,8 +317,13 @@ void BDCSVD<MatrixType, Options>::allocate(Index rows, Index cols, unsigned int
 }  // end allocate
 
 template <typename MatrixType, int Options>
-BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixType& matrix,
+template <typename Derived>
+BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
                                                                        unsigned int computationOptions) {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
+  EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
+                      Input matrix must have the same Scalar type as the BDCSVD object.);
+
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "\n\n\n================================================================================================="
                "=====================\n\n\n";
@@ -1048,7 +1045,7 @@ void BDCSVD<MatrixType, Options>::computeSingVals(const ArrayRef& col0, const Ar
       } else {
         // We have a problem as shifting on the left or right give either a positive or negative value
         // at the middle of [left,right]...
-        // Instead fo abbording or entering an infinite loop,
+        // Instead of aborting or entering an infinite loop,
         // let's just use the middle as the estimated zero-crossing:
         muCur = (right - left) * RealScalar(0.5);
         // we can test exact equality here, because shift comes from `... ? left : right`
@@ -1233,7 +1230,7 @@ void BDCSVD<MatrixType, Options>::deflation44(Index firstColu, Index firstColm,
   using std::conj;
   using std::pow;
   using std::sqrt;
-  
+
   RealScalar s = m_computed(firstColm + i, firstColm);
   RealScalar c = m_computed(firstColm + j, firstColm);
   RealScalar r = numext::hypot(c, s);
@@ -1424,8 +1421,7 @@ void BDCSVD<MatrixType, Options>::deflation(Index firstCol, Index lastCol, Index
       if ((diag(i) - diag(i - 1)) < epsilon_strict) {
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
         std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i - 1)
-                  << " == " << (diag(i) - diag(i - 1)) << " < "
-                  << epsilon_strict << "\n";
+                  << " == " << (diag(i) - diag(i - 1)) << " < " << epsilon_strict << "\n";
 #endif
         eigen_internal_assert(abs(diag(i) - diag(i - 1)) < epsilon_coarse &&
                               " diagonal entries are not properly sorted");
diff --git a/Eigen/src/SVD/BDCSVD_LAPACKE.h b/Eigen/src/SVD/BDCSVD_LAPACKE.h
index 89d5cbd..5d2b8c7 100644
--- a/Eigen/src/SVD/BDCSVD_LAPACKE.h
+++ b/Eigen/src/SVD/BDCSVD_LAPACKE.h
@@ -58,7 +58,8 @@ class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
   // construct this by moving from a parent object
   BDCSVD_LAPACKE(SVD&& svd) : SVD(std::move(svd)) {}
 
-  void compute_impl_lapacke(const MatrixType& matrix, unsigned int computationOptions) {
+  template <typename Derived>
+  void compute_impl_lapacke(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
     SVD::allocate(matrix.rows(), matrix.cols(), computationOptions);
 
     SVD::m_nonzeroSingularValues = SVD::m_diagSize;
@@ -120,8 +121,8 @@ class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
   }
 };
 
-template <typename MatrixType_, int Options>
-BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixType_& matrix,
+template <typename MatrixType_, int Options, typename Derived>
+BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixBase<Derived>& matrix,
                                              int computationOptions) {
   // we need to move to the wrapper type and back
   BDCSVD_LAPACKE<MatrixType_, Options> tmpSvd(std::move(svd));
@@ -134,12 +135,13 @@ BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd,
 
 }  // end namespace internal
 
-#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS)                                                                 \
-  template <>                                                                                                          \
-  inline BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>&                              \
-  BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl(                       \
-      const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) { \
-    return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions);                               \
+#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS)                                           \
+  template <>                                                                                    \
+  template <typename Derived>                                                                    \
+  inline BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>&        \
+  BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
+      const MatrixBase<Derived>& matrix, unsigned int computationOptions) {                      \
+    return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions);         \
   }
 
 #define EIGEN_LAPACK_SDD_OPTIONS(OPTIONS)        \
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 086d750..da2f295 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -547,23 +547,29 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
    * One \b cannot request unitaries using both the \a Options template parameter
    * and the constructor. If possible, prefer using the \a Options template parameter.
    *
+   * \param rows number of rows for the input matrix
+   * \param cols number of columns for the input matrix
    * \param computationOptions specify whether to compute Thin/Full unitaries U/V
    * \sa JacobiSVD()
    *
    * \deprecated Will be removed in the next major Eigen version. Options should
    * be specified in the \a Options template parameter.
    */
-  EIGEN_DEPRECATED JacobiSVD(Index rows, Index cols, unsigned int computationOptions) {
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  JacobiSVD(Index rows, Index cols, unsigned int computationOptions) {
     internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, rows, cols);
     allocate(rows, cols, computationOptions);
   }
 
   /** \brief Constructor performing the decomposition of given matrix, using the custom options specified
-   *         with the \a Options template paramter.
+   *         with the \a Options template parameter.
    *
    * \param matrix the matrix to decompose
    */
-  explicit JacobiSVD(const MatrixType& matrix) { compute_impl(matrix, internal::get_computation_options(Options)); }
+  template <typename Derived>
+  explicit JacobiSVD(const MatrixBase<Derived>& matrix) {
+    compute_impl(matrix, internal::get_computation_options(Options));
+  }
 
   /** \brief Constructor performing the decomposition of given matrix using specified options
    *         for computing unitaries.
@@ -578,8 +584,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
    * be specified in the \a Options template parameter.
    */
   // EIGEN_DEPRECATED // TODO(cantonios): re-enable after fixing a few 3p libraries that error on deprecation warnings.
-  JacobiSVD(const MatrixType& matrix, unsigned int computationOptions) {
-    internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
+  template <typename Derived>
+  JacobiSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+    internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(computationOptions, matrix.rows(),
+                                                                         matrix.cols());
     compute_impl(matrix, computationOptions);
   }
 
@@ -588,7 +596,10 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
    *
    * \param matrix the matrix to decompose
    */
-  JacobiSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); }
+  template <typename Derived>
+  JacobiSVD& compute(const MatrixBase<Derived>& matrix) {
+    return compute_impl(matrix, m_computationOptions);
+  }
 
   /** \brief Method performing the decomposition of given matrix, as specified by
    *         the `computationOptions` parameter.
@@ -599,8 +610,11 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
    * \deprecated Will be removed in the next major Eigen version. Options should
    * be specified in the \a Options template parameter.
    */
-  EIGEN_DEPRECATED JacobiSVD& compute(const MatrixType& matrix, unsigned int computationOptions) {
-    internal::check_svd_options_assertions<MatrixType, Options>(m_computationOptions, matrix.rows(), matrix.cols());
+  template <typename Derived>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  JacobiSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+    internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(computationOptions, matrix.rows(),
+                                                                         matrix.cols());
     return compute_impl(matrix, computationOptions);
   }
 
@@ -611,7 +625,6 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
   using Base::rank;
   using Base::rows;
 
- private:
   void allocate(Index rows_, Index cols_, unsigned int computationOptions) {
     if (Base::allocate(rows_, cols_, computationOptions)) return;
     eigen_assert(!(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)) &&
@@ -624,7 +637,9 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
     if (rows() > cols()) m_qr_precond_morerows.allocate(*this);
   }
 
-  JacobiSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions);
+ private:
+  template <typename Derived>
+  JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
 
  protected:
   using Base::m_computationOptions;
@@ -662,8 +677,13 @@ class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
 };
 
 template <typename MatrixType, int Options>
-JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixType& matrix,
+template <typename Derived>
+JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
                                                                              unsigned int computationOptions) {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
+  EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
+                      Input matrix must have the same Scalar type as the JacobiSVD object.);
+
   using std::abs;
 
   allocate(matrix.rows(), matrix.cols(), computationOptions);
diff --git a/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
index df6a096..db26366 100644
--- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h
+++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -40,65 +40,65 @@ namespace Eigen {
 
 /** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS)    \
-  template <>                                                                                                          \
-  inline JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>&                           \
-  JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl(                    \
-      const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) { \
-    typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType;                                 \
-    /*typedef MatrixType::Scalar Scalar;*/                                                                             \
-    /*typedef MatrixType::RealScalar RealScalar;*/                                                                     \
-    allocate(matrix.rows(), matrix.cols(), computationOptions);                                                        \
-                                                                                                                       \
-    /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/                                     \
-    m_nonzeroSingularValues = diagSize();                                                                              \
-                                                                                                                       \
-    lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt;                             \
-    lapack_int matrix_order = LAPACKE_COLROW;                                                                          \
-    char jobu, jobvt;                                                                                                  \
-    LAPACKE_TYPE *u, *vt, dummy;                                                                                       \
-    jobu = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N';                                                      \
-    jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N';                                                     \
-    if (computeU()) {                                                                                                  \
-      ldu = internal::convert_index<lapack_int>(m_matrixU.outerStride());                                              \
-      u = (LAPACKE_TYPE*)m_matrixU.data();                                                                             \
-    } else {                                                                                                           \
-      ldu = 1;                                                                                                         \
-      u = &dummy;                                                                                                      \
-    }                                                                                                                  \
-    MatrixType localV;                                                                                                 \
-    lapack_int vt_rows = (m_computeFullV)   ? internal::convert_index<lapack_int>(cols())                              \
-                         : (m_computeThinV) ? internal::convert_index<lapack_int>(diagSize())                          \
-                                            : 1;                                                                       \
-    if (computeV()) {                                                                                                  \
-      localV.resize(vt_rows, cols());                                                                                  \
-      ldvt = internal::convert_index<lapack_int>(localV.outerStride());                                                \
-      vt = (LAPACKE_TYPE*)localV.data();                                                                               \
-    } else {                                                                                                           \
-      ldvt = 1;                                                                                                        \
-      vt = &dummy;                                                                                                     \
-    }                                                                                                                  \
-    Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb;                                                                    \
-    superb.resize(diagSize(), 1);                                                                                      \
-    MatrixType m_temp;                                                                                                 \
-    m_temp = matrix;                                                                                                   \
-    lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd(                                                                 \
-        matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(rows()),                                        \
-        internal::convert_index<lapack_int>(cols()), (LAPACKE_TYPE*)m_temp.data(), lda,                                \
-        (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data());                                     \
-    /* Check the result of the LAPACK call */                                                                          \
-    if (info < 0 || !m_singularValues.allFinite()) {                                                                   \
-      m_info = InvalidInput;                                                                                           \
-    } else if (info > 0) {                                                                                             \
-      m_info = NoConvergence;                                                                                          \
-    } else {                                                                                                           \
-      m_info = Success;                                                                                                \
-      if (computeV()) m_matrixV = localV.adjoint();                                                                    \
-    }                                                                                                                  \
-    /* for(int i=0;i<diagSize();i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--;        \
-     * m_singularValues.coeffRef(i)=RealScalar(0);}*/                                                                  \
-    m_isInitialized = true;                                                                                            \
-    return *this;                                                                                                      \
+#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS) \
+  template <>                                                                                                       \
+  template <typename Derived>                                                                                       \
+  inline JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>&                        \
+  JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl(                 \
+      const MatrixBase<Derived>& matrix, unsigned int computationOptions) {                                         \
+    /*typedef MatrixType::Scalar Scalar;*/                                                                          \
+    /*typedef MatrixType::RealScalar RealScalar;*/                                                                  \
+    allocate(matrix.rows(), matrix.cols(), computationOptions);                                                     \
+                                                                                                                    \
+    /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/                                  \
+    m_nonzeroSingularValues = diagSize();                                                                           \
+                                                                                                                    \
+    lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt;                          \
+    lapack_int matrix_order = LAPACKE_COLROW;                                                                       \
+    char jobu, jobvt;                                                                                               \
+    LAPACKE_TYPE *u, *vt, dummy;                                                                                    \
+    jobu = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N';                                                   \
+    jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N';                                                  \
+    if (computeU()) {                                                                                               \
+      ldu = internal::convert_index<lapack_int>(m_matrixU.outerStride());                                           \
+      u = (LAPACKE_TYPE*)m_matrixU.data();                                                                          \
+    } else {                                                                                                        \
+      ldu = 1;                                                                                                      \
+      u = &dummy;                                                                                                   \
+    }                                                                                                               \
+    MatrixType localV;                                                                                              \
+    lapack_int vt_rows = (m_computeFullV)   ? internal::convert_index<lapack_int>(cols())                           \
+                         : (m_computeThinV) ? internal::convert_index<lapack_int>(diagSize())                       \
+                                            : 1;                                                                    \
+    if (computeV()) {                                                                                               \
+      localV.resize(vt_rows, cols());                                                                               \
+      ldvt = internal::convert_index<lapack_int>(localV.outerStride());                                             \
+      vt = (LAPACKE_TYPE*)localV.data();                                                                            \
+    } else {                                                                                                        \
+      ldvt = 1;                                                                                                     \
+      vt = &dummy;                                                                                                  \
+    }                                                                                                               \
+    Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb;                                                                 \
+    superb.resize(diagSize(), 1);                                                                                   \
+    MatrixType m_temp;                                                                                              \
+    m_temp = matrix;                                                                                                \
+    lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd(                                                              \
+        matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(rows()),                                     \
+        internal::convert_index<lapack_int>(cols()), (LAPACKE_TYPE*)m_temp.data(), lda,                             \
+        (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data());                                  \
+    /* Check the result of the LAPACK call */                                                                       \
+    if (info < 0 || !m_singularValues.allFinite()) {                                                                \
+      m_info = InvalidInput;                                                                                        \
+    } else if (info > 0) {                                                                                          \
+      m_info = NoConvergence;                                                                                       \
+    } else {                                                                                                        \
+      m_info = Success;                                                                                             \
+      if (computeV()) m_matrixV = localV.adjoint();                                                                 \
+    }                                                                                                               \
+    /* for(int i=0;i<diagSize();i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--;     \
+     * m_singularValues.coeffRef(i)=RealScalar(0);}*/                                                               \
+    m_isInitialized = true;                                                                                         \
+    return *this;                                                                                                   \
   }
 
 #define EIGEN_LAPACK_SVD_OPTIONS(OPTIONS)                                                            \
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index d1ad63d..dcb4dba 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -379,7 +379,7 @@ void SVDBase<Derived>::_solve_impl(const RhsType& rhs, DstType& dst) const {
   Index l_rank = rank();
   tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs;
   tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
-  dst = m_matrixV.leftCols(l_rank) * tmp;
+  dst.noalias() = m_matrixV.leftCols(l_rank) * tmp;
 }
 
 template <typename Derived>
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index d78b30b..6df6318 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -172,7 +172,7 @@ void upperbidiagonalization_blocked_helper(
     // 1 - update the k-th column of A
     SubColumnType v_k = A.col(k).tail(remainingRows);
     v_k -= V_k1 * Y.row(k).head(k).adjoint();
-    if (k) v_k -= X_k1 * A.col(k).head(k);
+    if (k) v_k.noalias() -= X_k1 * A.col(k).head(k);
 
     // 2 - construct left Householder transform in-place
     v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
@@ -203,7 +203,7 @@ void upperbidiagonalization_blocked_helper(
       SubRowType u_k(A.row(k).tail(remainingCols));
       u_k = u_k.conjugate();
       {
-        u_k -= Y_k * A.row(k).head(k + 1).adjoint();
+        u_k.noalias() -= Y_k * A.row(k).head(k + 1).adjoint();
         if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
       }
 
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h
index f3ce975..d8e2944 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -134,7 +134,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> {
       << "\n";
     s << "  tree:     " << ((total += m_parent.size() * sizeof(int)) >> 20) << "Mb"
       << "\n";
-    s << "  nonzeros: " << ((total += m_nonZerosPerCol.size() * sizeof(int)) >> 20) << "Mb"
+    s << "  nonzeros: " << ((total += m_workSpace.size() * sizeof(int)) >> 20) << "Mb"
       << "\n";
     s << "  perm:     " << ((total += m_P.size() * sizeof(int)) >> 20) << "Mb"
       << "\n";
@@ -240,7 +240,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> {
   CholMatrixType m_matrix;
   VectorType m_diag;  // the diagonal coefficients (LDLT mode)
   VectorI m_parent;   // elimination tree
-  VectorI m_nonZerosPerCol;
+  VectorI m_workSpace;
   PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_P;     // the permutation
   PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pinv;  // the inverse permutation
 
@@ -406,7 +406,7 @@ class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<MatrixType_, U
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -416,7 +416,7 @@ class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<MatrixType_, U
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
@@ -494,7 +494,7 @@ class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<MatrixType_,
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -504,7 +504,7 @@ class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<MatrixType_,
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
@@ -575,7 +575,7 @@ class SimplicialNonHermitianLLT
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -585,7 +585,7 @@ class SimplicialNonHermitianLLT
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
@@ -664,7 +664,7 @@ class SimplicialNonHermitianLDLT
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -674,7 +674,7 @@ class SimplicialNonHermitianLDLT
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
@@ -742,7 +742,7 @@ class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<Matr
     return *this;
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -757,7 +757,7 @@ class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<Matr
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
@@ -830,7 +830,7 @@ void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMat
   const Index size = a.rows();
   pmat = &ap;
   // Note that ordering methods compute the inverse permutation
-  if (!internal::is_same<OrderingType, NaturalOrdering<Index> >::value) {
+  if (!internal::is_same<OrderingType, NaturalOrdering<StorageIndex> >::value) {
     {
       CholMatrixType C;
       internal::permute_symm_to_fullsymm<UpLo, NonHermitian>(a, C, NULL);
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 0b13c56..3c65541 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -25,40 +25,269 @@ the Mozilla Public License v. 2.0, as stated at the top of this file.
 
 namespace Eigen {
 
-template <typename Derived>
-void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT) {
-  const StorageIndex size = StorageIndex(ap.rows());
-  m_matrix.resize(size, size);
-  m_parent.resize(size);
-  m_nonZerosPerCol.resize(size);
+namespace internal {
 
-  ei_declare_aligned_stack_constructed_variable(StorageIndex, tags, size, 0);
+template <typename Scalar, typename StorageIndex>
+struct simpl_chol_helper {
+  using CholMatrixType = SparseMatrix<Scalar, ColMajor, StorageIndex>;
+  using InnerIterator = typename CholMatrixType::InnerIterator;
+  using VectorI = Matrix<StorageIndex, Dynamic, 1>;
+  static constexpr StorageIndex kEmpty = -1;
 
-  for (StorageIndex k = 0; k < size; ++k) {
-    /* L(k,:) pattern: all nodes reachable in etree from nz in A(0:k-1,k) */
-    m_parent[k] = -1;        /* parent of k is not yet known */
-    tags[k] = k;             /* mark node k as visited */
-    m_nonZerosPerCol[k] = 0; /* count of nonzeros in column k of L */
-    for (typename CholMatrixType::InnerIterator it(ap, k); it; ++it) {
-      StorageIndex i = it.index();
-      if (i < k) {
-        /* follow path from i to root of etree, stop at flagged node */
-        for (; tags[i] != k; i = m_parent[i]) {
-          /* find parent of i if not yet determined */
-          if (m_parent[i] == -1) m_parent[i] = k;
-          m_nonZerosPerCol[i]++; /* L (k,i) is nonzero */
-          tags[i] = k;           /* mark i as visited */
+  // Implementation of a stack or last-in first-out structure with some debugging machinery.
+  struct Stack {
+    StorageIndex* m_data;
+    Index m_size;
+#ifndef EIGEN_NO_DEBUG
+    const Index m_maxSize;
+    Stack(StorageIndex* data, StorageIndex size, StorageIndex maxSize)
+        : m_data(data), m_size(size), m_maxSize(maxSize) {
+      eigen_assert(size >= 0);
+      eigen_assert(maxSize >= size);
+    }
+#else
+    Stack(StorageIndex* data, StorageIndex size, StorageIndex /*maxSize*/) : m_data(data), m_size(size) {}
+#endif
+    bool empty() const { return m_size == 0; }
+    Index size() const { return m_size; }
+    StorageIndex back() const {
+      eigen_assert(m_size > 0);
+      return m_data[m_size - 1];
+    }
+    void push(const StorageIndex& value) {
+#ifndef EIGEN_NO_DEBUG
+      eigen_assert(m_size < m_maxSize);
+#endif
+      m_data[m_size] = value;
+      m_size++;
+    }
+    void pop() {
+      eigen_assert(m_size > 0);
+      m_size--;
+    }
+  };
+
+  // Implementation of a disjoint-set or union-find structure with path compression.
+  struct DisjointSet {
+    StorageIndex* m_set;
+    DisjointSet(StorageIndex* set, StorageIndex size) : m_set(set) { std::iota(set, set + size, 0); }
+    // Find the set representative or root of `u`.
+    StorageIndex find(StorageIndex u) const {
+      eigen_assert(u != kEmpty);
+      while (m_set[u] != u) {
+        // manually unroll the loop by a factor of 2 to improve performance
+        u = m_set[m_set[u]];
+      }
+      return u;
+    }
+    // Perform full path compression such that each node from `u` to `v` points to `v`.
+    void compress(StorageIndex u, StorageIndex v) {
+      eigen_assert(u != kEmpty);
+      eigen_assert(v != kEmpty);
+      while (m_set[u] != v) {
+        StorageIndex next = m_set[u];
+        m_set[u] = v;
+        u = next;
+      }
+    };
+  };
+
+  // Computes the higher adjacency pattern by transposing the input lower adjacency matrix.
+  // Only the index arrays are calculated, as the values are not needed for the symbolic factorization.
+  // The outer index array provides the size requirements of the inner index array.
+
+  // Computes the outer index array of the higher adjacency matrix.
+  static void calc_hadj_outer(const StorageIndex size, const CholMatrixType& ap, StorageIndex* outerIndex) {
+    for (StorageIndex j = 1; j < size; j++) {
+      for (InnerIterator it(ap, j); it; ++it) {
+        StorageIndex i = it.index();
+        if (i < j) outerIndex[i + 1]++;
+      }
+    }
+    std::partial_sum(outerIndex, outerIndex + size + 1, outerIndex);
+  }
+
+  // Computes the inner index array of the higher adjacency matrix.
+  static void calc_hadj_inner(const StorageIndex size, const CholMatrixType& ap, const StorageIndex* outerIndex,
+                              StorageIndex* innerIndex, StorageIndex* tmp) {
+    std::fill_n(tmp, size, 0);
+
+    for (StorageIndex j = 1; j < size; j++) {
+      for (InnerIterator it(ap, j); it; ++it) {
+        StorageIndex i = it.index();
+        if (i < j) {
+          StorageIndex b = outerIndex[i] + tmp[i];
+          innerIndex[b] = j;
+          tmp[i]++;
+        }
+      }
+    }
+  }
+
+  // Adapted from:
+  // Joseph W. Liu. (1986).
+  // A compact row storage scheme for Cholesky factors using elimination trees.
+  // ACM Trans. Math. Softw. 12, 2 (June 1986), 127-148. https://doi.org/10.1145/6497.6499
+
+  // Computes the elimination forest of the lower adjacency matrix, a compact representation of the sparse L factor.
+  // The L factor may contain multiple elimination trees if a column contains only its diagonal element.
+  // Each elimination tree is an n-ary tree in which each node points to its parent.
+  static void calc_etree(const StorageIndex size, const CholMatrixType& ap, StorageIndex* parent, StorageIndex* tmp) {
+    std::fill_n(parent, size, kEmpty);
+
+    DisjointSet ancestor(tmp, size);
+
+    for (StorageIndex j = 1; j < size; j++) {
+      for (InnerIterator it(ap, j); it; ++it) {
+        StorageIndex i = it.index();
+        if (i < j) {
+          StorageIndex r = ancestor.find(i);
+          if (r != j) parent[r] = j;
+          ancestor.compress(i, j);
+        }
+      }
+    }
+  }
+
+  // Computes the child pointers of the parent tree to facilitate a depth-first search traversal.
+  static void calc_lineage(const StorageIndex size, const StorageIndex* parent, StorageIndex* firstChild,
+                           StorageIndex* firstSibling) {
+    std::fill_n(firstChild, size, kEmpty);
+    std::fill_n(firstSibling, size, kEmpty);
+
+    for (StorageIndex j = 0; j < size; j++) {
+      StorageIndex p = parent[j];
+      if (p == kEmpty) continue;
+      StorageIndex c = firstChild[p];
+      if (c == kEmpty)
+        firstChild[p] = j;
+      else {
+        while (firstSibling[c] != kEmpty) c = firstSibling[c];
+        firstSibling[c] = j;
+      }
+    }
+  }
+
+  // Computes a post-ordered traversal of the elimination tree.
+  static void calc_post(const StorageIndex size, const StorageIndex* parent, StorageIndex* firstChild,
+                        const StorageIndex* firstSibling, StorageIndex* post, StorageIndex* dfs) {
+    Stack post_stack(post, 0, size);
+    for (StorageIndex j = 0; j < size; j++) {
+      if (parent[j] != kEmpty) continue;
+      // Begin at a root
+      Stack dfs_stack(dfs, 0, size);
+      dfs_stack.push(j);
+      while (!dfs_stack.empty()) {
+        StorageIndex i = dfs_stack.back();
+        StorageIndex c = firstChild[i];
+        if (c == kEmpty) {
+          post_stack.push(i);
+          dfs_stack.pop();
+        } else {
+          dfs_stack.push(c);
+          // Remove the path from `i` to `c` for future traversals.
+          firstChild[i] = firstSibling[c];
+        }
+      }
+    }
+    eigen_assert(post_stack.size() == size);
+    eigen_assert(std::all_of(firstChild, firstChild + size, [](StorageIndex a) { return a == kEmpty; }));
+  }
+
+  // Adapted from:
+  // Gilbert, J. R., Ng, E., & Peyton, B. W. (1994).
+  // An efficient algorithm to compute row and column counts for sparse Cholesky factorization.
+  // SIAM Journal on Matrix Analysis and Applications, 15(4), 1075-1091.
+
+  // Computes the per-column non-zero counts of the L factor (one fewer per column when doLDLT is set).
+  static void calc_colcount(const StorageIndex size, const StorageIndex* hadjOuter, const StorageIndex* hadjInner,
+                            const StorageIndex* parent, StorageIndex* prevLeaf, StorageIndex* tmp,
+                            const StorageIndex* post, StorageIndex* nonZerosPerCol, bool doLDLT) {
+    // initialize nonZerosPerCol with 1 for leaves, 0 for non-leaves
+    std::fill_n(nonZerosPerCol, size, 1);
+    for (StorageIndex j = 0; j < size; j++) {
+      StorageIndex p = parent[j];
+      // p is not a leaf
+      if (p != kEmpty) nonZerosPerCol[p] = 0;
+    }
+
+    DisjointSet parentSet(tmp, size);
+    // prevLeaf is already initialized
+    eigen_assert(std::all_of(prevLeaf, prevLeaf + size, [](StorageIndex a) { return a == kEmpty; }));
+
+    for (StorageIndex j_ = 0; j_ < size; j_++) {
+      StorageIndex j = post[j_];
+      nonZerosPerCol[j] += hadjOuter[j + 1] - hadjOuter[j];
+      for (StorageIndex k = hadjOuter[j]; k < hadjOuter[j + 1]; k++) {
+        StorageIndex i = hadjInner[k];
+        eigen_assert(i > j);
+        StorageIndex prev = prevLeaf[i];
+        if (prev != kEmpty) {
+          StorageIndex q = parentSet.find(prev);
+          parentSet.compress(prev, q);
+          nonZerosPerCol[q]--;
         }
+        prevLeaf[i] = j;
       }
+      StorageIndex p = parent[j];
+      if (p != kEmpty) parentSet.compress(j, p);
     }
+
+    for (StorageIndex j = 0; j < size; j++) {
+      StorageIndex p = parent[j];
+      if (p != kEmpty) nonZerosPerCol[p] += nonZerosPerCol[j] - 1;
+      if (doLDLT) nonZerosPerCol[j]--;
+    }
+  }
+
+  // Finalizes the non zero pattern of the L factor and allocates the memory for the factorization.
+  static void init_matrix(const StorageIndex size, const StorageIndex* nonZerosPerCol, CholMatrixType& L) {
+    eigen_assert(L.outerIndexPtr()[0] == 0);
+    std::partial_sum(nonZerosPerCol, nonZerosPerCol + size, L.outerIndexPtr() + 1);
+    L.resizeNonZeros(L.outerIndexPtr()[size]);
+  }
+
+  // Driver routine for the symbolic sparse Cholesky factorization.
+  static void run(const StorageIndex size, const CholMatrixType& ap, CholMatrixType& L, VectorI& parent,
+                  VectorI& workSpace, bool doLDLT) {
+    parent.resize(size);
+    workSpace.resize(4 * size);
+    L.resize(size, size);
+
+    StorageIndex* tmp1 = workSpace.data();
+    StorageIndex* tmp2 = workSpace.data() + size;
+    StorageIndex* tmp3 = workSpace.data() + 2 * size;
+    StorageIndex* tmp4 = workSpace.data() + 3 * size;
+
+    // Borrow L's outer index array for the higher adjacency pattern.
+    StorageIndex* hadj_outer = L.outerIndexPtr();
+    calc_hadj_outer(size, ap, hadj_outer);
+    // Request additional temporary storage for the inner indices of the higher adjacency pattern.
+    ei_declare_aligned_stack_constructed_variable(StorageIndex, hadj_inner, hadj_outer[size], nullptr);
+    calc_hadj_inner(size, ap, hadj_outer, hadj_inner, tmp1);
+
+    calc_etree(size, ap, parent.data(), tmp1);
+    calc_lineage(size, parent.data(), tmp1, tmp2);
+    calc_post(size, parent.data(), tmp1, tmp2, tmp3, tmp4);
+    calc_colcount(size, hadj_outer, hadj_inner, parent.data(), tmp1, tmp2, tmp3, tmp4, doLDLT);
+    init_matrix(size, tmp4, L);
   }
+};
+
+// Symbol is ODR-used, so we need a definition.
+template <typename Scalar, typename StorageIndex>
+constexpr StorageIndex simpl_chol_helper<Scalar, StorageIndex>::kEmpty;
+
+}  // namespace internal
+
+template <typename Derived>
+void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT) {
+  using Helper = internal::simpl_chol_helper<Scalar, StorageIndex>;
 
-  /* construct Lp index array from m_nonZerosPerCol column counts */
-  StorageIndex* Lp = m_matrix.outerIndexPtr();
-  Lp[0] = 0;
-  for (StorageIndex k = 0; k < size; ++k) Lp[k + 1] = Lp[k] + m_nonZerosPerCol[k] + (doLDLT ? 0 : 1);
+  eigen_assert(ap.innerSize() == ap.outerSize());
+  const StorageIndex size = internal::convert_index<StorageIndex>(ap.outerSize());
 
-  m_matrix.resizeNonZeros(Lp[size]);
+  Helper::run(size, ap, m_matrix, m_parent, m_workSpace, doLDLT);
 
   m_isInitialized = true;
   m_info = Success;
@@ -70,20 +299,21 @@ template <typename Derived>
 template <bool DoLDLT, bool NonHermitian>
 void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& ap) {
   using std::sqrt;
+  const StorageIndex size = StorageIndex(ap.rows());
 
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
   eigen_assert(ap.rows() == ap.cols());
-  eigen_assert(m_parent.size() == ap.rows());
-  eigen_assert(m_nonZerosPerCol.size() == ap.rows());
+  eigen_assert(m_parent.size() == size);
+  eigen_assert(m_workSpace.size() >= 3 * size);
 
-  const StorageIndex size = StorageIndex(ap.rows());
   const StorageIndex* Lp = m_matrix.outerIndexPtr();
   StorageIndex* Li = m_matrix.innerIndexPtr();
   Scalar* Lx = m_matrix.valuePtr();
 
   ei_declare_aligned_stack_constructed_variable(Scalar, y, size, 0);
-  ei_declare_aligned_stack_constructed_variable(StorageIndex, pattern, size, 0);
-  ei_declare_aligned_stack_constructed_variable(StorageIndex, tags, size, 0);
+  StorageIndex* nonZerosPerCol = m_workSpace.data();
+  StorageIndex* pattern = m_workSpace.data() + size;
+  StorageIndex* tags = m_workSpace.data() + 2 * size;
 
   bool ok = true;
   m_diag.resize(DoLDLT ? size : 0);
@@ -93,7 +323,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
     y[k] = Scalar(0);         // Y(0:k) is now all zero
     StorageIndex top = size;  // stack for pattern is empty
     tags[k] = k;              // mark node k as visited
-    m_nonZerosPerCol[k] = 0;  // count of nonzeros in column k of L
+    nonZerosPerCol[k] = 0;    // count of nonzeros in column k of L
     for (typename CholMatrixType::InnerIterator it(ap, k); it; ++it) {
       StorageIndex i = it.index();
       if (i <= k) {
@@ -124,13 +354,13 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
       else
         yi = l_ki = yi / Lx[Lp[i]];
 
-      Index p2 = Lp[i] + m_nonZerosPerCol[i];
+      Index p2 = Lp[i] + nonZerosPerCol[i];
       Index p;
       for (p = Lp[i] + (DoLDLT ? 0 : 1); p < p2; ++p) y[Li[p]] -= getSymm(Lx[p]) * yi;
       d -= getDiag(l_ki * getSymm(yi));
       Li[p] = k; /* store L(k,i) in column form of L */
       Lx[p] = l_ki;
-      ++m_nonZerosPerCol[i]; /* increment count of nonzeros in col i */
+      ++nonZerosPerCol[i]; /* increment count of nonzeros in col i */
     }
     if (DoLDLT) {
       m_diag[k] = d;
@@ -139,7 +369,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
         break;
       }
     } else {
-      Index p = Lp[k] + m_nonZerosPerCol[k]++;
+      Index p = Lp[k] + nonZerosPerCol[k]++;
       Li[p] = k; /* store L(k,k) = sqrt (d) in column k */
       if (NonHermitian ? d == RealScalar(0) : numext::real(d) <= RealScalar(0)) {
         ok = false; /* failure, matrix is not positive definite */
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index 6858263..7fcf2c2 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -834,7 +834,7 @@ struct sparse_disjunction_evaluator<XprType, IteratorBased, IndexBased> : evalua
   const XprType& m_expr;
 };
 
-// when DupFunc is wrapped with scalar_dup_op, use disjunction evaulator
+// when DupFunc is wrapped with scalar_dup_op, use disjunction evaluator
 template <typename T1, typename T2, typename DupFunc, typename Lhs, typename Rhs>
 struct binary_evaluator<CwiseBinaryOp<scalar_disjunction_op<DupFunc, T1, T2>, Lhs, Rhs>, IteratorBased, IteratorBased>
     : sparse_disjunction_evaluator<CwiseBinaryOp<scalar_disjunction_op<DupFunc, T1, T2>, Lhs, Rhs> > {
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index db70810..17ce596 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -45,7 +45,6 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
 
     Index n = lhs.outerSize();
 #ifdef EIGEN_HAS_OPENMP
-    Eigen::initParallel();
     Index threads = Eigen::nbThreads();
 #endif
 
@@ -125,7 +124,6 @@ struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType,
     LhsEval lhsEval(lhs);
 
 #ifdef EIGEN_HAS_OPENMP
-    Eigen::initParallel();
     Index threads = Eigen::nbThreads();
     // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
     // It basically represents the minimal amount of work to be done to be worth it.
diff --git a/Eigen/src/SparseCore/SparseDot.h b/Eigen/src/SparseCore/SparseDot.h
index f040915..485605f 100644
--- a/Eigen/src/SparseCore/SparseDot.h
+++ b/Eigen/src/SparseCore/SparseDot.h
@@ -36,10 +36,10 @@ inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot
   Scalar res1(0);
   Scalar res2(0);
   for (; i; ++i) {
-    res1 += numext::conj(i.value()) * other.coeff(i.index());
+    res1 = numext::madd<Scalar>(numext::conj(i.value()), other.coeff(i.index()), res1);
     ++i;
     if (i) {
-      res2 += numext::conj(i.value()) * other.coeff(i.index());
+      res2 = numext::madd<Scalar>(numext::conj(i.value()), other.coeff(i.index()), res2);
     }
   }
   return res1 + res2;
@@ -67,7 +67,7 @@ inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot
   Scalar res(0);
   while (i && j) {
     if (i.index() == j.index()) {
-      res += numext::conj(i.value()) * j.value();
+      res = numext::madd<Scalar>(numext::conj(i.value()), j.value(), res);
       ++i;
       ++j;
     } else if (i.index() < j.index())
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 849970a..8fcdfdf 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -202,9 +202,9 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
   inline StorageIndex* innerNonZeroPtr() { return m_innerNonZeros; }
 
   /** \internal */
-  inline Storage& data() { return m_data; }
+  constexpr Storage& data() { return m_data; }
   /** \internal */
-  inline const Storage& data() const { return m_data; }
+  constexpr const Storage& data() const { return m_data; }
 
   /** \returns the value of the matrix at position \a i, \a j
    * This function returns Scalar(0) if the element is an explicit \em zero */
@@ -250,7 +250,7 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
       }
     }
     if ((dst < end) && (m_data.index(dst) == inner)) {
-      // this coefficient exists, return a refernece to it
+      // this coefficient exists, return a reference to it
       if (inserted != nullptr) {
         *inserted = false;
       }
@@ -302,9 +302,10 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
    */
   inline void setZero() {
     m_data.clear();
-    std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+    using std::fill_n;
+    fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
     if (m_innerNonZeros) {
-      std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+      fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
     }
   }
 
@@ -506,7 +507,7 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
 
   // insert empty outer vectors at indices j, j+1 ... j+num-1 and resize the matrix
   void insertEmptyOuterVectors(Index j, Index num = 1) {
-    EIGEN_USING_STD(fill_n);
+    using std::fill_n;
     eigen_assert(num >= 0 && j >= 0 && j < m_outerSize && "Invalid parameters");
 
     const Index newRows = IsRowMajor ? m_outerSize + num : rows();
@@ -563,6 +564,8 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
   /** \internal
    * same as insert(Index,Index) except that the indices are given relative to the storage order */
   Scalar& insertByOuterInner(Index j, Index i) {
+    eigen_assert(j >= 0 && j < m_outerSize && "invalid outer index");
+    eigen_assert(i >= 0 && i < m_innerSize && "invalid inner index");
     Index start = m_outerIndex[j];
     Index end = isCompressed() ? m_outerIndex[j + 1] : start + m_innerNonZeros[j];
     Index dst = start == end ? end : m_data.searchLowerIndex(start, end, i);
@@ -619,10 +622,12 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
   void uncompress() {
     if (!isCompressed()) return;
     m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
-    if (m_outerIndex[m_outerSize] == 0)
-      std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
-    else
+    if (m_outerIndex[m_outerSize] == 0) {
+      using std::fill_n;
+      fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+    } else {
       for (Index j = 0; j < m_outerSize; j++) m_innerNonZeros[j] = m_outerIndex[j + 1] - m_outerIndex[j];
+    }
   }
 
   /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
@@ -693,9 +698,10 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
 
       if (outerChange > 0) {
         StorageIndex lastIdx = m_outerSize == 0 ? StorageIndex(0) : m_outerIndex[m_outerSize];
-        std::fill_n(m_outerIndex + m_outerSize, outerChange + 1, lastIdx);
+        using std::fill_n;
+        fill_n(m_outerIndex + m_outerSize, outerChange + 1, lastIdx);
 
-        if (!isCompressed()) std::fill_n(m_innerNonZeros + m_outerSize, outerChange, StorageIndex(0));
+        if (!isCompressed()) fill_n(m_innerNonZeros + m_outerSize, outerChange, StorageIndex(0));
       }
     }
     m_outerSize = newOuterSize;
@@ -739,7 +745,8 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
     internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
     m_innerNonZeros = 0;
 
-    std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+    using std::fill_n;
+    fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
   }
 
   /** \internal
@@ -827,6 +834,8 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
     std::swap(m_innerNonZeros, other.m_innerNonZeros);
     m_data.swap(other.m_data);
   }
+  /** Free-function swap. */
+  friend EIGEN_DEVICE_FUNC void swap(SparseMatrix& a, SparseMatrix& b) { a.swap(b); }
 
   /** Sets *this to the identity matrix.
    * This function also turns the matrix into compressed mode, and drop any reserved memory. */
@@ -839,7 +848,8 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
     m_data.squeeze();
     std::iota(m_outerIndex, m_outerIndex + m_outerSize + 1, StorageIndex(0));
     std::iota(innerIndexPtr(), innerIndexPtr() + m_outerSize, StorageIndex(0));
-    std::fill_n(valuePtr(), m_outerSize, Scalar(1));
+    using std::fill_n;
+    fill_n(valuePtr(), m_outerSize, Scalar(1));
   }
 
   inline SparseMatrix& operator=(const SparseMatrix& other) {
@@ -865,7 +875,6 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
     return *this;
   }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename OtherDerived>
   inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other) {
     return Base::operator=(other.derived());
@@ -873,7 +882,6 @@ class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_,
 
   template <typename Lhs, typename Rhs>
   inline SparseMatrix& operator=(const Product<Lhs, Rhs, AliasFreeProduct>& other);
-#endif  // EIGEN_PARSED_BY_DOXYGEN
 
   template <typename OtherDerived>
   EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);
@@ -1122,7 +1130,11 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   using TransposedSparseMatrix =
       SparseMatrix<typename SparseMatrixType::Scalar, IsRowMajor ? ColMajor : RowMajor, StorageIndex>;
 
-  if (begin == end) return;
+  if (begin == end) {
+    // Clear out existing data (if any).
+    mat.setZero();
+    return;
+  }
 
   // There are two strategies to consider for constructing a matrix from unordered triplets:
   // A) construct the 'mat' in its native storage order and sort in-place (less memory); or,
@@ -1224,8 +1236,8 @@ void set_from_triplets_sorted(const InputIterator& begin, const InputIterator& e
   // matrix is finalized
 }
 
-// thin wrapper around a generic binary functor to use the sparse disjunction evaulator instead of the default
-// "arithmetic" evaulator
+// thin wrapper around a generic binary functor to use the sparse disjunction evaluator instead of the default
+// "arithmetic" evaluator
 template <typename DupFunctor, typename LhsScalar, typename RhsScalar = LhsScalar>
 struct scalar_disjunction_op {
   using result_type = typename result_of<DupFunctor(LhsScalar, RhsScalar)>::type;
@@ -1631,7 +1643,7 @@ SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressedAtByOuterInner(I
   // first, check if there is adequate allocated memory
   if (m_data.allocatedSize() <= m_data.size()) {
     // if there is no capacity for a single insertion, double the capacity
-    // increase capacity by a mininum of 32
+    // increase capacity by a minimum of 32
     Index minReserve = 32;
     Index reserveSize = numext::maxi(minReserve, m_data.allocatedSize());
     m_data.reserve(reserveSize);
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index b58bb38..ccbbe98 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -118,7 +118,6 @@ class SparseMatrixBase : public EigenBase<Derived> {
   // FIXME storage order do not match evaluator storage order
   typedef SparseMatrix<Scalar, Flags & RowMajorBit ? RowMajor : ColMajor, StorageIndex> PlainObject;
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
   /** This is the "real scalar" type; if the \a Scalar type is already real numbers
    * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
    * \a Scalar is \a std::complex<T> then RealScalar is \a T.
@@ -127,6 +126,7 @@ class SparseMatrixBase : public EigenBase<Derived> {
    */
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
   /** \internal the return type of coeff()
    */
   typedef std::conditional_t<HasDirectAccess_, const Scalar&, Scalar> CoeffReturnType;
@@ -224,33 +224,84 @@ class SparseMatrixBase : public EigenBase<Derived> {
  public:
 #ifndef EIGEN_NO_IO
   friend std::ostream& operator<<(std::ostream& s, const SparseMatrixBase& m) {
-    typedef typename Derived::Nested Nested;
-    typedef internal::remove_all_t<Nested> NestedCleaned;
+    using Nested = typename Derived::Nested;
+    using NestedCleaned = typename internal::remove_all<Nested>::type;
 
     if (Flags & RowMajorBit) {
       Nested nm(m.derived());
       internal::evaluator<NestedCleaned> thisEval(nm);
+
+      // compute global width
+      std::size_t width = 0;
+      {
+        std::ostringstream ss0;
+        ss0.copyfmt(s);
+        ss0 << Scalar(0);
+        width = ss0.str().size();
+        for (Index row = 0; row < nm.outerSize(); ++row) {
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it) {
+            std::ostringstream ss;
+            ss.copyfmt(s);
+            ss << it.value();
+
+            const std::size_t potential_width = ss.str().size();
+            if (potential_width > width) width = potential_width;
+          }
+        }
+      }
+
       for (Index row = 0; row < nm.outerSize(); ++row) {
         Index col = 0;
         for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it) {
-          for (; col < it.index(); ++col) s << "0 ";
+          for (; col < it.index(); ++col) {
+            s.width(width);
+            s << Scalar(0) << " ";
+          }
+          s.width(width);
           s << it.value() << " ";
           ++col;
         }
-        for (; col < m.cols(); ++col) s << "0 ";
+        for (; col < m.cols(); ++col) {
+          s.width(width);
+          s << Scalar(0) << " ";
+        }
         s << std::endl;
       }
     } else {
       Nested nm(m.derived());
       internal::evaluator<NestedCleaned> thisEval(nm);
       if (m.cols() == 1) {
+        // compute local width (single col)
+        std::size_t width = 0;
+        {
+          std::ostringstream ss0;
+          ss0.copyfmt(s);
+          ss0 << Scalar(0);
+          width = ss0.str().size();
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it) {
+            std::ostringstream ss;
+            ss.copyfmt(s);
+            ss << it.value();
+
+            const std::size_t potential_width = ss.str().size();
+            if (potential_width > width) width = potential_width;
+          }
+        }
+
         Index row = 0;
         for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it) {
-          for (; row < it.index(); ++row) s << "0" << std::endl;
+          for (; row < it.index(); ++row) {
+            s.width(width);
+            s << Scalar(0) << std::endl;
+          }
+          s.width(width);
           s << it.value() << std::endl;
           ++row;
         }
-        for (; row < m.rows(); ++row) s << "0" << std::endl;
+        for (; row < m.rows(); ++row) {
+          s.width(width);
+          s << Scalar(0) << std::endl;
+        }
       } else {
         SparseMatrix<Scalar, RowMajorBit, StorageIndex> trans = m;
         s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, StorageIndex> >&>(trans);
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 3402bae..05b3de5 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -143,7 +143,7 @@ class SparseSelfAdjointView : public EigenBase<SparseSelfAdjointView<MatrixType,
     return *this = src.twistedBy(pnull);
   }
 
-  // Since we override the copy-assignment operator, we need to explicitly re-declare the copy-constructor
+  // Since we override the copy-assignment operator, we need to explicitly redeclare the copy-constructor
   EIGEN_DEFAULT_COPY_CONSTRUCTOR(SparseSelfAdjointView)
 
   template <typename SrcMatrixType, unsigned int SrcMode>
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index fac162e..d19a00d 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -90,9 +90,9 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
   inline StorageIndex* innerNonZeroPtr() { return 0; }
 
   /** \internal */
-  inline Storage& data() { return m_data; }
+  constexpr Storage& data() { return m_data; }
   /** \internal */
-  inline const Storage& data() const { return m_data; }
+  constexpr const Storage& data() const { return m_data; }
 
   inline Scalar coeff(Index row, Index col) const {
     eigen_assert(IsColVector ? (col == 0 && row >= 0 && row < m_size) : (row == 0 && col >= 0 && col < m_size));
@@ -109,7 +109,7 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
   }
 
   /** \returns a reference to the coefficient value at given index \a i
-   * This operation involes a log(rho*size) binary search. If the coefficient does not
+   * This operation involves a log(rho*size) binary search. If the coefficient does not
    * exist yet, then a sorted insertion into a sequential buffer is performed.
    *
    * This insertion might be very costly if the number of nonzeros above \a i is large.
@@ -278,6 +278,7 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
     std::swap(m_size, other.m_size);
     m_data.swap(other.m_data);
   }
+  friend EIGEN_DEVICE_FUNC void swap(SparseVector& a, SparseVector& b) { a.swap(b); }
 
   template <int OtherOptions>
   inline void swap(SparseMatrix<Scalar, OtherOptions, StorageIndex>& other) {
@@ -285,6 +286,14 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
     std::swap(m_size, other.m_innerSize);
     m_data.swap(other.m_data);
   }
+  template <int OtherOptions>
+  friend EIGEN_DEVICE_FUNC void swap(SparseVector& a, SparseMatrix<Scalar, OtherOptions, StorageIndex>& b) {
+    a.swap(b);
+  }
+  template <int OtherOptions>
+  friend EIGEN_DEVICE_FUNC void swap(SparseMatrix<Scalar, OtherOptions, StorageIndex>& a, SparseVector& b) {
+    b.swap(a);
+  }
 
   inline SparseVector& operator=(const SparseVector& other) {
     if (other.isRValue()) {
@@ -345,40 +354,40 @@ class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_,
 
  public:
   /** \internal \deprecated use setZero() and reserve() */
-  EIGEN_DEPRECATED void startFill(Index reserve) {
+  EIGEN_DEPRECATED_WITH_REASON("Use .setZero() and .reserve() instead.") void startFill(Index reserve) {
     setZero();
     m_data.reserve(reserve);
   }
 
   /** \internal \deprecated use insertBack(Index,Index) */
-  EIGEN_DEPRECATED Scalar& fill(Index r, Index c) {
+  EIGEN_DEPRECATED_WITH_REASON("Use .insertBack() instead.") Scalar& fill(Index r, Index c) {
     eigen_assert(r == 0 || c == 0);
     return fill(IsColVector ? r : c);
   }
 
   /** \internal \deprecated use insertBack(Index) */
-  EIGEN_DEPRECATED Scalar& fill(Index i) {
+  EIGEN_DEPRECATED_WITH_REASON("Use .insertBack() instead.") Scalar& fill(Index i) {
     m_data.append(0, i);
     return m_data.value(m_data.size() - 1);
   }
 
   /** \internal \deprecated use insert(Index,Index) */
-  EIGEN_DEPRECATED Scalar& fillrand(Index r, Index c) {
+  EIGEN_DEPRECATED_WITH_REASON("Use .insert() instead.") Scalar& fillrand(Index r, Index c) {
     eigen_assert(r == 0 || c == 0);
     return fillrand(IsColVector ? r : c);
   }
 
   /** \internal \deprecated use insert(Index) */
-  EIGEN_DEPRECATED Scalar& fillrand(Index i) { return insert(i); }
+  EIGEN_DEPRECATED_WITH_REASON("Use .insert() instead.") Scalar& fillrand(Index i) { return insert(i); }
 
   /** \internal \deprecated use finalize() */
-  EIGEN_DEPRECATED void endFill() {}
+  EIGEN_DEPRECATED_WITH_REASON("Use .finalize() instead.") void endFill() {}
 
   // These two functions were here in the 3.1 release, so let's keep them in case some code rely on them.
   /** \internal \deprecated use data() */
-  EIGEN_DEPRECATED Storage& _data() { return m_data; }
+  EIGEN_DEPRECATED_WITH_REASON("Use .data() instead.") Storage& _data() { return m_data; }
   /** \internal \deprecated use data() */
-  EIGEN_DEPRECATED const Storage& _data() const { return m_data; }
+  EIGEN_DEPRECATED_WITH_REASON("Use .data() instead.") const Storage& _data() const { return m_data; }
 
 #ifdef EIGEN_SPARSEVECTOR_PLUGIN
 #include EIGEN_SPARSEVECTOR_PLUGIN
diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h
index 7753a24..684de48 100644
--- a/Eigen/src/SparseCore/TriangularSolver.h
+++ b/Eigen/src/SparseCore/TriangularSolver.h
@@ -41,7 +41,7 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Lower, RowMajor> {
           lastVal = it.value();
           lastIndex = it.index();
           if (lastIndex == i) break;
-          tmp -= lastVal * other.coeff(lastIndex, col);
+          tmp = numext::madd<Scalar>(-lastVal, other.coeff(lastIndex, col), tmp);
         }
         if (Mode & UnitDiag)
           other.coeffRef(i, col) = tmp;
@@ -75,7 +75,7 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Upper, RowMajor> {
         } else if (it && it.index() == i)
           ++it;
         for (; it; ++it) {
-          tmp -= it.value() * other.coeff(it.index(), col);
+          tmp = numext::madd<Scalar>(-it.value(), other.coeff(it.index(), col), tmp);
         }
 
         if (Mode & UnitDiag)
@@ -107,7 +107,9 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Lower, ColMajor> {
             tmp /= it.value();
           }
           if (it && it.index() == i) ++it;
-          for (; it; ++it) other.coeffRef(it.index(), col) -= tmp * it.value();
+          for (; it; ++it) {
+            other.coeffRef(it.index(), col) = numext::madd<Scalar>(-tmp, it.value(), other.coeffRef(it.index(), col));
+          }
         }
       }
     }
@@ -135,7 +137,9 @@ struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Upper, ColMajor> {
             other.coeffRef(i, col) /= it.value();
           }
           LhsIterator it(lhsEval, i);
-          for (; it && it.index() < i; ++it) other.coeffRef(it.index(), col) -= tmp * it.value();
+          for (; it && it.index() < i; ++it) {
+            other.coeffRef(it.index(), col) = numext::madd<Scalar>(-tmp, it.value(), other.coeffRef(it.index(), col));
+          }
         }
       }
     }
@@ -215,9 +219,13 @@ struct sparse_solve_triangular_sparse_selector<Lhs, Rhs, Mode, UpLo, ColMajor> {
           tempVector.restart();
           if (IsLower) {
             if (it.index() == i) ++it;
-            for (; it; ++it) tempVector.coeffRef(it.index()) -= ci * it.value();
+            for (; it; ++it) {
+              tempVector.coeffRef(it.index()) = numext::madd<Scalar>(-ci, it.value(), tempVector.coeffRef(it.index()));
+            }
           } else {
-            for (; it && it.index() < i; ++it) tempVector.coeffRef(it.index()) -= ci * it.value();
+            for (; it && it.index() < i; ++it) {
+              tempVector.coeffRef(it.index()) = numext::madd<Scalar>(-ci, it.value(), tempVector.coeffRef(it.index()));
+            }
           }
         }
       }
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index 29be01a..cc69a42 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -257,7 +257,7 @@ class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
   /** \brief Give the number of rows.
    */
   inline Index rows() const { return m_mat.rows(); }
-  /** \brief Give the numver of columns.
+  /** \brief Give the number of columns.
    */
   inline Index cols() const { return m_mat.cols(); }
   /** \brief Let you set that the pattern of the input matrix is symmetric
@@ -600,7 +600,7 @@ void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
  * This exit was 0 if successful factorization.
  * > 0 if info = i, and i is been completed, but the factor U is exactly singular,
  * and division by zero will occur if it is used to solve a system of equation.
- * > A->ncol: number of bytes allocated when memory allocation failure occured, plus A->ncol.
+ * > A->ncol: number of bytes allocated when memory allocation failure occurred, plus A->ncol.
  * If lwork = -1, it is the estimated amount of space needed, plus A->ncol.
  *
  * It seems that A was the name of the matrix in the past.
diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h
index 2afab01..85ba884 100644
--- a/Eigen/src/SparseLU/SparseLU_Structs.h
+++ b/Eigen/src/SparseLU/SparseLU_Structs.h
@@ -50,7 +50,7 @@
  *  values.
  *
  *  The last column structures (for pruning) will be removed
- *  after the numercial LU factorization phase.
+ *  after the numerical LU factorization phase.
  *
  *   (xlusup,lusup): lusup[*] contains the numerical values of the
  *  rectangular supernodes; xlusup[j] points to the starting
diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h
index e5fb771..71a9ff4 100644
--- a/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -30,15 +30,15 @@
 #ifndef SPARSELU_COLUMN_DFS_H
 #define SPARSELU_COLUMN_DFS_H
 
-template <typename Scalar, typename StorageIndex>
-class SparseLUImpl;
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"
 
 namespace Eigen {
-
 namespace internal {
 
+template <typename Scalar, typename StorageIndex>
+class SparseLUImpl;
+
 template <typename IndexVector, typename ScalarVector>
 struct column_dfs_traits : no_assignment_operator {
   typedef typename ScalarVector::Scalar Scalar;
diff --git a/Eigen/src/SparseLU/SparseLU_pivotL.h b/Eigen/src/SparseLU/SparseLU_pivotL.h
index ada511e..10a090b 100644
--- a/Eigen/src/SparseLU/SparseLU_pivotL.h
+++ b/Eigen/src/SparseLU/SparseLU_pivotL.h
@@ -37,7 +37,7 @@ namespace Eigen {
 namespace internal {
 
 /**
- * \brief Performs the numerical pivotin on the current column of L, and the CDIV operation.
+ * \brief Performs the numerical pivoting on the current column of L, and the CDIV operation.
  *
  * Pivot policy :
  * (1) Compute thresh = u * max_(i>=j) abs(A_ij);
diff --git a/Eigen/src/SparseLU/SparseLU_pruneL.h b/Eigen/src/SparseLU/SparseLU_pruneL.h
index 4f51d59..620f285 100644
--- a/Eigen/src/SparseLU/SparseLU_pruneL.h
+++ b/Eigen/src/SparseLU/SparseLU_pruneL.h
@@ -101,7 +101,7 @@ void SparseLUImpl<Scalar, StorageIndex>::pruneL(const Index jcol, const IndexVec
             kmin++;
           else {
             // kmin below pivrow (not yet pivoted), and kmax
-            // above pivrow: interchange the two suscripts
+            // above pivrow: interchange the two subscripts
             std::swap(glu.lsub(kmin), glu.lsub(kmax));
 
             // If the supernode has only one column, then we
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index acb0c5f..4dc7aa9 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -73,7 +73,7 @@ struct traits<SparseQR_QProduct<SparseQRType, Derived> > {
  * detailed in the following paper:
  * <i>
  * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
- * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011.
+ * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011.
  * </i>
  * Even though it is qualified as "rank-revealing", this strategy might fail for some
  * rank deficient problems. When this class is used to solve linear or least-square problems
@@ -365,7 +365,6 @@ void SparseQR<MatrixType, OrderingType>::factorize(const MatrixType& mat) {
   IndexVector Ridx(n), Qidx(m);  // Store temporarily the row indexes for the current column of R and Q
   Index nzcolR, nzcolQ;          // Number of nonzero for the current column of R and Q
   ScalarVector tval(m);          // The dense vector used to compute the current column
-  RealScalar pivotThreshold = m_threshold;
 
   m_R.setZero();
   m_Q.setZero();
@@ -401,11 +400,14 @@ void SparseQR<MatrixType, OrderingType>::factorize(const MatrixType& mat) {
    * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
    * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3
    */
+  RealScalar pivotThreshold;
   if (m_useDefaultThreshold) {
     RealScalar max2Norm = 0.0;
     for (int j = 0; j < n; j++) max2Norm = numext::maxi(max2Norm, m_pmat.col(j).norm());
     if (max2Norm == RealScalar(0)) max2Norm = RealScalar(1);
     pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
+  } else {
+    pivotThreshold = m_threshold;
   }
 
   // Initialize the numerical permutation
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 0c10149..158dadd 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -65,6 +65,24 @@ DECL_GSSVX(z, double, std::complex<double>)
 #ifdef EIGEN_SUPERLU_HAS_ILU
 
 // similarly for the incomplete factorization using gsisx
+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
+#define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
+  extern "C" {                                                                                                         \
+  extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
+                            SuperMatrix *, SuperMatrix *, void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *,      \
+                            FLOATTYPE *, GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *);                         \
+  }                                                                                                                    \
+  inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree,         \
+                             char *equed, FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L, SuperMatrix *U, void *work,      \
+                             int lwork, SuperMatrix *B, SuperMatrix *X, FLOATTYPE *recip_pivot_growth,                 \
+                             FLOATTYPE *rcond, SuperLUStat_t *stats, int *info, KEYTYPE) {                             \
+    mem_usage_t mem_usage;                                                                                             \
+    GlobalLU_t gLU;                                                                                                    \
+    PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L, U, work, lwork, B, X, recip_pivot_growth, rcond,  \
+                  &gLU, &mem_usage, stats, info);                                                                      \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
+  }
+#else  // version < 5.0
 #define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
   extern "C" {                                                                                                         \
   extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
@@ -80,6 +98,7 @@ DECL_GSSVX(z, double, std::complex<double>)
                   &mem_usage, stats, info);                                                                            \
     return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
   }
+#endif
 
 DECL_GSISX(s, float, float)
 DECL_GSISX(c, float, std::complex<float>)
@@ -320,7 +339,7 @@ class SuperLUBase : public SparseSolverBase<Derived> {
     derived().factorize(matrix);
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -454,7 +473,7 @@ class SuperLU : public SuperLUBase<MatrixType_, SuperLU<MatrixType_> > {
 
   ~SuperLU() {}
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -468,7 +487,7 @@ class SuperLU : public SuperLUBase<MatrixType_, SuperLU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
@@ -762,7 +781,7 @@ class SuperILU : public SuperLUBase<MatrixType_, SuperILU<MatrixType_> > {
 
   ~SuperILU() {}
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -772,7 +791,7 @@ class SuperILU : public SuperLUBase<MatrixType_, SuperILU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the symbolic decomposition has been performed.
    *
    * \sa analyzePattern()
    */
diff --git a/Eigen/src/ThreadPool/CoreThreadPoolDevice.h b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
new file mode 100644
index 0000000..c603a38
--- /dev/null
+++ b/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
@@ -0,0 +1,336 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
+#define EIGEN_CORE_THREAD_POOL_DEVICE_H
+
+namespace Eigen {
+
+// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
+// Threadpool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for
+// delegating the task to a thread.
+/*
+                 a
+                / \
+               /   \
+              /     \
+             /       \
+            /         \
+           /           \
+          /             \
+         a               e
+        / \             / \
+       /   \           /   \
+      /     \         /     \
+     a       c       e       g
+    / \     / \     / \     / \
+   /   \   /   \   /   \   /   \
+  a     b c     d e     f g     h
+*/
+// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the
+// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the number
+// of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The
+// primary task 'a' creates tasks 'e' 'c' and 'b', and executes its portion of the expression at the bottom of the
+// tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.
+
+struct CoreThreadPoolDevice {
+  using Task = std::function<void()>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)
+      : m_pool(pool) {
+    eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");
+    m_costFactor = threadCostThreshold;
+  }
+
+  template <int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {
+    eigen_assert(cost >= 0.0f && "cost must be non-negative");
+    Index numOps = size / PacketSize;
+    int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();
+    float totalCost = static_cast<float>(numOps) * cost;
+    float idealThreads = totalCost * m_costFactor;
+    if (idealThreads < static_cast<float>(actualThreads)) {
+      idealThreads = numext::maxi(idealThreads, 1.0f);
+      actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));
+    }
+    int maxLevel = internal::log2_ceil(actualThreads);
+    return maxLevel;
+  }
+
+// MSVC does not like inlining parallelForImpl
+#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
+#define EIGEN_PARALLEL_FOR_INLINE
+#else
+#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
+#endif
+
+  template <typename UnaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,
+                                                                   Barrier& barrier, int level) {
+    while (level > 0) {
+      level--;
+      Index size = end - begin;
+      eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");
+      Index mid = begin + numext::round_down(size >> 1, PacketSize);
+      Task right = [this, mid, end, &f, &barrier, level]() {
+        parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);
+      };
+      m_pool.Schedule(std::move(right));
+      end = mid;
+    }
+    for (Index i = begin; i < end; i += PacketSize) f(i);
+    barrier.Notify();
+  }
+
+  template <typename BinaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,
+                                                                   Index innerEnd, BinaryFunctor& f, Barrier& barrier,
+                                                                   int level) {
+    while (level > 0) {
+      level--;
+      Index outerSize = outerEnd - outerBegin;
+      if (outerSize > 1) {
+        Index outerMid = outerBegin + (outerSize >> 1);
+        Task right = [this, &f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() {
+          parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);
+        };
+        m_pool.Schedule(std::move(right));
+        outerEnd = outerMid;
+      } else {
+        Index innerSize = innerEnd - innerBegin;
+        eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");
+        Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);
+        Task right = [this, &f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() {
+          parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);
+        };
+        m_pool.Schedule(std::move(right));
+        innerEnd = innerMid;
+      }
+    }
+    for (Index outer = outerBegin; outer < outerEnd; outer++)
+      for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);
+    barrier.Notify();
+  }
+
+#undef EIGEN_PARALLEL_FOR_INLINE
+
+  template <typename UnaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {
+    Index size = end - begin;
+    int maxLevel = calculateLevels<PacketSize>(size, cost);
+    Barrier barrier(1 << maxLevel);
+    parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);
+    barrier.Wait();
+  }
+
+  template <typename BinaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,
+                                                         Index innerEnd, BinaryFunctor& f, float cost) {
+    Index outerSize = outerEnd - outerBegin;
+    Index innerSize = innerEnd - innerBegin;
+    Index size = outerSize * innerSize;
+    int maxLevel = calculateLevels<PacketSize>(size, cost);
+    Barrier barrier(1 << maxLevel);
+    parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);
+    barrier.Wait();
+  }
+
+  ThreadPool& m_pool;
+  // m_costFactor represents the cost of delegating a task to a thread; it is kept in
+  // inverse form so that calculateLevels can multiply by it and avoid a floating point division
+  float m_costFactor;
+};
+
+// specialization of coefficient-wise assignment loops for CoreThreadPoolDevice
+
+namespace internal {
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+struct Kernel;
+#endif
+
+template <typename Kernel>
+struct cost_helper {
+  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
+  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
+  using SrcXprType = typename SrcEvaluatorType::XprType;
+  using DstXprType = typename DstEvaluatorType::XprType;
+  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->assignCoeffByOuterInner(outer, inner);
+    }
+  };
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {
+  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    AssignmentFunctor functor(kernel);
+    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
+    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
+                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+                         DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
+                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+                         DstAlignment = Kernel::AssignmentTraits::DstAlignment,
+                         InnerSize = DstXprType::InnerSizeAtCompileTime;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;
+  struct PacketAssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
+    }
+  };
+  struct ScalarAssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      const Index innerSize = this->innerSize();
+      const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
+      for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    const Index innerSize = kernel.innerSize();
+    const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
+    constexpr float packetCost = static_cast<float>(XprEvaluationCost);
+    const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);
+    PacketAssignmentFunctor packetFunctor(kernel);
+    ScalarAssignmentFunctor scalarFunctor(kernel);
+    device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
+                                                                     packetCost);
+    device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index size = kernel.size();
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,
+                         RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
+                         PacketSize = unpacket_traits<PacketType>::size,
+                         DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
+                         DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment
+                                                                               : Kernel::AssignmentTraits::DstAlignment,
+                         SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {
+      this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
+    }
+  };
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+  using head_loop =
+      unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index size = kernel.size();
+    const Index alignedStart =
+        DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
+    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
+
+    head_loop::run(kernel, 0, alignedStart);
+
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
+
+    tail_loop::run(kernel, alignedEnd, size);
+  }
+};
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CORE_THREAD_POOL_DEVICE_H
diff --git a/Eigen/src/ThreadPool/EventCount.h b/Eigen/src/ThreadPool/EventCount.h
index 0117b4b..6eda6f4 100644
--- a/Eigen/src/ThreadPool/EventCount.h
+++ b/Eigen/src/ThreadPool/EventCount.h
@@ -57,6 +57,9 @@ class EventCount {
     eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
   }
 
+  EventCount(const EventCount&) = delete;
+  void operator=(const EventCount&) = delete;
+
   ~EventCount() {
     // Ensure there are no waiters.
     eigen_plain_assert(state_.load() == kStackMask);
@@ -155,22 +158,6 @@ class EventCount {
     }
   }
 
-  class Waiter {
-    friend class EventCount;
-    // Align to 128 byte boundary to prevent false sharing with other Waiter
-    // objects in the same vector.
-    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
-    EIGEN_MUTEX mu;
-    EIGEN_CONDVAR cv;
-    uint64_t epoch = 0;
-    unsigned state = kNotSignaled;
-    enum {
-      kNotSignaled,
-      kWaiting,
-      kSignaled,
-    };
-  };
-
  private:
   // State_ layout:
   // - low kWaiterBits is a stack of waiters committed wait
@@ -192,9 +179,25 @@ class EventCount {
   static const uint64_t kEpochBits = 64 - kEpochShift;
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
-  std::atomic<uint64_t> state_;
-  MaxSizeVector<Waiter>& waiters_;
 
+ public:
+  class Waiter {
+    friend class EventCount;
+
+    enum State {
+      kNotSignaled,
+      kWaiting,
+      kSignaled,
+    };
+
+    EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<uint64_t> next{kStackMask};
+    EIGEN_MUTEX mu;
+    EIGEN_CONDVAR cv;
+    uint64_t epoch{0};
+    unsigned state{kNotSignaled};
+  };
+
+ private:
   static void CheckState(uint64_t state, bool waiter = false) {
     static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
     const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
@@ -229,8 +232,8 @@ class EventCount {
     }
   }
 
-  EventCount(const EventCount&) = delete;
-  void operator=(const EventCount&) = delete;
+  std::atomic<uint64_t> state_;
+  MaxSizeVector<Waiter>& waiters_;
 };
 
 }  // namespace Eigen
diff --git a/Eigen/src/ThreadPool/ForkJoin.h b/Eigen/src/ThreadPool/ForkJoin.h
new file mode 100644
index 0000000..588636a
--- /dev/null
+++ b/Eigen/src/ThreadPool/ForkJoin.h
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Weiwei Kong <weiweikong@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_THREADPOOL_FORKJOIN_H
+#define EIGEN_THREADPOOL_FORKJOIN_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// ForkJoinScheduler provides implementations of various non-blocking ParallelFor algorithms for unary
+// and binary parallel tasks. More specifically, the implementations follow the binary tree-based
+// algorithm from the following paper:
+//
+//   Lea, D. (2000, June). A java fork/join framework. *In Proceedings of the
+//   ACM 2000 conference on Java Grande* (pp. 36-43).
+//
+// For a given binary task function `f(i,j)` and integers `num_threads`, `granularity`, `start`, and `end`,
+// the implemented parallel for algorithm schedules and executes at most `num_threads` of the functions
+// from the following set in parallel (either synchronously or asynchronously):
+//
+//   f(start,start+s_1), f(start+s_1,start+s_2), ..., f(start+s_n,end)
+//
+// where `s_{j+1} - s_{j}` and `end - s_n` are roughly within a factor of two of `granularity`. For a unary
+// task function `g(k)`, the same operation is applied with
+//
+//   f(i,j) = [&](){ for(Index k = i; k < j; ++k) g(k); };
+//
+// Note that the parameter `granularity` should be tuned by the user based on the trade-off of running the
+// given task function sequentially vs. scheduling individual tasks in parallel. An example of a partially
+// tuned `granularity` is in `Eigen::CoreThreadPoolDevice::parallelFor(...)` where the template
+// parameter `PacketSize` and float input `cost` are used to indirectly compute a granularity level for a
+// given task function.
+//
+// Example usage #1 (synchronous):
+// ```
+// ThreadPool thread_pool(num_threads);
+// ForkJoinScheduler::ParallelFor(0, num_tasks, granularity, std::move(parallel_task), &thread_pool);
+// ```
+//
+// Example usage #2 (executing multiple tasks asynchronously, each one parallelized with ParallelFor):
+// ```
+// ThreadPool thread_pool(num_threads);
+// Barrier barrier(num_async_calls);
+// auto done = [&](){ barrier.Notify(); };
+// for (Index k=0; k<num_async_calls; ++k) {
+//   ForkJoinScheduler::ParallelForAsync(task_start[k], task_end[k], granularity[k], parallel_task[k], done,
+//   &thread_pool);
+// }
+// barrier.Wait();
+// ```
+class ForkJoinScheduler {
+ public:
+  // Runs `do_func` asynchronously for the range [start, end) with a specified
+  // granularity. `do_func` should be of type `std::function<void(Index,
+  // Index)>`. `done()` is called exactly once after all tasks have been executed.
+  template <typename DoFnType, typename DoneFnType, typename ThreadPoolEnv>
+  static void ParallelForAsync(Index start, Index end, Index granularity, DoFnType&& do_func, DoneFnType&& done,
+                               ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    if (start >= end) {
+      done();
+      return;
+    }
+    thread_pool->Schedule([start, end, granularity, thread_pool, do_func = std::forward<DoFnType>(do_func),
+                           done = std::forward<DoneFnType>(done)]() {
+      RunParallelFor(start, end, granularity, do_func, thread_pool);
+      done();
+    });
+  }
+
+  // Synchronous variant of ParallelForAsync.
+  // WARNING: Making nested calls to `ParallelFor`, e.g., calling `ParallelFor` inside a task passed into another
+  // `ParallelFor` call, may lead to deadlocks due to how task stealing is implemented.
+  template <typename DoFnType, typename ThreadPoolEnv>
+  static void ParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func,
+                          ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    if (start >= end) return;
+    Barrier barrier(1);
+    auto done = [&barrier]() { barrier.Notify(); };
+    ParallelForAsync(start, end, granularity, do_func, done, thread_pool);
+    barrier.Wait();
+  }
+
+ private:
+  // Schedules `right_thunk`, runs `left_thunk`, and runs other tasks until `right_thunk` has finished.
+  template <typename LeftType, typename RightType, typename ThreadPoolEnv>
+  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    typedef typename ThreadPoolTempl<ThreadPoolEnv>::Task Task;
+    std::atomic<bool> right_done(false);
+    auto execute_right = [&right_thunk, &right_done]() {
+      std::forward<RightType>(right_thunk)();
+      right_done.store(true, std::memory_order_release);
+    };
+    thread_pool->Schedule(execute_right);
+    std::forward<LeftType>(left_thunk)();
+    Task task;
+    while (!right_done.load(std::memory_order_acquire)) {
+      thread_pool->MaybeGetTask(&task);
+      if (task.f) task.f();
+    }
+  }
+
+  static Index ComputeMidpoint(Index start, Index end, Index granularity) {
+    // Typical workloads choose initial values of `{start, end, granularity}` such that `end - start` and
+    // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
+    // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
+    // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
+    // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
+    const Index size = end - start;
+    const Index offset = numext::round_down(9 * (size + 1) / 16, granularity);
+    return start + offset;
+  }
+
+  template <typename DoFnType, typename ThreadPoolEnv>
+  static void RunParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func,
+                             ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    Index mid = ComputeMidpoint(start, end, granularity);
+    if ((end - start) < granularity || mid == start || mid == end) {
+      do_func(start, end);
+      return;
+    }
+    ForkJoin([start, mid, granularity, &do_func,
+              thread_pool]() { RunParallelFor(start, mid, granularity, do_func, thread_pool); },
+             [mid, end, granularity, &do_func, thread_pool]() {
+               RunParallelFor(mid, end, granularity, do_func, thread_pool);
+             },
+             thread_pool);
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_THREADPOOL_FORKJOIN_H
diff --git a/Eigen/src/ThreadPool/NonBlockingThreadPool.h b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
index efa6ef5..44d4b24 100644
--- a/Eigen/src/ThreadPool/NonBlockingThreadPool.h
+++ b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@@ -18,21 +18,41 @@ namespace Eigen {
 template <typename Environment>
 class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
  public:
+  typedef typename Environment::EnvThread Thread;
   typedef typename Environment::Task Task;
   typedef RunQueue<Task, 1024> Queue;
 
+  struct PerThread {
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
+    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;          // Random generator state.
+    int thread_id;          // Worker thread index in pool.
+  };
+
+  struct ThreadData {
+    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
+    std::unique_ptr<Thread> thread;
+    std::atomic<unsigned> steal_partition;
+    Queue queue;
+  };
+
   ThreadPoolTempl(int num_threads, Environment env = Environment()) : ThreadPoolTempl(num_threads, true, env) {}
 
   ThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment())
       : env_(env),
         num_threads_(num_threads),
         allow_spinning_(allow_spinning),
+        spin_count_(
+            // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is proportional to num_threads_ and
+            // we assume that new work is scheduled at a constant rate, so we divide `kSpinCount` by number of
+            // threads and number of spinning threads. The constant was picked based on a fair dice roll, tune it.
+            allow_spinning && num_threads > 0 ? kSpinCount / kMaxSpinningThreads / num_threads : 0),
         thread_data_(num_threads),
         all_coprimes_(num_threads),
         waiters_(num_threads),
         global_steal_partition_(EncodePartition(0, num_threads_)),
+        spinning_state_(0),
         blocked_(0),
-        spinning_(0),
         done_(false),
         cancelled_(false),
         ec_(waiters_) {
@@ -125,12 +145,51 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // this. We expect that such scenario is prevented by program, that is,
     // this is kept alive while any threads can potentially be in Schedule.
     if (!t.f) {
-      ec_.Notify(false);
+      if (IsNotifyParkedThreadRequired()) {
+        ec_.Notify(false);
+      }
     } else {
       env_.ExecuteTask(t);  // Push failed, execute directly.
     }
   }
 
+  // Tries to get a task for the calling thread; `*t` is only written when called from a worker of this pool.
+  void MaybeGetTask(Task* t) {
+    PerThread* pt = GetPerThread();
+    const int thread_id = pt->thread_id;
+    // If we are not a worker thread of this pool, we can't get any work.
+    if (pt->pool != this || thread_id < 0) return;
+    Queue& q = thread_data_[thread_id].queue;
+    *t = q.PopFront();
+    if (t->f) return;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
+      // victim queues it might reverse the order in which ops are executed
+      // compared to the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads single thread pools
+      // tend to be used for.
+      for (int i = 0; i < spin_count_ && !t->f; ++i) *t = q.PopFront();
+    } else {
+      if (EIGEN_PREDICT_FALSE(!t->f)) *t = LocalSteal();
+      if (EIGEN_PREDICT_FALSE(!t->f)) *t = GlobalSteal();
+      if (EIGEN_PREDICT_FALSE(!t->f)) {
+        if (allow_spinning_ && StartSpinning()) {
+          for (int i = 0; i < spin_count_ && !t->f; ++i) *t = GlobalSteal();
+          // Notify `spinning_state_` that we are no longer spinning.
+          bool has_no_notify_task = StopSpinning();
+          // If a task was submitted to the queue without a call to
+          // `ec_.Notify()` (if `IsNotifyParkedThreadRequired()` returned
+          // false), and we didn't steal anything above, we must try to
+          // steal one more time, to make sure that this task will be
+          // executed. We will not necessarily find it, because it might
+          // have been already stolen by some other thread.
+          if (has_no_notify_task && !t->f) *t = GlobalSteal();
+        }
+      }
+    }
+  }
+
   void Cancel() EIGEN_OVERRIDE {
     cancelled_ = true;
     done_ = true;
@@ -165,8 +224,8 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
   // Exposed publicly as static functions so that external callers can reuse
   // this encode/decode logic for maintaining their own thread-safe copies of
   // scheduling and steal domain(s).
-  static const int kMaxPartitionBits = 16;
-  static const int kMaxThreads = 1 << kMaxPartitionBits;
+  static constexpr int kMaxPartitionBits = 16;
+  static constexpr int kMaxThreads = 1 << kMaxPartitionBits;
 
   inline unsigned EncodePartition(unsigned start, unsigned limit) { return (start << kMaxPartitionBits) | limit; }
 
@@ -204,35 +263,68 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     }
   }
 
-  typedef typename Environment::EnvThread Thread;
-
-  struct PerThread {
-    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
-    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
-    uint64_t rand;          // Random generator state.
-    int thread_id;          // Worker thread index in pool.
-#ifndef EIGEN_THREAD_LOCAL
-    // Prevent false sharing.
-    char pad_[128];
-#endif
-  };
+  // Maximum number of threads that can spin in steal loop.
+  static constexpr int kMaxSpinningThreads = 1;
+
+  // The number of steal loop spin iterations before parking (this number is
+  // divided by the number of threads, to get spin count for each thread).
+  static constexpr int kSpinCount = 5000;
+
+  // If there are enough active threads with empty pending-task queues, a thread
+  // that runs out of work can just be parked without spinning, because these
+  // active threads will go into a steal loop after finishing their current
+  // tasks.
+  //
+  // In the worst case, when all active threads are executing long/expensive
+  // tasks, the next Schedule() will have to wait until one of the parked
+  // threads is unparked; however, this should be very rare in practice.
+  static constexpr int kMinActiveThreadsToStartSpinning = 4;
+
+  struct SpinningState {
+    // Spinning state layout:
+    //
+    // - Low 32 bits encode the number of threads that are spinning in steal
+    //   loop.
+    //
+    // - High 32 bits encode the number of tasks that were submitted to the pool
+    //   without a call to `ec_.Notify()`. This number can't be larger than
+    //   the number of spinning threads. Each spinning thread, when it exits the
+    //   spin loop, must check if this number is greater than zero and, if so,
+    //   decrement it by one and maybe make another attempt to steal a task.
+    static constexpr uint64_t kNumSpinningMask = 0x00000000FFFFFFFF;
+    static constexpr uint64_t kNumNoNotifyMask = 0xFFFFFFFF00000000;
+    static constexpr uint64_t kNumNoNotifyShift = 32;
+
+    uint64_t num_spinning;         // number of spinning threads
+    uint64_t num_no_notification;  // number of tasks submitted without
+                                   // notifying waiting threads
+
+    // Decodes `spinning_state_` value.
+    static SpinningState Decode(uint64_t state) {
+      uint64_t num_spinning = (state & kNumSpinningMask);
+      uint64_t num_no_notification = (state & kNumNoNotifyMask) >> kNumNoNotifyShift;
+
+      eigen_plain_assert(num_no_notification <= num_spinning);
+      return {num_spinning, num_no_notification};
+    }
 
-  struct ThreadData {
-    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
-    std::unique_ptr<Thread> thread;
-    std::atomic<unsigned> steal_partition;
-    Queue queue;
+    // Encodes as `spinning_state_` value.
+    uint64_t Encode() const {
+      eigen_plain_assert(num_no_notification <= num_spinning);
+      return (num_no_notification << kNumNoNotifyShift) | num_spinning;
+    }
   };
 
   Environment env_;
   const int num_threads_;
   const bool allow_spinning_;
+  const int spin_count_;
   MaxSizeVector<ThreadData> thread_data_;
   MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
   MaxSizeVector<EventCount::Waiter> waiters_;
   unsigned global_steal_partition_;
+  std::atomic<uint64_t> spinning_state_;
   std::atomic<unsigned> blocked_;
-  std::atomic<bool> spinning_;
   std::atomic<bool> done_;
   std::atomic<bool> cancelled_;
   EventCount ec_;
@@ -242,6 +334,9 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
   std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
 #endif
 
+  unsigned NumBlockedThreads() const { return blocked_.load(); }
+  unsigned NumActiveThreads() const { return num_threads_ - blocked_.load(); }
+
   // Main worker thread loop.
   void WorkerLoop(int thread_id) {
 #ifndef EIGEN_THREAD_LOCAL
@@ -258,67 +353,16 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     pt->pool = this;
     pt->rand = GlobalThreadIdHash();
     pt->thread_id = thread_id;
-    Queue& q = thread_data_[thread_id].queue;
-    EventCount::Waiter* waiter = &waiters_[thread_id];
-    // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
-    // proportional to num_threads_ and we assume that new work is scheduled at
-    // a constant rate, so we set spin_count to 5000 / num_threads_. The
-    // constant was picked based on a fair dice roll, tune it.
-    const int spin_count = allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
-    if (num_threads_ == 1) {
-      // For num_threads_ == 1 there is no point in going through the expensive
-      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
-      // victim queues it might reverse the order in which ops are executed
-      // compared to the order in which they are scheduled, which tends to be
-      // counter-productive for the types of I/O workloads the single thread
-      // pools tend to be used for.
-      while (!cancelled_) {
-        Task t = q.PopFront();
-        for (int i = 0; i < spin_count && !t.f; i++) {
-          if (!cancelled_.load(std::memory_order_relaxed)) {
-            t = q.PopFront();
-          }
-        }
-        if (!t.f) {
-          if (!WaitForWork(waiter, &t)) {
-            return;
-          }
-        }
-        if (t.f) {
-          env_.ExecuteTask(t);
-        }
-      }
-    } else {
-      while (!cancelled_) {
-        Task t = q.PopFront();
-        if (!t.f) {
-          t = LocalSteal();
-          if (!t.f) {
-            t = GlobalSteal();
-            if (!t.f) {
-              // Leave one thread spinning. This reduces latency.
-              if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
-                for (int i = 0; i < spin_count && !t.f; i++) {
-                  if (!cancelled_.load(std::memory_order_relaxed)) {
-                    t = GlobalSteal();
-                  } else {
-                    return;
-                  }
-                }
-                spinning_ = false;
-              }
-              if (!t.f) {
-                if (!WaitForWork(waiter, &t)) {
-                  return;
-                }
-              }
-            }
-          }
-        }
-        if (t.f) {
-          env_.ExecuteTask(t);
-        }
+    Task t;
+    while (!cancelled_.load(std::memory_order_relaxed)) {
+      MaybeGetTask(&t);
+      // If we still don't have a task, wait for one. Return if thread pool is
+      // in cancelled state.
+      if (EIGEN_PREDICT_FALSE(!t.f)) {
+        EventCount::Waiter* waiter = &waiters_[pt->thread_id];
+        if (!WaitForWork(waiter, &t)) return;
       }
+      if (EIGEN_PREDICT_TRUE(t.f)) env_.ExecuteTask(t);
     }
   }
 
@@ -343,7 +387,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
       }
       victim += inc;
       if (victim >= size) {
-        victim -= size;
+        victim -= static_cast<unsigned int>(size);
       }
     }
     return Task();
@@ -431,12 +475,82 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
       }
       victim += inc;
       if (victim >= size) {
-        victim -= size;
+        victim -= static_cast<unsigned int>(size);
       }
     }
     return -1;
   }
 
+  // StartSpinning() checks if the number of threads in the spin loop is less
+  // than the allowed maximum. If so, increments the number of spinning threads
+  // by one and returns true (caller must enter the spin loop). Otherwise
+  // returns false, and the caller must not enter the spin loop.
+  bool StartSpinning() {
+    if (NumActiveThreads() > kMinActiveThreadsToStartSpinning) return false;
+
+    uint64_t spinning = spinning_state_.load(std::memory_order_relaxed);
+    for (;;) {
+      SpinningState state = SpinningState::Decode(spinning);
+
+      if ((state.num_spinning - state.num_no_notification) >= kMaxSpinningThreads) {
+        return false;
+      }
+
+      // Increment the number of spinning threads.
+      ++state.num_spinning;
+
+      if (spinning_state_.compare_exchange_weak(spinning, state.Encode(), std::memory_order_relaxed)) {
+        return true;
+      }
+    }
+  }
+
+  // StopSpinning() decrements the number of spinning threads by one. It also
+  // checks if there were any tasks submitted into the pool without notifying
+  // parked threads, and decrements the count by one. Returns true if the number
+  // of tasks submitted without notification was decremented. In this case,
+  // the caller thread might have to call GlobalSteal() one more time.
+  bool StopSpinning() {
+    uint64_t spinning = spinning_state_.load(std::memory_order_relaxed);
+    for (;;) {
+      SpinningState state = SpinningState::Decode(spinning);
+
+      // Decrement the number of spinning threads.
+      --state.num_spinning;
+
+      // Maybe decrement the number of tasks submitted without notification.
+      bool has_no_notify_task = state.num_no_notification > 0;
+      if (has_no_notify_task) --state.num_no_notification;
+
+      if (spinning_state_.compare_exchange_weak(spinning, state.Encode(), std::memory_order_relaxed)) {
+        return has_no_notify_task;
+      }
+    }
+  }
+
+  // IsNotifyParkedThreadRequired() returns true if a parked thread must be
+  // notified about a newly added task. If there are threads spinning in the
+  // steal loop, there is no need to unpark any of the waiting threads: the task
+  // will be picked up by one of the spinning threads.
+  bool IsNotifyParkedThreadRequired() {
+    uint64_t spinning = spinning_state_.load(std::memory_order_relaxed);
+    for (;;) {
+      SpinningState state = SpinningState::Decode(spinning);
+
+      // If the number of tasks submitted without notifying parked threads is
+      // equal to the number of spinning threads, we must wake up one of the
+      // parked threads.
+      if (state.num_no_notification == state.num_spinning) return true;
+
+      // Increment the number of tasks submitted without notification.
+      ++state.num_no_notification;
+
+      if (spinning_state_.compare_exchange_weak(spinning, state.Encode(), std::memory_order_relaxed)) {
+        return false;
+      }
+    }
+  }
+
   static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
     return std::hash<std::thread::id>()(std::this_thread::get_id());
   }
diff --git a/Eigen/src/ThreadPool/RunQueue.h b/Eigen/src/ThreadPool/RunQueue.h
index 9f40e9d..9046b18 100644
--- a/Eigen/src/ThreadPool/RunQueue.h
+++ b/Eigen/src/ThreadPool/RunQueue.h
@@ -154,16 +154,18 @@ class RunQueue {
  private:
   static const unsigned kMask = kSize - 1;
   static const unsigned kMask2 = (kSize << 1) - 1;
-  struct Elem {
-    std::atomic<uint8_t> state;
-    Work w;
-  };
-  enum {
+
+  enum State {
     kEmpty,
     kBusy,
     kReady,
   };
-  EIGEN_MUTEX mutex_;
+
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+
   // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
   // front/back, respectively. The remaining bits contain modification counters
   // that are incremented on Push operations. This allows us to (1) distinguish
@@ -171,9 +173,11 @@ class RunQueue {
   // position, these conditions would be indistinguishable); (2) obtain
   // consistent snapshot of front_/back_ for Size operation using the
   // modification counters.
-  std::atomic<unsigned> front_;
-  std::atomic<unsigned> back_;
-  Elem array_[kSize];
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
+  EIGEN_MUTEX mutex_;  // guards `PushBack` and `PopBack` (accesses `back_`)
+
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
 
   // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
   // only whether the size is 0 is guaranteed to be correct.
@@ -208,12 +212,12 @@ class RunQueue {
   EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
     int size = (front & kMask2) - (back & kMask2);
     // Fix overflow.
-    if (size < 0) size += 2 * kSize;
+    if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
     // Order of modification in push/pop is crafted to make the queue look
     // larger than it is during concurrent modifications. E.g. push can
     // increment size before the corresponding pop has decremented it.
     // So the computed size can be up to kSize + 1, fix it.
-    if (size > static_cast<int>(kSize)) size = kSize;
+    if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
     return static_cast<unsigned>(size);
   }
 
diff --git a/Eigen/src/ThreadPool/ThreadLocal.h b/Eigen/src/ThreadPool/ThreadLocal.h
index 71df401..aa0bd10 100644
--- a/Eigen/src/ThreadPool/ThreadLocal.h
+++ b/Eigen/src/ThreadPool/ThreadLocal.h
@@ -43,12 +43,7 @@
 // This is primarily because of linker problems and toolchain misconfiguration:
 // TLS isn't supported until NDK r12b per
 // https://developer.android.com/ndk/downloads/revision_history.html
-// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
-// <android/ndk-version.h>. For NDK < r16, users should define these macros,
-// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
-#if __has_include(<android/ndk-version.h>)
-#include <android/ndk-version.h>
-#endif  // __has_include(<android/ndk-version.h>)
+
 #if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && defined(__NDK_MINOR__) && \
     ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
 #undef EIGEN_THREAD_LOCAL
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 3fdcc1f..1df8493 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -381,7 +381,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<MatrixType_> > {
     factorize_impl();
   }
 
-  /** Performs a symbolic decomposition on the sparcity of \a matrix.
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
    *
    * This function is particularly useful when solving for several problems having the same structure.
    *
@@ -425,7 +425,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<MatrixType_> > {
 
   /** Performs a numeric decomposition of \a matrix
    *
-   * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
+   * The given matrix must have the same sparsity as the matrix on which the pattern analysis has been performed.
    *
    * \sa analyzePattern(), compute()
    */
diff --git a/Eigen/src/misc/lapacke_helpers.h b/Eigen/src/misc/lapacke_helpers.h
index 5a2f38f..ff98639 100644
--- a/Eigen/src/misc/lapacke_helpers.h
+++ b/Eigen/src/misc/lapacke_helpers.h
@@ -75,7 +75,7 @@ EIGEN_ALWAYS_INLINE lapack_int to_lapack(Index index) { return convert_index<lap
 
 /// translates storage order of the given Eigen object to the corresponding lapack constant
 template <typename Derived>
-EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR lapack_int lapack_storage_of(const EigenBase<Derived> &) {
+EIGEN_ALWAYS_INLINE constexpr lapack_int lapack_storage_of(const EigenBase<Derived> &) {
   return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
 }
 
diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.inc b/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
index 10c7a3e..c8c2434 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
@@ -96,9 +96,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
 }
 
 /** \returns an expression of the coefficient-wise absdiff of \c *this and \a other
- *
- * Example: \include Cwise_absolute_difference.cpp
- * Output: \verbinclude Cwise_absolute_difference.out
  *
  * \sa absolute_difference()
  */
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
index d03edc2..753aeb4 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
@@ -1,5 +1,3 @@
-
-
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> AbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> ArgReturnType;
 typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived> CArgReturnType;
@@ -13,6 +11,7 @@ typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> Boo
 typedef CwiseUnaryOp<internal::scalar_bitwise_not_op<Scalar>, const Derived> BitwiseNotReturnType;
 
 typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
+typedef CwiseUnaryOp<internal::scalar_exp2_op<Scalar>, const Derived> Exp2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> Expm1ReturnType;
 typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
 typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
@@ -37,9 +36,11 @@ typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundRetu
 typedef CwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived> RintReturnType;
 typedef CwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> FloorReturnType;
 typedef CwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> CeilReturnType;
+typedef CwiseUnaryOp<internal::scalar_trunc_op<Scalar>, const Derived> TruncReturnType;
 typedef CwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> IsNaNReturnType;
 typedef CwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived> IsInfReturnType;
 typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> IsFiniteReturnType;
+typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar, true>, const Derived> IsFiniteTypedReturnType;
 
 /** \returns an expression of the coefficient-wise absolute value of \c *this
  *
@@ -78,10 +79,20 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Abs2ReturnType abs2() const { return
  * Example: \include Cwise_exp.cpp
  * Output: \verbinclude Cwise_exp.out
  *
- * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, pow(), log(), sin(), cos()
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, exp2(), pow(), log(), sin(),
+ * cos()
  */
 EIGEN_DEVICE_FUNC inline const ExpReturnType exp() const { return ExpReturnType(derived()); }
 
+/** \returns an expression of the coefficient-wise base-2 exponential of *this.
+ *
+ * This function computes the coefficient-wise base-2 exponential, i.e. 2^x.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, exp(), pow(), log(), sin(),
+ * cos()
+ */
+EIGEN_DEVICE_FUNC inline const Exp2ReturnType exp2() const { return Exp2ReturnType(derived()); }
+
 /** \returns an expression of the coefficient-wise exponential of *this minus 1.
  *
  * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1,
@@ -347,6 +358,15 @@ EIGEN_DEVICE_FUNC inline const FloorReturnType floor() const { return FloorRetur
  */
 EIGEN_DEVICE_FUNC inline const CeilReturnType ceil() const { return CeilReturnType(derived()); }
 
+/** \returns an expression of the coefficient-wise truncation of *this.
+ *
+ * Example: \include Cwise_trunc.cpp
+ * Output: \verbinclude Cwise_trunc.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_trunc">Math functions</a>, floor(), round()
+ */
+EIGEN_DEVICE_FUNC inline const TruncReturnType trunc() const { return TruncReturnType(derived()); }
+
 template <int N>
 struct ShiftRightXpr {
   typedef CwiseUnaryOp<internal::scalar_shift_right_op<Scalar, N>, const Derived> Type;
@@ -407,6 +427,9 @@ EIGEN_DEVICE_FUNC inline const IsInfReturnType isInf() const { return IsInfRetur
  * \sa isnan(), isinf()
  */
 EIGEN_DEVICE_FUNC inline const IsFiniteReturnType isFinite() const { return IsFiniteReturnType(derived()); }
+EIGEN_DEVICE_FUNC inline const IsFiniteTypedReturnType isFiniteTyped() const {
+  return IsFiniteTypedReturnType(derived());
+}
 
 /** \returns an expression of the coefficient-wise ! operator of *this
  *
@@ -500,15 +523,11 @@ using UnaryPowReturnType =
     std::enable_if_t<internal::is_arithmetic<typename NumTraits<ScalarExponent>::Real>::value,
                      CwiseUnaryOp<internal::scalar_unary_pow_op<Scalar, ScalarExponent>, const Derived>>;
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-template <typename ScalarExponent>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> pow(
-    const ScalarExponent& exponent) const {
-  return UnaryPowReturnType<ScalarExponent>(derived(), internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
-#else
-/** \returns an expression of the coefficients of \c *this rasied to the constant power \a exponent
+/** \returns an expression of the coefficients of \c *this raised to the constant power \a exponent
  *
- * \tparam T is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression.
+ * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type
+ *         of the given expression.
+ * \param exponent the scalar exponent value.
  *
  * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
  * unsupported module MatrixFunctions computes the matrix power.
@@ -520,6 +539,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> p
  */
 template <typename ScalarExponent>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> pow(
-    const ScalarExponent& exponent) const;
-#endif
+    const ScalarExponent& exponent) const {
+  return UnaryPowReturnType<ScalarExponent>(derived(), internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
 }
diff --git a/Eigen/src/plugins/BlockMethods.inc b/Eigen/src/plugins/BlockMethods.inc
index 122a2f4..0782aa3 100644
--- a/Eigen/src/plugins/BlockMethods.inc
+++ b/Eigen/src/plugins/BlockMethods.inc
@@ -1304,14 +1304,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::T
 template <int N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type tail(Index n = N) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
+  return typename FixedSegmentReturnType<N>::Type(derived(), size() - n, n);
 }
 
 /// This is the const version of tail<int>.
 template <int N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n, n);
 }
 
 /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
@@ -1365,6 +1365,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::conditional_t<Direction == Vertical,
  * \sa subVector(Index)
  */
 template <DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index subVectors() const {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index subVectors() const {
   return (Direction == Vertical) ? cols() : rows();
 }
diff --git a/Eigen/src/plugins/CommonCwiseBinaryOps.inc b/Eigen/src/plugins/CommonCwiseBinaryOps.inc
index 95f338a..f1ba301 100644
--- a/Eigen/src/plugins/CommonCwiseBinaryOps.inc
+++ b/Eigen/src/plugins/CommonCwiseBinaryOps.inc
@@ -43,34 +43,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const
   return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
 }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-EIGEN_MAKE_SCALAR_BINARY_OP(operator*, product)
-#else
 /** \returns an expression of \c *this scaled by the scalar factor \a scalar
  *
  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
  */
-template <typename T>
-const CwiseBinaryOp<internal::scalar_product_op<Scalar, T>, Derived, Constant<T> > operator*(const T& scalar) const;
-/** \returns an expression of \a expr scaled by the scalar factor \a scalar
- *
- * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
- */
-template <typename T>
-friend const CwiseBinaryOp<internal::scalar_product_op<T, Scalar>, Constant<T>, Derived> operator*(
-    const T& scalar, const StorageBaseType& expr);
-#endif
+EIGEN_MAKE_SCALAR_BINARY_OP(operator*, product)
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/, quotient)
-#else
 /** \returns an expression of \c *this divided by the scalar value \a scalar
  *
  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
  */
-template <typename T>
-const CwiseBinaryOp<internal::scalar_quotient_op<Scalar, T>, Derived, Constant<T> > operator/(const T& scalar) const;
-#endif
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/, quotient)
 
 /** \returns an expression of the coefficient-wise boolean \b and operator of \c *this and \a other
  *
diff --git a/Eigen/src/plugins/IndexedViewMethods.inc b/Eigen/src/plugins/IndexedViewMethods.inc
index 26e7b5f..a51e349 100644
--- a/Eigen/src/plugins/IndexedViewMethods.inc
+++ b/Eigen/src/plugins/IndexedViewMethods.inc
@@ -9,172 +9,7 @@
 
 #if !defined(EIGEN_PARSED_BY_DOXYGEN)
 
-protected:
-// define some aliases to ease readability
-
-template <typename Indices>
-using IvcRowType = typename internal::IndexedViewCompatibleType<Indices, RowsAtCompileTime>::type;
-
-template <typename Indices>
-using IvcColType = typename internal::IndexedViewCompatibleType<Indices, ColsAtCompileTime>::type;
-
-template <typename Indices>
-using IvcType = typename internal::IndexedViewCompatibleType<Indices, SizeAtCompileTime>::type;
-
-typedef typename internal::IndexedViewCompatibleType<Index, 1>::type IvcIndex;
-
-template <typename Indices>
-inline IvcRowType<Indices> ivcRow(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(
-      indices, internal::variable_if_dynamic<Index, RowsAtCompileTime>(derived().rows()), Specialized);
-}
-
-template <typename Indices>
-inline IvcColType<Indices> ivcCol(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(
-      indices, internal::variable_if_dynamic<Index, ColsAtCompileTime>(derived().cols()), Specialized);
-}
-
-template <typename Indices>
-inline IvcType<Indices> ivcSize(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(
-      indices, internal::variable_if_dynamic<Index, SizeAtCompileTime>(derived().size()), Specialized);
-}
-
-// this helper class assumes internal::valid_indexed_view_overload<RowIndices, ColIndices>::value == true
-template <typename RowIndices, typename ColIndices,
-          bool UseSymbolic =
-              internal::traits<IndexedView<Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>>::ReturnAsScalar,
-          bool UseBlock =
-              internal::traits<IndexedView<Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>>::ReturnAsBlock,
-          bool UseGeneric = internal::traits<
-              IndexedView<Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>>::ReturnAsIndexedView>
-struct IndexedViewSelector;
-
-// Generic
-template <typename RowIndices, typename ColIndices>
-struct IndexedViewSelector<RowIndices, ColIndices, false, false, true> {
-  using ReturnType = IndexedView<Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>;
-  using ConstReturnType = IndexedView<const Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>;
-
-  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
-    return ReturnType(derived, derived.ivcRow(rowIndices), derived.ivcCol(colIndices));
-  }
-  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
-                                    const ColIndices& colIndices) {
-    return ConstReturnType(derived, derived.ivcRow(rowIndices), derived.ivcCol(colIndices));
-  }
-};
-
-// Block
-template <typename RowIndices, typename ColIndices>
-struct IndexedViewSelector<RowIndices, ColIndices, false, true, false> {
-  using IndexedViewType = IndexedView<Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>;
-  using ConstIndexedViewType = IndexedView<const Derived, IvcRowType<RowIndices>, IvcColType<ColIndices>>;
-  using ReturnType = typename internal::traits<IndexedViewType>::BlockType;
-  using ConstReturnType = typename internal::traits<ConstIndexedViewType>::BlockType;
-
-  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
-    IvcRowType<RowIndices> actualRowIndices = derived.ivcRow(rowIndices);
-    IvcColType<ColIndices> actualColIndices = derived.ivcCol(colIndices);
-    return ReturnType(derived, internal::first(actualRowIndices), internal::first(actualColIndices),
-                      internal::index_list_size(actualRowIndices), internal::index_list_size(actualColIndices));
-  }
-  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
-                                    const ColIndices& colIndices) {
-    IvcRowType<RowIndices> actualRowIndices = derived.ivcRow(rowIndices);
-    IvcColType<ColIndices> actualColIndices = derived.ivcCol(colIndices);
-    return ConstReturnType(derived, internal::first(actualRowIndices), internal::first(actualColIndices),
-                           internal::index_list_size(actualRowIndices), internal::index_list_size(actualColIndices));
-  }
-};
-
-// Symbolic
-template <typename RowIndices, typename ColIndices>
-struct IndexedViewSelector<RowIndices, ColIndices, true, false, false> {
-  using ReturnType = typename DenseBase<Derived>::Scalar&;
-  using ConstReturnType = typename DenseBase<Derived>::CoeffReturnType;
-
-  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
-    return derived(internal::eval_expr_given_size(rowIndices, derived.rows()),
-                   internal::eval_expr_given_size(colIndices, derived.cols()));
-  }
-  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
-                                    const ColIndices& colIndices) {
-    return derived(internal::eval_expr_given_size(rowIndices, derived.rows()),
-                   internal::eval_expr_given_size(colIndices, derived.cols()));
-  }
-};
-
-// this helper class assumes internal::is_valid_index_type<Indices>::value == false
-template <typename Indices, bool UseSymbolic = symbolic::is_symbolic<Indices>::value,
-          bool UseBlock = !UseSymbolic && internal::get_compile_time_incr<IvcType<Indices>>::value == 1,
-          bool UseGeneric = !UseSymbolic && !UseBlock>
-struct VectorIndexedViewSelector;
-
-// Generic
-template <typename Indices>
-struct VectorIndexedViewSelector<Indices, false, false, true> {
-  static constexpr bool IsRowMajor = DenseBase<Derived>::IsRowMajor;
-
-  using RowMajorReturnType = IndexedView<Derived, IvcIndex, IvcType<Indices>>;
-  using ConstRowMajorReturnType = IndexedView<const Derived, IvcIndex, IvcType<Indices>>;
-
-  using ColMajorReturnType = IndexedView<Derived, IvcType<Indices>, IvcIndex>;
-  using ConstColMajorReturnType = IndexedView<const Derived, IvcType<Indices>, IvcIndex>;
-
-  using ReturnType = typename internal::conditional<IsRowMajor, RowMajorReturnType, ColMajorReturnType>::type;
-  using ConstReturnType =
-      typename internal::conditional<IsRowMajor, ConstRowMajorReturnType, ConstColMajorReturnType>::type;
-
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
-  static inline RowMajorReturnType run(Derived& derived, const Indices& indices) {
-    return RowMajorReturnType(derived, IvcIndex(0), derived.ivcCol(indices));
-  }
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
-  static inline ConstRowMajorReturnType run(const Derived& derived, const Indices& indices) {
-    return ConstRowMajorReturnType(derived, IvcIndex(0), derived.ivcCol(indices));
-  }
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
-  static inline ColMajorReturnType run(Derived& derived, const Indices& indices) {
-    return ColMajorReturnType(derived, derived.ivcRow(indices), IvcIndex(0));
-  }
-  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
-  static inline ConstColMajorReturnType run(const Derived& derived, const Indices& indices) {
-    return ConstColMajorReturnType(derived, derived.ivcRow(indices), IvcIndex(0));
-  }
-};
-
-// Block
-template <typename Indices>
-struct VectorIndexedViewSelector<Indices, false, true, false> {
-  using ReturnType = VectorBlock<Derived, internal::array_size<Indices>::value>;
-  using ConstReturnType = VectorBlock<const Derived, internal::array_size<Indices>::value>;
-
-  static inline ReturnType run(Derived& derived, const Indices& indices) {
-    IvcType<Indices> actualIndices = derived.ivcSize(indices);
-    return ReturnType(derived, internal::first(actualIndices), internal::index_list_size(actualIndices));
-  }
-  static inline ConstReturnType run(const Derived& derived, const Indices& indices) {
-    IvcType<Indices> actualIndices = derived.ivcSize(indices);
-    return ConstReturnType(derived, internal::first(actualIndices), internal::index_list_size(actualIndices));
-  }
-};
-
-// Symbolic
-template <typename Indices>
-struct VectorIndexedViewSelector<Indices, true, false, false> {
-  using ReturnType = typename DenseBase<Derived>::Scalar&;
-  using ConstReturnType = typename DenseBase<Derived>::CoeffReturnType;
-
-  static inline ReturnType run(Derived& derived, const Indices& id) {
-    return derived(internal::eval_expr_given_size(id, derived.size()));
-  }
-  static inline ConstReturnType run(const Derived& derived, const Indices& id) {
-    return derived(internal::eval_expr_given_size(id, derived.size()));
-  }
-};
-
+public:
 // SFINAE dummy types
 
 template <typename RowIndices, typename ColIndices>
@@ -197,24 +32,26 @@ public:
 
 // non-const versions
 
-template <typename RowIndices, typename ColIndices>
-using IndexedViewType = typename IndexedViewSelector<RowIndices, ColIndices>::ReturnType;
+ template <typename RowIndices, typename ColIndices>
+ using IndexedViewType = typename internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::ReturnType;
 
-template <typename RowIndices, typename ColIndices, EnableOverload<RowIndices, ColIndices> = true>
-IndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices, const ColIndices& colIndices) {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), rowIndices, colIndices);
-}
+ template <typename RowIndices, typename ColIndices, EnableOverload<RowIndices, ColIndices> = true>
+ IndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices, const ColIndices& colIndices) {
+   return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices, colIndices);
+ }
 
 template <typename RowType, size_t RowSize, typename ColIndices, typename RowIndices = Array<RowType, RowSize, 1>,
           EnableOverload<RowIndices, ColIndices> = true>
 IndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize], const ColIndices& colIndices) {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices}, colIndices);
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             colIndices);
 }
 
 template <typename RowIndices, typename ColType, size_t ColSize, typename ColIndices = Array<ColType, ColSize, 1>,
           EnableOverload<RowIndices, ColIndices> = true>
 IndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices, const ColType (&colIndices)[ColSize]) {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), rowIndices, ColIndices{colIndices});
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices,
+                                                                             ColIndices{colIndices});
 }
 
 template <typename RowType, size_t RowSize, typename ColType, size_t ColSize,
@@ -222,32 +59,35 @@ template <typename RowType, size_t RowSize, typename ColType, size_t ColSize,
           EnableOverload<RowIndices, ColIndices> = true>
 IndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize],
                                                    const ColType (&colIndices)[ColSize]) {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices}, ColIndices{colIndices});
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             ColIndices{colIndices});
 }
 
 // const versions
 
 template <typename RowIndices, typename ColIndices>
-using ConstIndexedViewType = typename IndexedViewSelector<RowIndices, ColIndices>::ConstReturnType;
+using ConstIndexedViewType = typename internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::ConstReturnType;
 
 template <typename RowIndices, typename ColIndices, EnableConstOverload<RowIndices, ColIndices> = true>
 ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices,
                                                         const ColIndices& colIndices) const {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), rowIndices, colIndices);
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices, colIndices);
 }
 
 template <typename RowType, size_t RowSize, typename ColIndices, typename RowIndices = Array<RowType, RowSize, 1>,
           EnableConstOverload<RowIndices, ColIndices> = true>
 ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize],
                                                         const ColIndices& colIndices) const {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices}, colIndices);
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             colIndices);
 }
 
 template <typename RowIndices, typename ColType, size_t ColSize, typename ColIndices = Array<ColType, ColSize, 1>,
           EnableConstOverload<RowIndices, ColIndices> = true>
 ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices,
                                                         const ColType (&colIndices)[ColSize]) const {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), rowIndices, ColIndices{colIndices});
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices,
+                                                                             ColIndices{colIndices});
 }
 
 template <typename RowType, size_t RowSize, typename ColType, size_t ColSize,
@@ -255,7 +95,8 @@ template <typename RowType, size_t RowSize, typename ColType, size_t ColSize,
           EnableConstOverload<RowIndices, ColIndices> = true>
 ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize],
                                                         const ColType (&colIndices)[ColSize]) const {
-  return IndexedViewSelector<RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices}, ColIndices{colIndices});
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             ColIndices{colIndices});
 }
 
 // Public API for 1D vectors/arrays
@@ -263,37 +104,37 @@ ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndic
 // non-const versions
 
 template <typename Indices>
-using VectorIndexedViewType = typename VectorIndexedViewSelector<Indices>::ReturnType;
+using VectorIndexedViewType = typename internal::VectorIndexedViewSelector<Derived, Indices>::ReturnType;
 
 template <typename Indices, EnableVectorOverload<Indices> = true>
 VectorIndexedViewType<Indices> operator()(const Indices& indices) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorIndexedViewSelector<Indices>::run(derived(), indices);
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), indices);
 }
 
 template <typename IndexType, size_t Size, typename Indices = Array<IndexType, Size, 1>,
           EnableVectorOverload<Indices> = true>
 VectorIndexedViewType<Indices> operator()(const IndexType (&indices)[Size]) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorIndexedViewSelector<Indices>::run(derived(), Indices{indices});
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), Indices{indices});
 }
 
 // const versions
 
 template <typename Indices>
-using ConstVectorIndexedViewType = typename VectorIndexedViewSelector<Indices>::ConstReturnType;
+using ConstVectorIndexedViewType = typename internal::VectorIndexedViewSelector<Derived, Indices>::ConstReturnType;
 
 template <typename Indices, EnableConstVectorOverload<Indices> = true>
 ConstVectorIndexedViewType<Indices> operator()(const Indices& indices) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorIndexedViewSelector<Indices>::run(derived(), indices);
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), indices);
 }
 
 template <typename IndexType, size_t Size, typename Indices = Array<IndexType, Size, 1>,
           EnableConstVectorOverload<Indices> = true>
 ConstVectorIndexedViewType<Indices> operator()(const IndexType (&indices)[Size]) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorIndexedViewSelector<Indices>::run(derived(), Indices{indices});
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), Indices{indices});
 }
 
 #else  // EIGEN_PARSED_BY_DOXYGEN
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
index 325b0fb..ffaf5aa 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
@@ -60,9 +60,6 @@ EIGEN_DEVICE_FUNC inline const CwiseSqrtReturnType cwiseSqrt() const { return Cw
 
 /// \returns an expression of the coefficient-wise cube root of *this.
 ///
-/// Example: \include MatrixBase_cwiseCbrt.cpp
-/// Output: \verbinclude MatrixBase_cwiseCbrt.out
-///
 EIGEN_DOC_UNARY_ADDONS(cwiseCbrt, cube - root)
 ///
 /// \sa cwiseSqrt(), cwiseSquare(), cwisePow()
diff --git a/autodiff/common/eigen.hpp b/autodiff/common/eigen.hpp
index d89c06f..ff81192 100644
--- a/autodiff/common/eigen.hpp
+++ b/autodiff/common/eigen.hpp
@@ -99,27 +99,27 @@ struct VectorTraits<Eigen::VectorBlock<VectorType, Size>>
     using ReplaceValueType = VectorReplaceValueType<VectorType, NewValueType>;
 };
 
-#if EIGEN_VERSION_AT_LEAST(3, 3, 90)
+// #if EIGEN_VERSION_AT_LEAST(3, 3, 90)
 
-    template<typename VectorType, typename IndicesType>
-    struct VectorTraits<Eigen::IndexedView<VectorType, IndicesType, Eigen::internal::SingleRange>>
-    {
-        using ValueType = typename PlainType<VectorType>::Scalar;
+//     template<typename VectorType, typename IndicesType>
+//     struct VectorTraits<Eigen::IndexedView<VectorType, IndicesType, Eigen::internal::SingleRange>>
+//     {
+//         using ValueType = typename PlainType<VectorType>::Scalar;
 
-        template<typename NewValueType>
-        using ReplaceValueType = VectorReplaceValueType<VectorType, NewValueType>;
-    };
+//         template<typename NewValueType>
+//         using ReplaceValueType = VectorReplaceValueType<VectorType, NewValueType>;
+//     };
 
-    template<typename VectorType, typename IndicesType>
-    struct VectorTraits<Eigen::IndexedView<VectorType, Eigen::internal::SingleRange, IndicesType>>
-    {
-        using ValueType = typename PlainType<VectorType>::Scalar;
+//     template<typename VectorType, typename IndicesType>
+//     struct VectorTraits<Eigen::IndexedView<VectorType, Eigen::internal::SingleRange, IndicesType>>
+//     {
+//         using ValueType = typename PlainType<VectorType>::Scalar;
 
-        template<typename NewValueType>
-        using ReplaceValueType = VectorReplaceValueType<VectorType, NewValueType>;
-    };
+//         template<typename NewValueType>
+//         using ReplaceValueType = VectorReplaceValueType<VectorType, NewValueType>;
+//     };
 
-#endif
+// #endif
 
 template<typename MatrixType>
 struct VectorTraits<Eigen::Ref<MatrixType>>
diff --git a/autodiff/common/meta.hpp b/autodiff/common/meta.hpp
index 65da700..e310647 100644
--- a/autodiff/common/meta.hpp
+++ b/autodiff/common/meta.hpp
@@ -36,7 +36,8 @@
 
 #ifndef AUTODIFF_DEVICE_FUNC
 #ifdef AUTODIFF_EIGEN_FOUND
-    #include <Eigen/src/Core/util/Macros.h>
+    // #include <Eigen/src/Core/util/Macros.h>
+    #include <Eigen/Core>
     #define AUTODIFF_DEVICE_FUNC EIGEN_DEVICE_FUNC
 #else
     #define AUTODIFF_DEVICE_FUNC
diff --git a/dae-cpp/assert-custom.hpp b/dae-cpp/assert-custom.hpp
index a2c0814..043a1b2 100644
--- a/dae-cpp/assert-custom.hpp
+++ b/dae-cpp/assert-custom.hpp
@@ -14,6 +14,7 @@
 
 #include <iostream>
 
+// Macros are not scoped by namespaces; the namespace is kept here in case of future changes
 namespace daecpp_namespace_name
 {
 
@@ -39,7 +40,7 @@ namespace daecpp_namespace_name
  * Example:
  * WARNING("v = " << v);
  */
-#ifdef TESTING
+#ifdef DAECPP_TESTING
 #define WARNING(msg)
 #else
 #define WARNING(msg) \
@@ -54,7 +55,7 @@ namespace daecpp_namespace_name
  * Example:
  * NOTE("v = " << v);
  */
-#ifdef TESTING
+#ifdef DAECPP_TESTING
 #define NOTE(msg)
 #else
 #define NOTE(msg) \
@@ -69,7 +70,7 @@ namespace daecpp_namespace_name
  * Example:
  * NOTE(verbosity > 1, "v = " << v);
  */
-#ifdef TESTING
+#ifdef DAECPP_TESTING
 #define PRINT(condition, msg)
 #else
 #define PRINT(condition, msg)          \
diff --git a/dae-cpp/jacobian-matrix.hpp b/dae-cpp/jacobian-matrix.hpp
index 6d8dc8c..7597efc 100644
--- a/dae-cpp/jacobian-matrix.hpp
+++ b/dae-cpp/jacobian-matrix.hpp
@@ -14,9 +14,11 @@
 #ifndef DAECPP_JACOBIAN_MATRIX_H
 #define DAECPP_JACOBIAN_MATRIX_H
 
-#include <iomanip> // Formatted table printing
-#include <utility> // std::pair
+#include <iomanip>     // Formatted table printing
+#include <utility>     // std::pair
+#include <type_traits> // std::is_same_v
 
+#include "typedefs.hpp"
 #include "sparse-matrix.hpp"
 #include "vector-function.hpp"
 
@@ -54,6 +56,9 @@ class JacobianMatrixShape
     // Array of non-zero elements
     std::vector<std::pair<int_type, int_type>> m_Jn;
 
+    // Reusable buffer of `dual` numbers (avoids reallocating the vector on every Jacobian evaluation)
+    mutable state_type m_x_cache;
+
 public:
     explicit JacobianMatrixShape(RHS rhs) : m_rhs(rhs) {}
 
@@ -67,12 +72,16 @@ class JacobianMatrixShape
     {
         const int_type size = static_cast<int_type>(x.size()); // System size
 
-        state_type x_(size); // Vectors of `dual` numbers are defined with `_` suffix
+        // Resize cache if needed
+        if (m_x_cache.size() != size)
+        {
+            m_x_cache.resize(size);
+        }
 
-        // Conversion to dual numbers for automatic differentiation
+        // Conversion to `dual` numbers for automatic differentiation
         for (int_type k = 0; k < size; ++k)
         {
-            x_[k] = x[k];
+            m_x_cache[k] = x[k];
         }
 
         // Lambda-function with parameters for which the Jacobian is needed
@@ -85,9 +94,9 @@ class JacobianMatrixShape
         J.reserve(static_cast<int_type>(m_Jn.size()));
 
         // Automatic differentiation of each element marked as non-zero by the user
-        for (const std::pair<int_type, int_type> &Jn : m_Jn)
+        for (const auto &Jn : m_Jn)
         {
-            J(Jn.first, Jn.second, autodiff::derivative(f, wrt(x_[Jn.second]), at(x_, t, Jn.first)));
+            J(Jn.first, Jn.second, autodiff::derivative(f, wrt(m_x_cache[Jn.second]), at(m_x_cache, t, Jn.first)));
         }
     }
 
@@ -143,8 +152,13 @@ class JacobianAutomatic
      * Automatic (algorithmic) Jacobian.
      * Performs algorithmic differentiation of the RHS using `autodiff` package.
      */
-    void operator()(sparse_matrix &J, const state_vector &x, const double t)
+    template<typename MatrixType>
+    void operator()(MatrixType &J, const state_vector &x, const double t)
     {
+        static_assert (
+            std::is_same_v<MatrixType, sparse_matrix> || std::is_same_v<MatrixType, core::eimat>,
+            "void operator()(MatrixType &J, const state_vector &x, const double t): J must be daecpp::sparse_matrix or daecpp::core::eimat");
+
         const int_type size = static_cast<int_type>(x.size()); // System size
 
         state_type x_(size); // Vectors of `dual` numbers are defined with `_` suffix
@@ -166,19 +180,28 @@ class JacobianAutomatic
         // Dense Jacobian matrix generated by `autodiff`
         Eigen::MatrixXd jac = autodiff::jacobian(f, wrt(x_), at(x_, t));
 
-        // Convert dense matrix to sparse format
-        for (int_type j = 0; j < size; ++j)
+        if constexpr (std::is_same_v<MatrixType, sparse_matrix>)
         {
-            for (int_type i = 0; i < size; ++i)
+            // Convert dense matrix to daecpp::sparse_matrix format
+            for (int_type j = 0; j < size; ++j)
             {
-                const double val = jac(i, j);
-
-                if (std::abs(val) > DAECPP_SPARSE_MATRIX_ELEMENT_TOLERANCE)
+                for (int_type i = 0; i < size; ++i)
                 {
-                    J(i, j, val); // Jacobian
+                    const double val = jac(i, j);
+
+                    if (std::abs(val) > DAECPP_SPARSE_MATRIX_ELEMENT_TOLERANCE)
+                    {
+                        J(i, j, val); // Jacobian
+                    }
                 }
             }
         }
+        else
+        {
+            // Convert dense matrix to Eigen::SparseMatrix format directly
+            J = jac.sparseView();
+            // Only exact zeros are skipped here (default sparseView()); tolerance-based pruning happens in the solver if scaling is enabled
+        }
     }
 };
 
@@ -290,7 +313,7 @@ class JacobianCompare
     {
         if (m_is_first_call)
         {
-            NOTE("Comparing the Jacobian matrices...");
+            NOTE("Comparing Jacobian matrices...");
             m_is_first_call = false;
         }
 
diff --git a/dae-cpp/solution-manager.hpp b/dae-cpp/solution-manager.hpp
index 693405b..4b1430f 100644
--- a/dae-cpp/solution-manager.hpp
+++ b/dae-cpp/solution-manager.hpp
@@ -28,7 +28,7 @@ namespace solver_command
 enum command
 {
     continue_integration = 0,       // Default, continue integration
-    stop_intergration = 1,          // Stop integration
+    stop_integration = 1,           // Stop integration
     decrease_time_step = 2,         // Decrease time step size (by a factor of SolverOptions::dt_decrease_factor)
     decrease_time_step_and_redo = 3 // Decrease time step size (by a factor of SolverOptions::dt_decrease_factor) and redo the current time step
 };
@@ -42,7 +42,7 @@ struct SolutionManager
     /*
      * Solution Manager functor will be called every time step providing the time `t` and
      * the corresponding solution `x` for further post-processing.
-     * If the functor returns an integer == 1 (or `solver_command::stop_intergration`), the computation will immediately stop.
+     * If the functor returns an integer == 1 (or `solver_command::stop_integration`), the computation will immediately stop.
      * If the functor returns `solver_command::decrease_time_step`, the solver will decrease the next time step size.
      * The functor can return `solver_command::decrease_time_step_and_redo` to decrease the time step size and redo the current time step.
      * It does nothing by default and returns 0 (`solver_command::continue_integration`).
diff --git a/dae-cpp/solver-options.hpp b/dae-cpp/solver-options.hpp
index 5ab5582..7afd077 100644
--- a/dae-cpp/solver-options.hpp
+++ b/dae-cpp/solver-options.hpp
@@ -138,6 +138,12 @@ struct SolverOptions
     // Default value is 2.0.
     double dt_decrease_factor{2.0};
 
+    // Matrix scaling flag.
+    // If `true`, the solver will scale the linear system matrix before solving it.
+    // Matrix scaling can improve the stability of the linear solver in some cases.
+    // Default value is `false`.
+    bool linear_system_scaling{false};
+
     // Number of threads
     unsigned int num_threads{1}; // TODO: Not used yet
 
diff --git a/dae-cpp/solver.hpp b/dae-cpp/solver.hpp
index e00b437..e914452 100644
--- a/dae-cpp/solver.hpp
+++ b/dae-cpp/solver.hpp
@@ -332,9 +332,10 @@ inline exit_code::status solve(Mass mass, RHS rhs, Jacobian jac, Manager mgr, co
         eimat Jb; // Linear system matrix
 
         // Eigen::VectorX vectors
-        eivec f_(size); // The RHS vector (converted)
-        eivec b;        // The RHS of the linear system
-        eivec dx;       // Linear system solution
+        eivec f_(size);       // The RHS vector (converted)
+        eivec rowscale(size); // Row scaling factors
+        eivec b;              // The RHS of the linear system
+        eivec dx;             // Linear system solution
 
         // Counts number of time steps
         uint64_t n_steps{0};
@@ -355,7 +356,7 @@ inline exit_code::status solve(Mass mass, RHS rhs, Jacobian jac, Manager mgr, co
         try
         {
             Timer timer(&t[timer::manager]);
-            if (mgr(x0, 0.0) == solver_command::stop_intergration)
+            if (mgr(x0, 0.0) == solver_command::stop_integration)
             {
                 PRINT(opt.verbosity >= 1, "Stop event in Solution Manager triggered.");
                 error_msg = exit_code::success;
@@ -496,10 +497,19 @@ inline exit_code::status solve(Mass mass, RHS rhs, Jacobian jac, Manager mgr, co
                         try
                         {
                             Timer timer(&t[timer::jacobian]);
-                            J.clear();
-                            jac(J, xk, state.t);
-                            J.check();
-                            Jb = J.convert(static_cast<int_type>(size));
+                            if constexpr (std::is_same_v<Jacobian, JacobianAutomatic<RHS>>)
+                            {
+                                // Automatic Jacobian in Eigen::SparseMatrix format
+                                jac(Jb, xk, state.t);
+                            }
+                            else
+                            {
+                                // Jacobian matrix in daecpp::sparse_matrix format
+                                J.clear();
+                                jac(J, xk, state.t);
+                                J.check();
+                                Jb = J.convert(static_cast<int_type>(size));
+                            }
                         }
                         catch (const std::exception &e)
                         {
@@ -521,6 +531,55 @@ inline exit_code::status solve(Mass mass, RHS rhs, Jacobian jac, Manager mgr, co
                         if (is_fact_enabled)
                         {
                             Jb -= M_ * alpha;
+
+                            // Optional row scaling: multiply every row of Jb by 1 / (its largest absolute entry)
+                            if (opt.linear_system_scaling)
+                            {
+                                rowscale.setZero();
+
+                                // Pass 1: largest absolute value in each row (Eigen::Index / auto avoid narrowing 64-bit indices)
+                                for (Eigen::Index col = 0; col < Jb.outerSize(); ++col)
+                                {
+                                    for (eimat::InnerIterator it(Jb, col); it; ++it)
+                                    {
+                                        const auto i = it.row();
+                                        const auto val = std::abs(it.value());
+                                        if (val > rowscale[i])
+                                        {
+                                            rowscale[i] = val;
+                                        }
+                                    }
+                                }
+
+                                // Pass 2: turn row maxima into scaling factors (1 / max); rows without non-zeros keep factor 1
+                                for (Eigen::Index i = 0; i < rowscale.size(); ++i)
+                                {
+                                    if (rowscale[i] > 0.0)
+                                    {
+                                        rowscale[i] = 1.0 / rowscale[i];
+                                    }
+                                    else
+                                    {
+                                        rowscale[i] = 1.0; // zero row -> no scaling
+                                    }
+                                }
+
+                                // Pass 3: apply the factors in place; rowscale is kept because the RHS is scaled with it afterwards
+                                for (Eigen::Index col = 0; col < Jb.outerSize(); ++col)
+                                {
+                                    for (eimat::InnerIterator it(Jb, col); it; ++it)
+                                    {
+                                        const auto i = it.row();
+                                        it.valueRef() *= rowscale[i];
+                                    }
+                                }
+                            }
+                        }
+
+                        if (opt.linear_system_scaling)
+                        {
+                            // Apply row scaling to RHS
+                            b = b.cwiseProduct(rowscale);
                         }
                     }
                     catch (const std::exception &e)
@@ -534,7 +593,19 @@ inline exit_code::status solve(Mass mass, RHS rhs, Jacobian jac, Manager mgr, co
                     {
                         Timer timer(&t[timer::factorization]);
 
-                        linsolver.compute(Jb);
+                        // Prepare linear system matrix: drop negligible entries (scaling only), then compress storage
+                        if (opt.linear_system_scaling)
+                        {
+                            Jb.prune(DAECPP_SPARSE_MATRIX_ELEMENT_TOLERANCE);
+                        }
+                        Jb.makeCompressed();
+
+                        // factorize() expects the sparsity pattern given to analyzePattern(); prune() may change it, so re-analyze when scaling is on
+                        if (opt.linear_system_scaling || (iter == 0 && state.t <= dt))
+                        {
+                            linsolver.analyzePattern(Jb); // Without scaling: analyzed only once at the first iteration (TODO: detect pattern changes)
+                        }
+                        linsolver.factorize(Jb);
 
                         c.n_fact_calls++;
 
@@ -677,7 +748,7 @@ inline exit_code::status solve(Mass mass, RHS rhs, Jacobian jac, Manager mgr, co
                         }
                         continue;
                     }
-                    else if (command) // solver_command::stop_intergration
+                    else if (command) // solver_command::stop_integration
                     {
                         print_char(opt.verbosity >= 2, '\n');
                         PRINT(opt.verbosity >= 1, "Stop event in Solution Manager triggered.");
diff --git a/dae-cpp/sparse-matrix.hpp b/dae-cpp/sparse-matrix.hpp
index 18efed2..5ca840f 100644
--- a/dae-cpp/sparse-matrix.hpp
+++ b/dae-cpp/sparse-matrix.hpp
@@ -95,9 +95,8 @@ struct sparse_matrix
      */
     inline void check() const noexcept
     {
-        constexpr char msg[] = "Three-array sparse matrix check failed. Inconsistent array size.";
-        ASSERT(A.size() == i.size(), msg);
-        ASSERT(A.size() == j.size(), msg);
+        ASSERT(A.size() == i.size(), "Three-array sparse matrix check failed. Array A and i have inconsistent sizes.");
+        ASSERT(A.size() == j.size(), "Three-array sparse matrix check failed. Array A and j have inconsistent sizes.");
     }
 
     /*
@@ -146,7 +145,7 @@ struct sparse_matrix
             M.coeffRef(i[k], j[k]) += A[k];
         }
 
-        // M.makeCompressed(); // It is already compressed
+        // M.makeCompressed(); // Will be taken care of in the solver before factorization
 
         return M;
     }
diff --git a/dae-cpp/typedefs.hpp b/dae-cpp/typedefs.hpp
index 1960b82..1b28abc 100644
--- a/dae-cpp/typedefs.hpp
+++ b/dae-cpp/typedefs.hpp
@@ -23,17 +23,17 @@
 
 // dae-cpp version
 #define DAECPP_VERSION_MAJOR 2
-#define DAECPP_VERSION_MINOR 2
+#define DAECPP_VERSION_MINOR 3
 #define DAECPP_VERSION_PATCH 0
 
 // Internal constants
 #define DAECPP_MAX_ORDER 4
 #ifndef DAECPP_SINGLE
 #define DAECPP_SPARSE_MATRIX_ELEMENT_TOLERANCE 1e-14 // Used in automatic (algorithmic) Jacobian
-#define DAECPP_FLOAT_TOLERANCE 1e-14                 // Used in the solver for convergence check against relative tolerance
+#define DAECPP_FLOAT_TOLERANCE 1e-8                  // Used in the solver for convergence check against relative tolerance
 #else
 #define DAECPP_SPARSE_MATRIX_ELEMENT_TOLERANCE 1e-6 // Used in automatic (algorithmic) Jacobian
-#define DAECPP_FLOAT_TOLERANCE 1e-6                 // Used in the solver for convergence check against relative tolerance
+#define DAECPP_FLOAT_TOLERANCE 1e-3                 // Used in the solver for convergence check against relative tolerance
 #endif
 
 #include "assert-custom.hpp"
@@ -54,7 +54,7 @@ enum status
 };
 } // namespace exit_code
 
-// Unsigned integer type
+// Integer type
 #ifdef DAECPP_LONG
 typedef std::int64_t int_type;
 #else
@@ -77,7 +77,7 @@ typedef autodiff::real1st state_value;
 namespace core
 {
 
-// Unsigned integer vector
+// Integer vector
 typedef std::vector<int_type> ivec;
 
 // Floating point (double or single precision) vector
diff --git a/dae-cpp/version.hpp b/dae-cpp/version.hpp
index 7c46777..d867ce2 100644
--- a/dae-cpp/version.hpp
+++ b/dae-cpp/version.hpp
@@ -18,13 +18,13 @@ namespace daecpp_namespace_name
 {
 
 // dae-cpp library major version
-static constexpr uint16_t version_major{DAECPP_VERSION_MAJOR};
+inline constexpr uint16_t version_major{DAECPP_VERSION_MAJOR};
 
 // dae-cpp library minor version
-static constexpr uint16_t version_minor{DAECPP_VERSION_MINOR};
+inline constexpr uint16_t version_minor{DAECPP_VERSION_MINOR};
 
 // dae-cpp library patch version
-static constexpr uint16_t version_patch{DAECPP_VERSION_PATCH};
+inline constexpr uint16_t version_patch{DAECPP_VERSION_PATCH};
 
 } // namespace daecpp_namespace_name
 
diff --git a/tests/test_solution-manager.cpp b/tests/test_solution-manager.cpp
index 512d26c..c3ec327 100644
--- a/tests/test_solution-manager.cpp
+++ b/tests/test_solution-manager.cpp
@@ -155,7 +155,7 @@ class MySolutionManager
         if (std::abs(x[0] - 1.0) < abs_err)
         {
             m_save_solution(x, t);
-            return solver_command::stop_intergration;
+            return solver_command::stop_integration;
         }
 
         if (x[0] < 1.0)
@@ -219,7 +219,7 @@ class MySolutionManagerStop
 
         if (x[0] < 0.0)
         {
-            return -1; // solver_command::stop_intergration;
+            return -1; // solver_command::stop_integration;
         }
 
         return 0; // solver_command::continue_integration;