diff --git a/OpenCL/inc_hash_sha224.cl b/OpenCL/inc_hash_sha224.cl
index e93206e44..e3f5b4d1c 100644
--- a/OpenCL/inc_hash_sha224.cl
+++ b/OpenCL/inc_hash_sha224.cl
@@ -104,6 +104,11 @@ DECLSPEC void sha224_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
 
   ROUND_STEP_S (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -111,6 +116,7 @@ DECLSPEC void sha224_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
   {
     ROUND_EXPAND_S (); ROUND_STEP_S (i);
   }
+  #endif
 
   #undef ROUND_EXPAND_S
   #undef ROUND_STEP_S
diff --git a/OpenCL/inc_hash_sha256.cl b/OpenCL/inc_hash_sha256.cl
index de2bd5897..d6d0a69b0 100644
--- a/OpenCL/inc_hash_sha256.cl
+++ b/OpenCL/inc_hash_sha256.cl
@@ -104,6 +104,11 @@ DECLSPEC void sha256_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
 
   ROUND_STEP_S (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -111,6 +116,7 @@ DECLSPEC void sha256_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
   {
     ROUND_EXPAND_S (); ROUND_STEP_S (i);
   }
+  #endif
 
   #undef ROUND_EXPAND_S
   #undef ROUND_STEP_S
diff --git a/OpenCL/inc_hash_sha384.cl b/OpenCL/inc_hash_sha384.cl
index ea26ec734..61e2f5d3a 100644
--- a/OpenCL/inc_hash_sha384.cl
+++ b/OpenCL/inc_hash_sha384.cl
@@ -108,6 +108,12 @@ DECLSPEC void sha384_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
 
   ROUND_STEP_S (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  ROUND_EXPAND_S (); ROUND_STEP_S (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -115,6 +121,7 @@ DECLSPEC void sha384_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
   {
     ROUND_EXPAND_S (); ROUND_STEP_S (i);
   }
+  #endif
 
   #undef ROUND_EXPAND_S
   #undef ROUND_STEP_S
diff --git a/OpenCL/inc_hash_sha512.cl b/OpenCL/inc_hash_sha512.cl
index 783a66fbe..6dc91f368 100644
--- a/OpenCL/inc_hash_sha512.cl
+++ b/OpenCL/inc_hash_sha512.cl
@@ -108,6 +108,12 @@ DECLSPEC void sha512_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
 
   ROUND_STEP_S (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  ROUND_EXPAND_S (); ROUND_STEP_S (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -115,6 +121,7 @@ DECLSPEC void sha512_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
   {
     ROUND_EXPAND_S (); ROUND_STEP_S (i);
   }
+  #endif
 
   #undef ROUND_EXPAND_S
   #undef ROUND_STEP_S
diff --git a/OpenCL/m01700_a0-optimized.cl b/OpenCL/m01700_a0-optimized.cl
index bd71b1678..0d6ddb337 100644
--- a/OpenCL/m01700_a0-optimized.cl
+++ b/OpenCL/m01700_a0-optimized.cl
@@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01700_a1-optimized.cl b/OpenCL/m01700_a1-optimized.cl
index 2716dbdec..abee4dfcb 100644
--- a/OpenCL/m01700_a1-optimized.cl
+++ b/OpenCL/m01700_a1-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01700_a3-optimized.cl b/OpenCL/m01700_a3-optimized.cl
index 5e2820592..c4d8ee016 100644
--- a/OpenCL/m01700_a3-optimized.cl
+++ b/OpenCL/m01700_a3-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01710_a0-optimized.cl b/OpenCL/m01710_a0-optimized.cl
index 6f3130ac9..a5a53e831 100644
--- a/OpenCL/m01710_a0-optimized.cl
+++ b/OpenCL/m01710_a0-optimized.cl
@@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01710_a1-optimized.cl b/OpenCL/m01710_a1-optimized.cl
index 8d8a87380..931142cae 100644
--- a/OpenCL/m01710_a1-optimized.cl
+++ b/OpenCL/m01710_a1-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01710_a3-optimized.cl b/OpenCL/m01710_a3-optimized.cl
index 353d1395b..a82f949ac 100644
--- a/OpenCL/m01710_a3-optimized.cl
+++ b/OpenCL/m01710_a3-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01720_a0-optimized.cl b/OpenCL/m01720_a0-optimized.cl
index 14503946d..c331365f1 100644
--- a/OpenCL/m01720_a0-optimized.cl
+++ b/OpenCL/m01720_a0-optimized.cl
@@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01720_a1-optimized.cl b/OpenCL/m01720_a1-optimized.cl
index fbacd956b..aa93dc2c9 100644
--- a/OpenCL/m01720_a1-optimized.cl
+++ b/OpenCL/m01720_a1-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01720_a3-optimized.cl b/OpenCL/m01720_a3-optimized.cl
index 168cb7f96..891634dd4 100644
--- a/OpenCL/m01720_a3-optimized.cl
+++ b/OpenCL/m01720_a3-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01730_a0-optimized.cl b/OpenCL/m01730_a0-optimized.cl
index 2d1d4d6bc..f5da15e7f 100644
--- a/OpenCL/m01730_a0-optimized.cl
+++ b/OpenCL/m01730_a0-optimized.cl
@@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01730_a1-optimized.cl b/OpenCL/m01730_a1-optimized.cl
index 7515d2cc0..f3cd8d89a 100644
--- a/OpenCL/m01730_a1-optimized.cl
+++ b/OpenCL/m01730_a1-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01730_a3-optimized.cl b/OpenCL/m01730_a3-optimized.cl
index d2f01afd8..e00e5f4ae 100644
--- a/OpenCL/m01730_a3-optimized.cl
+++ b/OpenCL/m01730_a3-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01740_a0-optimized.cl b/OpenCL/m01740_a0-optimized.cl
index 717aba0c0..ee38662e8 100644
--- a/OpenCL/m01740_a0-optimized.cl
+++ b/OpenCL/m01740_a0-optimized.cl
@@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01740_a1-optimized.cl b/OpenCL/m01740_a1-optimized.cl
index aa765e337..0ae6984e0 100644
--- a/OpenCL/m01740_a1-optimized.cl
+++ b/OpenCL/m01740_a1-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m01740_a3-optimized.cl b/OpenCL/m01740_a3-optimized.cl
index 8b6322f85..4b7b1d3df 100644
--- a/OpenCL/m01740_a3-optimized.cl
+++ b/OpenCL/m01740_a3-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m08000_a0-optimized.cl b/OpenCL/m08000_a0-optimized.cl
index f259ea250..310bebbeb 100644
--- a/OpenCL/m08000_a0-optimized.cl
+++ b/OpenCL/m08000_a0-optimized.cl
@@ -86,6 +86,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +98,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -137,6 +143,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)
 
   ROUND_STEP_Z (0);
 
+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -144,6 +155,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
   {
     ROUND_STEP_Z (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
diff --git a/OpenCL/m08000_a1-optimized.cl b/OpenCL/m08000_a1-optimized.cl
index 599364f44..89ea42a57 100644
--- a/OpenCL/m08000_a1-optimized.cl
+++ b/OpenCL/m08000_a1-optimized.cl
@@ -84,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -135,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)
 
   ROUND_STEP_Z (0);
 
+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -142,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
   {
     ROUND_STEP_Z (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
diff --git a/OpenCL/m08000_a3-optimized.cl b/OpenCL/m08000_a3-optimized.cl
index 3068a6621..fa76a3b72 100644
--- a/OpenCL/m08000_a3-optimized.cl
+++ b/OpenCL/m08000_a3-optimized.cl
@@ -84,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
@@ -135,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)
 
   ROUND_STEP_Z (0);
 
+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -142,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
   {
     ROUND_STEP_Z (i);
   }
+  #endif
 
   digest[0] += a;
   digest[1] += b;
diff --git a/OpenCL/m10800_a0-optimized.cl b/OpenCL/m10800_a0-optimized.cl
index e9cfd2167..396b389a6 100644
--- a/OpenCL/m10800_a0-optimized.cl
+++ b/OpenCL/m10800_a0-optimized.cl
@@ -86,6 +86,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +99,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m10800_a1-optimized.cl b/OpenCL/m10800_a1-optimized.cl
index f7828aa9f..11aa95dbd 100644
--- a/OpenCL/m10800_a1-optimized.cl
+++ b/OpenCL/m10800_a1-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m10800_a3-optimized.cl b/OpenCL/m10800_a3-optimized.cl
index f19e1b224..cef22d51f 100644
--- a/OpenCL/m10800_a3-optimized.cl
+++ b/OpenCL/m10800_a3-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m15000_a0-optimized.cl b/OpenCL/m15000_a0-optimized.cl
index a4fe67e48..7b73564a3 100644
--- a/OpenCL/m15000_a0-optimized.cl
+++ b/OpenCL/m15000_a0-optimized.cl
@@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m15000_a1-optimized.cl b/OpenCL/m15000_a1-optimized.cl
index e410b3102..1bcd7a983 100644
--- a/OpenCL/m15000_a1-optimized.cl
+++ b/OpenCL/m15000_a1-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/OpenCL/m15000_a3-optimized.cl b/OpenCL/m15000_a3-optimized.cl
index 5ff6d7bfb..cf1ff1432 100644
--- a/OpenCL/m15000_a3-optimized.cl
+++ b/OpenCL/m15000_a3-optimized.cl
@@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
 
   ROUND_STEP (0);
 
+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
   {
     ROUND_EXPAND (); ROUND_STEP (i);
   }
+  #endif
 
   /* rev
   digest[0] += a;
diff --git a/src/autotune.c b/src/autotune.c
index 43b5b46bb..11dc8c1d1 100644
--- a/src/autotune.c
+++ b/src/autotune.c
@@ -47,6 +47,7 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
   return exec_msec_prev;
 }
 
+/*
 static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
 {
   hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
@@ -93,6 +94,7 @@ static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *
 
   return exec_msec_prev;
 }
+*/
 
 static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 {
@@ -261,6 +263,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     const u32 kernel_accel_orig = kernel_accel;
     const u32 kernel_loops_orig = kernel_loops;
 
+    double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+
     for (int i = 1; i < STEPS_CNT; i++)
     {
       const u32 kernel_accel_try = kernel_accel_orig * (1u << i);
@@ -272,6 +276,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       if (kernel_loops_try > kernel_loops_max) continue;
       if (kernel_loops_try < kernel_loops_min) break;
 
+      // do a real test
+
+      const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try);
+
+      if (exec_msec_prev < exec_msec) break;
+
+      exec_msec_prev = exec_msec;
+
+      // so far, so good! save
+
       kernel_accel = kernel_accel_try;
       kernel_loops = kernel_loops_try;
 
@@ -299,6 +313,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   // start finding best thread count is easier.
   // it's either the preferred or the maximum thread count
 
+  /*
   const u32 kernel_threads_min = device_param->kernel_threads_min;
   const u32 kernel_threads_max = device_param->kernel_threads_max;
 
@@ -334,6 +349,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       }
     }
   }
+  */
 
   if (device_param->is_cuda == true)
   {