/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

/*
 * Prototype kernel function that fits all kernel macros
 *
 * There are five parameters where major differences occur:
 *
 *   -  P2: Address space of the kernel_rule_t struct.
 *          If the kernel uses rules_buf, it will be stored in __constant.
 *          If it does not, cheaper __global space is used.
 *
 *   -  P4: Innerloop word buffer:
 *          Most kernels use a bf_t structure in __global address space (_BASIC).
 *          Some use a u32x pointer to a vector in __constant address space (_VECTOR).
 *          A few use a specific bs_word_t struct (_BITSLICE).
 *
 *   -  P5: Type of the tmps structure with additional data, or void.
 *          Used with slow hash types (ATTACK_EXEC_OUTSIDE_KERNEL) only.
 *
 *   -  P6: Type of the hooks structure, or void.
 *
 *   - P19: Type of the esalt_bufs structure with additional data, or void.
 */

// Expands to the complete kernel parameter list.
// p2 is pasted as the address-space qualifier of rules_buf, p4 is pasted
// verbatim as one complete parameter declaration, and p5, p6 and p19 are
// pasted as the pointee types of tmps, hooks and esalt_bufs respectively.
#define KERN_ATTR(p2,p4,p5,p6,p19)                           \
  __global       pw_t          * restrict pws,               \
  p2       const kernel_rule_t * restrict rules_buf,         \
  __global const pw_t          * restrict combs_buf,         \
  p4,                                                        \
  __global p5                  * restrict tmps,              \
  __global p6                  * restrict hooks,             \
  __global const u32           * restrict bitmaps_buf_s1_a,  \
  __global const u32           * restrict bitmaps_buf_s1_b,  \
  __global const u32           * restrict bitmaps_buf_s1_c,  \
  __global const u32           * restrict bitmaps_buf_s1_d,  \
  __global const u32           * restrict bitmaps_buf_s2_a,  \
  __global const u32           * restrict bitmaps_buf_s2_b,  \
  __global const u32           * restrict bitmaps_buf_s2_c,  \
  __global const u32           * restrict bitmaps_buf_s2_d,  \
  __global       plain_t       * restrict plains_buf,        \
  __global const digest_t      * restrict digests_buf,       \
  __global       u32           * restrict hashes_shown,      \
  __global const salt_t        * restrict salt_bufs,         \
  __global const p19           * restrict esalt_bufs,        \
  __global       u32           * restrict d_return_buf,      \
  __global       void          * restrict d_extra0_buf,      \
  __global       void          * restrict d_extra1_buf,      \
  __global       void          * restrict d_extra2_buf,      \
  __global       void          * restrict d_extra3_buf,      \
  const u32 bitmap_mask,    \
  const u32 bitmap_shift1,  \
  const u32 bitmap_shift2,  \
  const u32 salt_pos,       \
  const u32 loop_pos,       \
  const u32 loop_cnt,       \
  const u32 il_cnt,         \
  const u32 digests_cnt,    \
  const u32 digests_offset, \
  const u32 combs_mode,     \
  const u64 gid_max

/*
 * Shortcut macros for usage in the actual kernels
 *
 * Not all possible combinations are needed. E.g. all kernels that use rules
 * do not use the tmps pointer, all kernels that use a vector pointer in P4
 * do not use rules or tmps, etc.
 */

// Argument order passed to KERN_ATTR below:
//   (rules_buf address space, innerloop buffer parameter, tmps type, hooks type, esalt type)
// The innerloop buffer parameter is named bfs_buf in the bf_t variants and
// words_buf_r in the _BITSLICE and _VECTOR variants.
#define KERN_ATTR_BASIC()         KERN_ATTR (__global,   __global   const bf_t      * restrict bfs_buf,     void, void, void)
#define KERN_ATTR_BITSLICE()      KERN_ATTR (__global,   __constant const bs_word_t * restrict words_buf_r, void, void, void)
#define KERN_ATTR_ESALT(e)        KERN_ATTR (__global,   __global   const bf_t      * restrict bfs_buf,     void, void, e)
#define KERN_ATTR_RULES()         KERN_ATTR (__constant, __global   const bf_t      * restrict bfs_buf,     void, void, void)
#define KERN_ATTR_RULES_ESALT(e)  KERN_ATTR (__constant, __global   const bf_t      * restrict bfs_buf,     void, void, e)
#define KERN_ATTR_TMPS(t)         KERN_ATTR (__global,   __global   const bf_t      * restrict bfs_buf,     t,    void, void)
#define KERN_ATTR_TMPS_ESALT(t,e) KERN_ATTR (__global,   __global   const bf_t      * restrict bfs_buf,     t,    void, e)
#define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (__global,   __global   const bf_t      * restrict bfs_buf,     t,    h,    void)
#define KERN_ATTR_VECTOR()        KERN_ATTR (__global,   __constant const u32x      * restrict words_buf_r, void, void, void)
#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (__global,   __constant const u32x      * restrict words_buf_r, void, void, e)

/**
 * pure scalar functions
 */

// Find first zero: returns the index (0..31, counted from the least
// significant bit) of the lowest cleared bit in v, or -1 if all bits are set.
DECLSPEC int ffz (const u32 v)
{
  #ifdef _unroll
  #pragma unroll
  #endif
  for (int bit = 0; bit < 32; bit++)
  {
    const u32 is_set = (v >> bit) & 1;

    if (is_set == 0) return bit;
  }

  // every one of the 32 bits is set

  return -1;
}

// Three-way comparison of a computed digest (d1) against a stored one (d2).
// Words are compared starting with d1[3] / d2[DGST_R3] and ending with
// d1[0] / d2[DGST_R0]; the first pair that differs decides the result.
// Returns 1 if d1 is greater, -1 if it is smaller, 0 if all four pairs match.
DECLSPEC int hash_comp (const u32 *d1, __global const u32 *d2)
{
  if (d1[3] != d2[DGST_R3]) return (d1[3] > d2[DGST_R3]) ? 1 : -1;
  if (d1[2] != d2[DGST_R2]) return (d1[2] > d2[DGST_R2]) ? 1 : -1;
  if (d1[1] != d2[DGST_R1]) return (d1[1] > d2[DGST_R1]) ? 1 : -1;
  if (d1[0] != d2[DGST_R0]) return (d1[0] > d2[DGST_R0]) ? 1 : -1;

  return 0;
}

// Binary search for digest among the digests_cnt entries of digests_buf,
// using hash_comp() as the ordering. The result is only meaningful if
// digests_buf is ordered consistently with hash_comp().
// Returns the index of the matching entry, or -1 if there is none.
DECLSPEC int find_hash (const u32 *digest, const u32 digests_cnt, __global const digest_t *digests_buf)
{
  u32 base = 0;            // first index of the window still to be searched
  u32 span = digests_cnt;  // number of entries left in that window

  while (span != 0)
  {
    const u32 half = span / 2;

    const u32 pos = base + half;

    const int cmp = hash_comp (digest, digests_buf[pos].digest_buf);

    if (cmp == 0) return (pos);

    if (cmp > 0)
    {
      // continue in the upper half, excluding the probed entry

      base = pos + 1;

      span--;
    }

    span /= 2;
  }

  return (-1);
}

// Tests a single bit of a bitmap.
// The word is selected by (digest >> bitmap_shift) & bitmap_mask, the bit
// inside that word by the low five bits of digest.
// Returns a non-zero value if the bit is set, 0 otherwise.
DECLSPEC u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
{
  const u32 word = bitmap[(digest >> bitmap_shift) & bitmap_mask];

  // unsigned literal: a signed 1 shifted by 31 would move into the sign bit

  const u32 bit = 1u << (digest & 0x1f);

  return (word & bit);
}

// Runs the four digest words through both bitmap sets (s1 with bitmap_shift1,
// s2 with bitmap_shift2). Evaluation stops at the first bitmap that misses.
// Returns 1 only if all eight lookups hit, 0 otherwise.
DECLSPEC u32 check (const u32 *digest, __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
{
  const int miss_s1 = (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0)
                   || (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0)
                   || (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0)
                   || (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0);

  if (miss_s1) return (0);

  const int miss_s2 = (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0)
                   || (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0)
                   || (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0)
                   || (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0);

  if (miss_s2) return (0);

  return (1);
}

// Reserves the next free entry in plains_buf via the shared counter d_result
// and records where the hit was found. If the counter has already reached
// digests_cnt, nothing is stored and the counter is restored.
DECLSPEC void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u64 gid, const u32 il_pos)
{
  // the increment has to happen first: it is the only way to learn the
  // current counter value when several threads report at the same time

  const u32 slot = atomic_inc (d_result);

  if (slot < digests_cnt)
  {
    __global plain_t *entry = plains_buf + slot;

    entry->salt_pos   = salt_pos;
    entry->digest_pos = digest_pos; // relative
    entry->hash_pos   = hash_pos;   // absolute
    entry->gidvid     = gid;
    entry->il_pos     = il_pos;

    return;
  }

  // the increment above went past the end of plains_buf, so take it back

  atomic_dec (d_result);
}

// Counts how many of the 4 * elems bytes stored in buf[0..elems) equal c.
DECLSPEC int count_char (const u32 *buf, const int elems, const u32 c)
{
  int hits = 0;

  for (int i = 0; i < elems; i++)
  {
    u32 word = buf[i];

    // walk the four bytes of the word, lowest byte first

    for (int b = 0; b < 4; b++)
    {
      if ((word & 0xff) == c) hits++;

      word >>= 8;
    }
  }

  return hits;
}

// Computes the sum over all byte values of -w * log2 (w), where w is the
// relative frequency of that byte value among the 4 * elems bytes of buf.
// Byte values that do not occur contribute nothing.
DECLSPEC float get_entropy (const u32 *buf, const int elems)
{
  const int length = elems * 4;

  float sum = 0.0;

  #ifdef _unroll
  #pragma unroll
  #endif
  for (u32 c = 0; c < 256; c++)
  {
    const int occurrences = count_char (buf, elems, c);

    if (occurrences != 0)
    {
      const float w = (float) occurrences / length;

      sum += -w * log2 (w);
    }
  }

  return sum;
}

// Returns 1 if v is one of '0'..'9' or 'a'..'f' (lowercase only), else 0.
DECLSPEC int is_valid_hex_8 (const u8 v)
{
  // direct lookup table is slower thanks to CMOV

  const int is_digit = (v >= '0') && (v <= '9');
  const int is_alpha = (v >= 'a') && (v <= 'f');

  return (is_digit || is_alpha) ? 1 : 0;
}

DECLSPEC int is_valid_hex_32 (const u32 v)
{
  if (is_valid_hex_8 ((u8) (v >>  0)) == 0) return 0;
  if (is_valid_hex_8 ((u8) (v >>  8)) == 0) return 0;
  if (is_valid_hex_8 ((u8) (v >> 16)) == 0) return 0;
  if (is_valid_hex_8 ((u8) (v >> 24)) == 0) return 0;

  return 1;
}

// Returns 1 if v is a character of the Base58 alphabet, else 0.
// The alphabet is '1'..'9', 'A'..'Z' and 'a'..'z' without 'I', 'O' and 'l'
// ('0' is already excluded by the lower bound).
DECLSPEC int is_valid_base58_8 (const u8 v)
{
  // outside of the overall '1'..'z' range

  if (v > 'z') return 0;
  if (v < '1') return 0;

  // gaps between digits, uppercase and lowercase letters

  if ((v > '9') && (v < 'A')) return 0;
  if ((v > 'Z') && (v < 'a')) return 0;

  // letters that Base58 leaves out because they are easily confused

  if (v == 'I') return 0;
  if (v == 'O') return 0;
  if (v == 'l') return 0;

  return 1;
}

DECLSPEC int is_valid_base58_32 (const u32 v)
{
  if (is_valid_base58_8 ((u8) (v >>  0)) == 0) return 0;
  if (is_valid_base58_8 ((u8) (v >>  8)) == 0) return 0;
  if (is_valid_base58_8 ((u8) (v >> 16)) == 0) return 0;
  if (is_valid_base58_8 ((u8) (v >> 24)) == 0) return 0;

  return 1;
}

// Linear search for the first mapping entry whose src_len equals search_len
// and whose src_char matches search in the lowest search_len bytes.
// Returns the index of that entry, or -1 if no entry matches.
DECLSPEC int find_keyboard_layout_map (const u32 search, const int search_len, __local keyboard_layout_mapping_t *s_keyboard_layout_mapping_buf, const int keyboard_layout_mapping_cnt)
{
  for (int pos = 0; pos < keyboard_layout_mapping_cnt; pos++)
  {
    const u32 entry_char = s_keyboard_layout_mapping_buf[pos].src_char;
    const int entry_len  = s_keyboard_layout_mapping_buf[pos].src_len;

    if (entry_len != search_len) continue;

    // keep only the lowest search_len bytes of both values

    const u32 mask = 0xffffffff >> ((4 - search_len) * 8);

    const u32 diff = (entry_char ^ search) & mask;

    if (diff == 0) return pos;
  }

  return -1;
}

// Rewrites the password held in w0..w3 (4 x 4 words = 64 bytes) according to
// the keyboard layout mapping table.
//
// At every position the longest source sequence (up to 4 bytes) that has a
// mapping entry is replaced by that entry's dst_char bytes (dst_len of them,
// lowest byte first); bytes without any mapping are copied unchanged.
//
// The result is written back to w0..w3 and its length in bytes is returned.
// Input beyond 64 bytes is ignored, and output that would not fit into
// 64 bytes is cut off in front of the first mapping/byte that does not fit.
DECLSPEC int execute_keyboard_layout_mapping (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int pw_len, __local keyboard_layout_mapping_t *s_keyboard_layout_mapping_buf, const int keyboard_layout_mapping_cnt)
{
  // TC/VC passwords are limited to 64, which is also the size in bytes of
  // both the input words and out_buf

  const int buf_max = 64;

  u32 out_buf[16] = { 0 };

  u8 *out_ptr = (u8 *) out_buf;

  int out_len = 0;

  u32 w[16];

  w[ 0] = w0[0];
  w[ 1] = w0[1];
  w[ 2] = w0[2];
  w[ 3] = w0[3];
  w[ 4] = w1[0];
  w[ 5] = w1[1];
  w[ 6] = w1[2];
  w[ 7] = w1[3];
  w[ 8] = w2[0];
  w[ 9] = w2[1];
  w[10] = w2[2];
  w[11] = w2[3];
  w[12] = w3[0];
  w[13] = w3[1];
  w[14] = w3[2];
  w[15] = w3[3];

  u8 *w_ptr = (u8 *) w;

  // never read past the end of w, even if the caller passes a larger length

  const int in_len = (pw_len < buf_max) ? pw_len : buf_max;

  int pw_pos = 0;

  while (pw_pos < in_len)
  {
    const int left = in_len - pw_pos;

    const int rem = (left < 4) ? left : 4;

    u32 src0 = 0;
    u32 src1 = 0;
    u32 src2 = 0;
    u32 src3 = 0;

    if (rem > 0) src0 = w_ptr[pw_pos + 0];
    if (rem > 1) src1 = w_ptr[pw_pos + 1];
    if (rem > 2) src2 = w_ptr[pw_pos + 2];
    if (rem > 3) src3 = w_ptr[pw_pos + 3];

    const u32 src = (src0 <<  0)
                  | (src1 <<  8)
                  | (src2 << 16)
                  | (src3 << 24);

    int out_full = 0;

    int src_len;

    // try the longest candidate first, then shorter ones

    for (src_len = rem; src_len > 0; src_len--)
    {
      const int idx = find_keyboard_layout_map (src, src_len, s_keyboard_layout_mapping_buf, keyboard_layout_mapping_cnt);

      if (idx == -1) continue;

      const u32 dst_char = s_keyboard_layout_mapping_buf[idx].dst_char;
      const int dst_len  = s_keyboard_layout_mapping_buf[idx].dst_len;

      // entries with a dst_len outside of 1..4 produce no output

      if ((dst_len >= 1) && (dst_len <= 4))
      {
        if ((out_len + dst_len) > buf_max)
        {
          // a mapping may be longer than its source, so the output can
          // outgrow out_buf; stop instead of writing past its end

          out_full = 1;

          break;
        }

        for (int i = 0; i < dst_len; i++)
        {
          out_ptr[out_len++] = (dst_char >> (i * 8)) & 0xff;
        }
      }

      pw_pos += src_len;

      break;
    }

    if (out_full) break;

    // not matched, keep original

    if (src_len == 0)
    {
      if (out_len >= buf_max) break;

      out_ptr[out_len] = w_ptr[pw_pos];

      out_len++;

      pw_pos++;
    }
  }

  w0[0] = out_buf[ 0];
  w0[1] = out_buf[ 1];
  w0[2] = out_buf[ 2];
  w0[3] = out_buf[ 3];
  w1[0] = out_buf[ 4];
  w1[1] = out_buf[ 5];
  w1[2] = out_buf[ 6];
  w1[3] = out_buf[ 7];
  w2[0] = out_buf[ 8];
  w2[1] = out_buf[ 9];
  w2[2] = out_buf[10];
  w2[3] = out_buf[11];
  w3[0] = out_buf[12];
  w3[1] = out_buf[13];
  w3[2] = out_buf[14];
  w3[3] = out_buf[15];

  return out_len;
}

/**
 * vector functions
 */

// Expands the four words in[0..3] into eight words out1[0..3], out2[0..3].
// In the generic branch every source byte lands in the upper byte of a
// 16-bit half of an output word and the lower byte of that half is zero.
// The lower 16 bits of in[n] feed the even output word, the upper 16 bits
// the following odd one.
// NOTE(review): the IS_NV and IS_AMD branches are presumably the same
// permutation expressed as hc_byte_perm selectors - confirm against the
// definition of hc_byte_perm.
// NOTE(review): outputs are written from the highest index downwards,
// presumably so that in may alias out1 - confirm with the callers.
DECLSPEC void make_utf16be (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x3727);
  out2[2] = hc_byte_perm (in[3], 0, 0x1707);
  out2[1] = hc_byte_perm (in[2], 0, 0x3727);
  out2[0] = hc_byte_perm (in[2], 0, 0x1707);
  out1[3] = hc_byte_perm (in[1], 0, 0x3727);
  out1[2] = hc_byte_perm (in[1], 0, 0x1707);
  out1[1] = hc_byte_perm (in[0], 0, 0x3727);
  out1[0] = hc_byte_perm (in[0], 0, 0x1707);

  #elif defined IS_AMD && AMD_GCN >= 3

  out2[3] = hc_byte_perm (in[3], 0, 0x03070207);
  out2[2] = hc_byte_perm (in[3], 0, 0x01070007);
  out2[1] = hc_byte_perm (in[2], 0, 0x03070207);
  out2[0] = hc_byte_perm (in[2], 0, 0x01070007);
  out1[3] = hc_byte_perm (in[1], 0, 0x03070207);
  out1[2] = hc_byte_perm (in[1], 0, 0x01070007);
  out1[1] = hc_byte_perm (in[0], 0, 0x03070207);
  out1[0] = hc_byte_perm (in[0], 0, 0x01070007);

  #else

  // odd outputs: bytes 3 and 2 of the source word; even outputs: bytes 1 and 0

  out2[3] = ((in[3] >>  0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00);
  out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00);
  out2[1] = ((in[2] >>  0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00);
  out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00);
  out1[3] = ((in[1] >>  0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00);
  out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00);
  out1[1] = ((in[0] >>  0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00);
  out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00);

  #endif
}

// Same per-byte expansion as make_utf16be (source byte in the upper byte of
// each 16-bit half, lower byte zero), but the two output words produced from
// each input word are stored in the opposite order: the upper 16 bits of
// in[n] feed the even output word, the lower 16 bits the following odd one.
// NOTE(review): the IS_NV and IS_AMD branches are presumably the same
// permutation expressed as hc_byte_perm selectors - confirm against the
// definition of hc_byte_perm.
// NOTE(review): outputs are written from the highest index downwards,
// presumably so that in may alias out1 - confirm with the callers.
DECLSPEC void make_utf16beN (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x1707);
  out2[2] = hc_byte_perm (in[3], 0, 0x3727);
  out2[1] = hc_byte_perm (in[2], 0, 0x1707);
  out2[0] = hc_byte_perm (in[2], 0, 0x3727);
  out1[3] = hc_byte_perm (in[1], 0, 0x1707);
  out1[2] = hc_byte_perm (in[1], 0, 0x3727);
  out1[1] = hc_byte_perm (in[0], 0, 0x1707);
  out1[0] = hc_byte_perm (in[0], 0, 0x3727);

  #elif defined IS_AMD && AMD_GCN >= 3

  out2[3] = hc_byte_perm (in[3], 0, 0x01070007);
  out2[2] = hc_byte_perm (in[3], 0, 0x03070207);
  out2[1] = hc_byte_perm (in[2], 0, 0x01070007);
  out2[0] = hc_byte_perm (in[2], 0, 0x03070207);
  out1[3] = hc_byte_perm (in[1], 0, 0x01070007);
  out1[2] = hc_byte_perm (in[1], 0, 0x03070207);
  out1[1] = hc_byte_perm (in[0], 0, 0x01070007);
  out1[0] = hc_byte_perm (in[0], 0, 0x03070207);

  #else

  // odd outputs: bytes 1 and 0 of the source word; even outputs: bytes 3 and 2

  out2[3] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00);
  out2[2] = ((in[3] >>  0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00);
  out2[1] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00);
  out2[0] = ((in[2] >>  0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00);
  out1[3] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00);
  out1[2] = ((in[1] >>  0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00);
  out1[1] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00);
  out1[0] = ((in[0] >>  0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00);

  #endif
}

// Expands the four words in[0..3] into eight words out1[0..3], out2[0..3].
// In the generic branch every source byte lands in the lower byte of a
// 16-bit half of an output word and the upper byte of that half is zero.
// The lower 16 bits of in[n] feed the even output word, the upper 16 bits
// the following odd one.
// NOTE(review): the IS_NV and IS_AMD branches are presumably the same
// permutation expressed as hc_byte_perm selectors - confirm against the
// definition of hc_byte_perm.
// NOTE(review): outputs are written from the highest index downwards,
// presumably so that in may alias out1 - confirm with the callers.
DECLSPEC void make_utf16le (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x7372);
  out2[2] = hc_byte_perm (in[3], 0, 0x7170);
  out2[1] = hc_byte_perm (in[2], 0, 0x7372);
  out2[0] = hc_byte_perm (in[2], 0, 0x7170);
  out1[3] = hc_byte_perm (in[1], 0, 0x7372);
  out1[2] = hc_byte_perm (in[1], 0, 0x7170);
  out1[1] = hc_byte_perm (in[0], 0, 0x7372);
  out1[0] = hc_byte_perm (in[0], 0, 0x7170);

  #elif defined IS_AMD && AMD_GCN >= 3

  out2[3] = hc_byte_perm (in[3], 0, 0x07030702);
  out2[2] = hc_byte_perm (in[3], 0, 0x07010700);
  out2[1] = hc_byte_perm (in[2], 0, 0x07030702);
  out2[0] = hc_byte_perm (in[2], 0, 0x07010700);
  out1[3] = hc_byte_perm (in[1], 0, 0x07030702);
  out1[2] = hc_byte_perm (in[1], 0, 0x07010700);
  out1[1] = hc_byte_perm (in[0], 0, 0x07030702);
  out1[0] = hc_byte_perm (in[0], 0, 0x07010700);

  #else

  // odd outputs: bytes 3 and 2 of the source word; even outputs: bytes 1 and 0

  out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
  out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >>  0) & 0x000000FF);
  out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
  out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >>  0) & 0x000000FF);
  out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
  out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >>  0) & 0x000000FF);
  out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
  out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >>  0) & 0x000000FF);

  #endif
}

// Same per-byte expansion as make_utf16le (source byte in the lower byte of
// each 16-bit half, upper byte zero), but the two output words produced from
// each input word are stored in the opposite order: the upper 16 bits of
// in[n] feed the even output word, the lower 16 bits the following odd one.
// NOTE(review): the IS_NV and IS_AMD branches are presumably the same
// permutation expressed as hc_byte_perm selectors - confirm against the
// definition of hc_byte_perm.
// NOTE(review): outputs are written from the highest index downwards,
// presumably so that in may alias out1 - confirm with the callers.
DECLSPEC void make_utf16leN (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x7170);
  out2[2] = hc_byte_perm (in[3], 0, 0x7372);
  out2[1] = hc_byte_perm (in[2], 0, 0x7170);
  out2[0] = hc_byte_perm (in[2], 0, 0x7372);
  out1[3] = hc_byte_perm (in[1], 0, 0x7170);
  out1[2] = hc_byte_perm (in[1], 0, 0x7372);
  out1[1] = hc_byte_perm (in[0], 0, 0x7170);
  out1[0] = hc_byte_perm (in[0], 0, 0x7372);

  #elif defined IS_AMD && AMD_GCN >= 3

  out2[3] = hc_byte_perm (in[3], 0, 0x07010700);
  out2[2] = hc_byte_perm (in[3], 0, 0x07030702);
  out2[1] = hc_byte_perm (in[2], 0, 0x07010700);
  out2[0] = hc_byte_perm (in[2], 0, 0x07030702);
  out1[3] = hc_byte_perm (in[1], 0, 0x07010700);
  out1[2] = hc_byte_perm (in[1], 0, 0x07030702);
  out1[1] = hc_byte_perm (in[0], 0, 0x07010700);
  out1[0] = hc_byte_perm (in[0], 0, 0x07030702);

  #else

  // odd outputs: bytes 1 and 0 of the source word; even outputs: bytes 3 and 2

  out2[3] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >>  0) & 0x000000FF);
  out2[2] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
  out2[1] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >>  0) & 0x000000FF);
  out2[0] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
  out1[3] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >>  0) & 0x000000FF);
  out1[2] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
  out1[1] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >>  0) & 0x000000FF);
  out1[0] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);

  #endif
}

// Packs the eight words in1[0..3], in2[0..3] back into four words out[0..3].
// In the generic branch each output word collects bytes 1 and 3 of two
// consecutive input words (the byte positions filled by make_utf16be);
// bytes 0 and 2 of the inputs are discarded.
// NOTE(review): the IS_NV and IS_AMD branches are presumably the same
// permutation expressed as hc_byte_perm selectors - confirm against the
// definition of hc_byte_perm.
DECLSPEC void undo_utf16be (const u32x *in1, const u32x *in2, u32x *out)
{
  #if defined IS_NV

  out[0] = hc_byte_perm (in1[0], in1[1], 0x4602);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x4602);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x4602);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x4602);

  #elif defined IS_AMD && AMD_GCN >= 3

  out[0] = hc_byte_perm (in1[0], in1[1], 0x04060002);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x04060002);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x04060002);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x04060002);

  #else

  // first input word -> output bytes 0 and 1, second input word -> output bytes 2 and 3

  out[0] = ((in1[0] & 0x0000ff00) >>  8) | ((in1[0] & 0xff000000) >> 16)
         | ((in1[1] & 0x0000ff00) <<  8) | ((in1[1] & 0xff000000) <<  0);
  out[1] = ((in1[2] & 0x0000ff00) >>  8) | ((in1[2] & 0xff000000) >> 16)
         | ((in1[3] & 0x0000ff00) <<  8) | ((in1[3] & 0xff000000) <<  0);
  out[2] = ((in2[0] & 0x0000ff00) >>  8) | ((in2[0] & 0xff000000) >> 16)
         | ((in2[1] & 0x0000ff00) <<  8) | ((in2[1] & 0xff000000) <<  0);
  out[3] = ((in2[2] & 0x0000ff00) >>  8) | ((in2[2] & 0xff000000) >> 16)
         | ((in2[3] & 0x0000ff00) <<  8) | ((in2[3] & 0xff000000) <<  0);

  #endif
}

// Packs the eight words in1[0..3], in2[0..3] back into four words out[0..3].
// In the generic branch each output word collects bytes 0 and 2 of two
// consecutive input words (the byte positions filled by make_utf16le);
// bytes 1 and 3 of the inputs are discarded.
// NOTE(review): the IS_NV and IS_AMD branches are presumably the same
// permutation expressed as hc_byte_perm selectors - confirm against the
// definition of hc_byte_perm.
DECLSPEC void undo_utf16le (const u32x *in1, const u32x *in2, u32x *out)
{
  #if defined IS_NV

  out[0] = hc_byte_perm (in1[0], in1[1], 0x6420);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x6420);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x6420);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x6420);

  #elif defined IS_AMD && AMD_GCN >= 3

  out[0] = hc_byte_perm (in1[0], in1[1], 0x06040200);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x06040200);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x06040200);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x06040200);

  #else

  // first input word -> output bytes 0 and 1, second input word -> output bytes 2 and 3

  out[0] = ((in1[0] & 0x000000ff) >>  0) | ((in1[0] & 0x00ff0000) >>  8)
         | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) <<  8);
  out[1] = ((in1[2] & 0x000000ff) >>  0) | ((in1[2] & 0x00ff0000) >>  8)
         | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) <<  8);
  out[2] = ((in2[0] & 0x000000ff) >>  0) | ((in2[0] & 0x00ff0000) >>  8)
         | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) <<  8);
  out[3] = ((in2[2] & 0x000000ff) >>  0) | ((in2[2] & 0x00ff0000) >>  8)
         | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) <<  8);

  #endif
}

// Builds a four-word byte mask for a position inside a 16-byte block.
// Only the low four bits of offset are used: the word (offset & 15) / 4 gets
// 0xff in byte (offset & 3), the other three words are set to 0.
DECLSPEC void set_mark_1x4 (u32 *v, const u32 offset)
{
  const u32 c = (offset & 15) / 4;

  // unsigned literal: a signed 0xff shifted by 24 would move into the sign bit

  const u32 r = 0xffu << ((offset & 3) * 8);

  v[0] = (c == 0) ? r : 0;
  v[1] = (c == 1) ? r : 0;
  v[2] = (c == 2) ? r : 0;
  v[3] = (c == 3) ? r : 0;
}

// ORs the value v into r[0..3], restricted per word to the bits set in the
// matching mask word m[0..3].
DECLSPEC void append_helper_1x4 (u32x *r, const u32 v, const u32 *m)
{
  for (int i = 0; i < 4; i++)
  {
    r[i] |= v & m[i];
  }
}

// ORs a 0x80 byte into the 16-byte block w0 at byte position offset & 15.
DECLSPEC void append_0x80_1x4 (u32x *w0, const u32 offset)
{
  const u32 pad = 0x80808080;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  append_helper_1x4 (w0, pad, mark);
}

// ORs a 0x80 byte into the 32-byte buffer formed by w0, w1 at byte position
// offset. The block is chosen by offset / 16, the byte inside it by
// offset & 15; blocks that are not selected are left unchanged.
DECLSPEC void append_0x80_2x4 (u32x *w0, u32x *w1, const u32 offset)
{
  const u32 pad = 0x80808080;

  const u32 block = offset >> 4;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  append_helper_1x4 (w0, ((block == 0) ? pad : 0), mark);
  append_helper_1x4 (w1, ((block == 1) ? pad : 0), mark);
}

// ORs a 0x80 byte into the 48-byte buffer formed by w0..w2 at byte position
// offset. The block is chosen by offset / 16, the byte inside it by
// offset & 15; blocks that are not selected are left unchanged.
DECLSPEC void append_0x80_3x4 (u32x *w0, u32x *w1, u32x *w2, const u32 offset)
{
  const u32 pad = 0x80808080;

  const u32 block = offset >> 4;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  append_helper_1x4 (w0, ((block == 0) ? pad : 0), mark);
  append_helper_1x4 (w1, ((block == 1) ? pad : 0), mark);
  append_helper_1x4 (w2, ((block == 2) ? pad : 0), mark);
}

// ORs a 0x80 byte into the 64-byte buffer formed by w0..w3 at byte position
// offset. The block is chosen by offset / 16, the byte inside it by
// offset & 15; blocks that are not selected are left unchanged.
DECLSPEC void append_0x80_4x4 (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset)
{
  const u32 pad = 0x80808080;

  const u32 block = offset >> 4;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  append_helper_1x4 (w0, ((block == 0) ? pad : 0), mark);
  append_helper_1x4 (w1, ((block == 1) ? pad : 0), mark);
  append_helper_1x4 (w2, ((block == 2) ? pad : 0), mark);
  append_helper_1x4 (w3, ((block == 3) ? pad : 0), mark);
}

// ORs a 0x80 byte into the 128-byte buffer formed by w0..w7 at byte position
// offset. The block is chosen by offset / 16, the byte inside it by
// offset & 15; blocks that are not selected are left unchanged.
DECLSPEC void append_0x80_8x4 (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset)
{
  const u32 pad = 0x80808080;

  const u32 block = offset >> 4;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  append_helper_1x4 (w0, ((block == 0) ? pad : 0), mark);
  append_helper_1x4 (w1, ((block == 1) ? pad : 0), mark);
  append_helper_1x4 (w2, ((block == 2) ? pad : 0), mark);
  append_helper_1x4 (w3, ((block == 3) ? pad : 0), mark);
  append_helper_1x4 (w4, ((block == 4) ? pad : 0), mark);
  append_helper_1x4 (w5, ((block == 5) ? pad : 0), mark);
  append_helper_1x4 (w6, ((block == 6) ? pad : 0), mark);
  append_helper_1x4 (w7, ((block == 7) ? pad : 0), mark);
}

// ORs a 0x80 byte into the 16-word (64-byte) buffer w at byte position
// offset. w is treated as four consecutive 4-word blocks; the block is chosen
// by offset / 16, the byte inside it by offset & 15.
DECLSPEC void append_0x80_1x16 (u32x *w, const u32 offset)
{
  const u32 pad = 0x80808080;

  const u32 block = offset >> 4;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  append_helper_1x4 (&w[ 0], ((block == 0) ? pad : 0), mark);
  append_helper_1x4 (&w[ 4], ((block == 1) ? pad : 0), mark);
  append_helper_1x4 (&w[ 8], ((block == 2) ? pad : 0), mark);
  append_helper_1x4 (&w[12], ((block == 3) ? pad : 0), mark);
}

DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_byte_perm (w3[2], w3[3], selector);
      w3[2] = hc_byte_perm (w3[1], w3[2], selector);
      w3[1] = hc_byte_perm (w3[0], w3[1], selector);
      w3[0] = hc_byte_perm (w2[3], w3[0], selector);
      w2[3] = hc_byte_perm (w2[2], w2[3], selector);
      w2[2] = hc_byte_perm (w2[1], w2[2], selector);
      w2[1] = hc_byte_perm (w2[0], w2[1], selector);
      w2[0] = hc_byte_perm (w1[3], w2[0], selector);
      w1[3] = hc_byte_perm (w1[2], w1[3], selector);
      w1[2] = hc_byte_perm (w1[1], w1[2], selector);
      w1[1] = hc_byte_perm (w1[0], w1[1], selector);
      w1[0] = hc_byte_perm (w0[3], w1[0], selector);
      w0[3] = hc_byte_perm (w0[2], w0[3], selector);
      w0[2] = hc_byte_perm (w0[1], w0[2], selector);
      w0[1] = hc_byte_perm (w0[0], w0[1], selector);
      w0[0] = hc_byte_perm (    0, w0[0], selector);

      break;

    case  1:
      w3[3] = hc_byte_perm (w3[1], w3[2], selector);
      w3[2] = hc_byte_perm (w3[0], w3[1], selector);
      w3[1] = hc_byte_perm (w2[3], w3[0], selector);
      w3[0] = hc_byte_perm (w2[2], w2[3], selector);
      w2[3] = hc_byte_perm (w2[1], w2[2], selector);
      w2[2] = hc_byte_perm (w2[0], w2[1], selector);
      w2[1] = hc_byte_perm (w1[3], w2[0], selector);
      w2[0] = hc_byte_perm (w1[2], w1[3], selector);
      w1[3] = hc_byte_perm (w1[1], w1[2], selector);
      w1[2] = hc_byte_perm (w1[0], w1[1], selector);
      w1[1] = hc_byte_perm (w0[3], w1[0], selector);
      w1[0] = hc_byte_perm (w0[2], w0[3], selector);
      w0[3] = hc_byte_perm (w0[1], w0[2], selector);
      w0[2] = hc_byte_perm (w0[0], w0[1], selector);
      w0[1] = hc_byte_perm (    0, w0[0], selector);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_byte_perm (w3[0], w3[1], selector);
      w3[2] = hc_byte_perm (w2[3], w3[0], selector);
      w3[1] = hc_byte_perm (w2[2], w2[3], selector);
      w3[0] = hc_byte_perm (w2[1], w2[2], selector);
      w2[3] = hc_byte_perm (w2[0], w2[1], selector);
      w2[2] = hc_byte_perm (w1[3], w2[0], selector);
      w2[1] = hc_byte_perm (w1[2], w1[3], selector);
      w2[0] = hc_byte_perm (w1[1], w1[2], selector);
      w1[3] = hc_byte_perm (w1[0], w1[1], selector);
      w1[2] = hc_byte_perm (w0[3], w1[0], selector);
      w1[1] = hc_byte_perm (w0[2], w0[3], selector);
      w1[0] = hc_byte_perm (w0[1], w0[2], selector);
      w0[3] = hc_byte_perm (w0[0], w0[1], selector);
      w0[2] = hc_byte_perm (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_byte_perm (w2[3], w3[0], selector);
      w3[2] = hc_byte_perm (w2[2], w2[3], selector);
      w3[1] = hc_byte_perm (w2[1], w2[2], selector);
      w3[0] = hc_byte_perm (w2[0], w2[1], selector);
      w2[3] = hc_byte_perm (w1[3], w2[0], selector);
      w2[2] = hc_byte_perm (w1[2], w1[3], selector);
      w2[1] = hc_byte_perm (w1[1], w1[2], selector);
      w2[0] = hc_byte_perm (w1[0], w1[1], selector);
      w1[3] = hc_byte_perm (w0[3], w1[0], selector);
      w1[2] = hc_byte_perm (w0[2], w0[3], selector);
      w1[1] = hc_byte_perm (w0[1], w0[2], selector);
      w1[0] = hc_byte_perm (w0[0], w0[1], selector);
      w0[3] = hc_byte_perm (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_byte_perm (w2[2], w2[3], selector);
      w3[2] = hc_byte_perm (w2[1], w2[2], selector);
      w3[1] = hc_byte_perm (w2[0], w2[1], selector);
      w3[0] = hc_byte_perm (w1[3], w2[0], selector);
      w2[3] = hc_byte_perm (w1[2], w1[3], selector);
      w2[2] = hc_byte_perm (w1[1], w1[2], selector);
      w2[1] = hc_byte_perm (w1[0], w1[1], selector);
      w2[0] = hc_byte_perm (w0[3], w1[0], selector);
      w1[3] = hc_byte_perm (w0[2], w0[3], selector);
      w1[2] = hc_byte_perm (w0[1], w0[2], selector);
      w1[1] = hc_byte_perm (w0[0], w0[1], selector);
      w1[0] = hc_byte_perm (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_byte_perm (w2[1], w2[2], selector);
      w3[2] = hc_byte_perm (w2[0], w2[1], selector);
      w3[1] = hc_byte_perm (w1[3], w2[0], selector);
      w3[0] = hc_byte_perm (w1[2], w1[3], selector);
      w2[3] = hc_byte_perm (w1[1], w1[2], selector);
      w2[2] = hc_byte_perm (w1[0], w1[1], selector);
      w2[1] = hc_byte_perm (w0[3], w1[0], selector);
      w2[0] = hc_byte_perm (w0[2], w0[3], selector);
      w1[3] = hc_byte_perm (w0[1], w0[2], selector);
      w1[2] = hc_byte_perm (w0[0], w0[1], selector);
      w1[1] = hc_byte_perm (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_byte_perm (w2[0], w2[1], selector);
      w3[2] = hc_byte_perm (w1[3], w2[0], selector);
      w3[1] = hc_byte_perm (w1[2], w1[3], selector);
      w3[0] = hc_byte_perm (w1[1], w1[2], selector);
      w2[3] = hc_byte_perm (w1[0], w1[1], selector);
      w2[2] = hc_byte_perm (w0[3], w1[0], selector);
      w2[1] = hc_byte_perm (w0[2], w0[3], selector);
      w2[0] = hc_byte_perm (w0[1], w0[2], selector);
      w1[3] = hc_byte_perm (w0[0], w0[1], selector);
      w1[2] = hc_byte_perm (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_byte_perm (w1[3], w2[0], selector);
      w3[2] = hc_byte_perm (w1[2], w1[3], selector);
      w3[1] = hc_byte_perm (w1[1], w1[2], selector);
      w3[0] = hc_byte_perm (w1[0], w1[1], selector);
      w2[3] = hc_byte_perm (w0[3], w1[0], selector);
      w2[2] = hc_byte_perm (w0[2], w0[3], selector);
      w2[1] = hc_byte_perm (w0[1], w0[2], selector);
      w2[0] = hc_byte_perm (w0[0], w0[1], selector);
      w1[3] = hc_byte_perm (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_byte_perm (w1[2], w1[3], selector);
      w3[2] = hc_byte_perm (w1[1], w1[2], selector);
      w3[1] = hc_byte_perm (w1[0], w1[1], selector);
      w3[0] = hc_byte_perm (w0[3], w1[0], selector);
      w2[3] = hc_byte_perm (w0[2], w0[3], selector);
      w2[2] = hc_byte_perm (w0[1], w0[2], selector);
      w2[1] = hc_byte_perm (w0[0], w0[1], selector);
      w2[0] = hc_byte_perm (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_byte_perm (w1[1], w1[2], selector);
      w3[2] = hc_byte_perm (w1[0], w1[1], selector);
      w3[1] = hc_byte_perm (w0[3], w1[0], selector);
      w3[0] = hc_byte_perm (w0[2], w0[3], selector);
      w2[3] = hc_byte_perm (w0[1], w0[2], selector);
      w2[2] = hc_byte_perm (w0[0], w0[1], selector);
      w2[1] = hc_byte_perm (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_byte_perm (w1[0], w1[1], selector);
      w3[2] = hc_byte_perm (w0[3], w1[0], selector);
      w3[1] = hc_byte_perm (w0[2], w0[3], selector);
      w3[0] = hc_byte_perm (w0[1], w0[2], selector);
      w2[3] = hc_byte_perm (w0[0], w0[1], selector);
      w2[2] = hc_byte_perm (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_byte_perm (w0[3], w1[0], selector);
      w3[2] = hc_byte_perm (w0[2], w0[3], selector);
      w3[1] = hc_byte_perm (w0[1], w0[2], selector);
      w3[0] = hc_byte_perm (w0[0], w0[1], selector);
      w2[3] = hc_byte_perm (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_byte_perm (w0[2], w0[3], selector);
      w3[2] = hc_byte_perm (w0[1], w0[2], selector);
      w3[1] = hc_byte_perm (w0[0], w0[1], selector);
      w3[0] = hc_byte_perm (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_byte_perm (w0[1], w0[2], selector);
      w3[2] = hc_byte_perm (w0[0], w0[1], selector);
      w3[1] = hc_byte_perm (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_byte_perm (w0[0], w0[1], selector);
      w3[2] = hc_byte_perm (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_byte_perm (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *c0, u32x *c1, u32x *c2, u32x *c3, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if defined IS_AMD || defined IS_GENERIC
  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign (w3[3],     0, offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign (w3[3],     0, offset);
      c0[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign (w3[3],     0, offset);
      c0[1] = hc_bytealign (w3[2], w3[3], offset);
      c0[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign (w3[3],     0, offset);
      c0[2] = hc_bytealign (w3[2], w3[3], offset);
      c0[1] = hc_bytealign (w3[1], w3[2], offset);
      c0[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign (w3[3],     0, offset);
      c0[3] = hc_bytealign (w3[2], w3[3], offset);
      c0[2] = hc_bytealign (w3[1], w3[2], offset);
      c0[1] = hc_bytealign (w3[0], w3[1], offset);
      c0[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign (w3[3],     0, offset);
      c1[0] = hc_bytealign (w3[2], w3[3], offset);
      c0[3] = hc_bytealign (w3[1], w3[2], offset);
      c0[2] = hc_bytealign (w3[0], w3[1], offset);
      c0[1] = hc_bytealign (w2[3], w3[0], offset);
      c0[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign (w3[3],     0, offset);
      c1[1] = hc_bytealign (w3[2], w3[3], offset);
      c1[0] = hc_bytealign (w3[1], w3[2], offset);
      c0[3] = hc_bytealign (w3[0], w3[1], offset);
      c0[2] = hc_bytealign (w2[3], w3[0], offset);
      c0[1] = hc_bytealign (w2[2], w2[3], offset);
      c0[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign (w3[3],     0, offset);
      c1[2] = hc_bytealign (w3[2], w3[3], offset);
      c1[1] = hc_bytealign (w3[1], w3[2], offset);
      c1[0] = hc_bytealign (w3[0], w3[1], offset);
      c0[3] = hc_bytealign (w2[3], w3[0], offset);
      c0[2] = hc_bytealign (w2[2], w2[3], offset);
      c0[1] = hc_bytealign (w2[1], w2[2], offset);
      c0[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign (w3[3],     0, offset);
      c1[3] = hc_bytealign (w3[2], w3[3], offset);
      c1[2] = hc_bytealign (w3[1], w3[2], offset);
      c1[1] = hc_bytealign (w3[0], w3[1], offset);
      c1[0] = hc_bytealign (w2[3], w3[0], offset);
      c0[3] = hc_bytealign (w2[2], w2[3], offset);
      c0[2] = hc_bytealign (w2[1], w2[2], offset);
      c0[1] = hc_bytealign (w2[0], w2[1], offset);
      c0[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign (w3[3],     0, offset);
      c2[0] = hc_bytealign (w3[2], w3[3], offset);
      c1[3] = hc_bytealign (w3[1], w3[2], offset);
      c1[2] = hc_bytealign (w3[0], w3[1], offset);
      c1[1] = hc_bytealign (w2[3], w3[0], offset);
      c1[0] = hc_bytealign (w2[2], w2[3], offset);
      c0[3] = hc_bytealign (w2[1], w2[2], offset);
      c0[2] = hc_bytealign (w2[0], w2[1], offset);
      c0[1] = hc_bytealign (w1[3], w2[0], offset);
      c0[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign (w3[3],     0, offset);
      c2[1] = hc_bytealign (w3[2], w3[3], offset);
      c2[0] = hc_bytealign (w3[1], w3[2], offset);
      c1[3] = hc_bytealign (w3[0], w3[1], offset);
      c1[2] = hc_bytealign (w2[3], w3[0], offset);
      c1[1] = hc_bytealign (w2[2], w2[3], offset);
      c1[0] = hc_bytealign (w2[1], w2[2], offset);
      c0[3] = hc_bytealign (w2[0], w2[1], offset);
      c0[2] = hc_bytealign (w1[3], w2[0], offset);
      c0[1] = hc_bytealign (w1[2], w1[3], offset);
      c0[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign (w3[3],     0, offset);
      c2[2] = hc_bytealign (w3[2], w3[3], offset);
      c2[1] = hc_bytealign (w3[1], w3[2], offset);
      c2[0] = hc_bytealign (w3[0], w3[1], offset);
      c1[3] = hc_bytealign (w2[3], w3[0], offset);
      c1[2] = hc_bytealign (w2[2], w2[3], offset);
      c1[1] = hc_bytealign (w2[1], w2[2], offset);
      c1[0] = hc_bytealign (w2[0], w2[1], offset);
      c0[3] = hc_bytealign (w1[3], w2[0], offset);
      c0[2] = hc_bytealign (w1[2], w1[3], offset);
      c0[1] = hc_bytealign (w1[1], w1[2], offset);
      c0[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign (w3[3],     0, offset);
      c2[3] = hc_bytealign (w3[2], w3[3], offset);
      c2[2] = hc_bytealign (w3[1], w3[2], offset);
      c2[1] = hc_bytealign (w3[0], w3[1], offset);
      c2[0] = hc_bytealign (w2[3], w3[0], offset);
      c1[3] = hc_bytealign (w2[2], w2[3], offset);
      c1[2] = hc_bytealign (w2[1], w2[2], offset);
      c1[1] = hc_bytealign (w2[0], w2[1], offset);
      c1[0] = hc_bytealign (w1[3], w2[0], offset);
      c0[3] = hc_bytealign (w1[2], w1[3], offset);
      c0[2] = hc_bytealign (w1[1], w1[2], offset);
      c0[1] = hc_bytealign (w1[0], w1[1], offset);
      c0[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign (w3[3],     0, offset);
      c3[0] = hc_bytealign (w3[2], w3[3], offset);
      c2[3] = hc_bytealign (w3[1], w3[2], offset);
      c2[2] = hc_bytealign (w3[0], w3[1], offset);
      c2[1] = hc_bytealign (w2[3], w3[0], offset);
      c2[0] = hc_bytealign (w2[2], w2[3], offset);
      c1[3] = hc_bytealign (w2[1], w2[2], offset);
      c1[2] = hc_bytealign (w2[0], w2[1], offset);
      c1[1] = hc_bytealign (w1[3], w2[0], offset);
      c1[0] = hc_bytealign (w1[2], w1[3], offset);
      c0[3] = hc_bytealign (w1[1], w1[2], offset);
      c0[2] = hc_bytealign (w1[0], w1[1], offset);
      c0[1] = hc_bytealign (w0[3], w1[0], offset);
      c0[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign (w3[3],     0, offset);
      c3[1] = hc_bytealign (w3[2], w3[3], offset);
      c3[0] = hc_bytealign (w3[1], w3[2], offset);
      c2[3] = hc_bytealign (w3[0], w3[1], offset);
      c2[2] = hc_bytealign (w2[3], w3[0], offset);
      c2[1] = hc_bytealign (w2[2], w2[3], offset);
      c2[0] = hc_bytealign (w2[1], w2[2], offset);
      c1[3] = hc_bytealign (w2[0], w2[1], offset);
      c1[2] = hc_bytealign (w1[3], w2[0], offset);
      c1[1] = hc_bytealign (w1[2], w1[3], offset);
      c1[0] = hc_bytealign (w1[1], w1[2], offset);
      c0[3] = hc_bytealign (w1[0], w1[1], offset);
      c0[2] = hc_bytealign (w0[3], w1[0], offset);
      c0[1] = hc_bytealign (w0[2], w0[3], offset);
      c0[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign (w3[3],     0, offset);
      c3[2] = hc_bytealign (w3[2], w3[3], offset);
      c3[1] = hc_bytealign (w3[1], w3[2], offset);
      c3[0] = hc_bytealign (w3[0], w3[1], offset);
      c2[3] = hc_bytealign (w2[3], w3[0], offset);
      c2[2] = hc_bytealign (w2[2], w2[3], offset);
      c2[1] = hc_bytealign (w2[1], w2[2], offset);
      c2[0] = hc_bytealign (w2[0], w2[1], offset);
      c1[3] = hc_bytealign (w1[3], w2[0], offset);
      c1[2] = hc_bytealign (w1[2], w1[3], offset);
      c1[1] = hc_bytealign (w1[1], w1[2], offset);
      c1[0] = hc_bytealign (w1[0], w1[1], offset);
      c0[3] = hc_bytealign (w0[3], w1[0], offset);
      c0[2] = hc_bytealign (w0[2], w0[3], offset);
      c0[1] = hc_bytealign (w0[1], w0[2], offset);
      c0[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);
  c0[0] = swap32 (c0[0]);
  c0[1] = swap32 (c0[1]);
  c0[2] = swap32 (c0[2]);
  c0[3] = swap32 (c0[3]);
  c1[0] = swap32 (c1[0]);
  c1[1] = swap32 (c1[1]);
  c1[2] = swap32 (c1[2]);
  c1[3] = swap32 (c1[3]);
  c2[0] = swap32 (c2[0]);
  c2[1] = swap32 (c2[1]);
  c2[2] = swap32 (c2[2]);
  c2[3] = swap32 (c2[3]);
  c3[0] = swap32 (c3[0]);
  c3[1] = swap32 (c3[1]);
  c3[2] = swap32 (c3[2]);
  c3[3] = swap32 (c3[3]);
  #endif

  #ifdef IS_NV
  // todo
  switch (offset_switch)
  {
    case 0:
      c0[0] = hc_bytealign (    0, w3[3], offset_minus_4);
      w3[3] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      w3[2] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      w3[1] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      w3[0] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      w2[3] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      w2[2] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      w2[1] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w2[0] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w1[3] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w1[2] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w1[1] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w1[0] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w0[3] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w0[2] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w0[1] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w0[0] = hc_bytealign (w0[0],     0, offset_minus_4);

      if (offset_mod_4 == 0)
      {
        w0[0] = w0[1];
        w0[1] = w0[2];
        w0[2] = w0[3];
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = 0;
      }

      break;

    case 1:
      c0[1] = hc_bytealign (    0, w3[3], offset_minus_4);
      c0[0] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      w3[3] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      w3[2] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      w3[1] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      w3[0] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      w2[3] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      w2[2] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w2[1] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w2[0] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w1[3] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w1[2] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w1[1] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w1[0] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w0[3] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w0[2] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w0[1] = hc_bytealign (w0[0],     0, offset_minus_4);
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w0[1] = w0[2];
        w0[2] = w0[3];
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = 0;
      }

      break;

    case 2:
      c0[2] = hc_bytealign (    0, w3[3], offset_minus_4);
      c0[1] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c0[0] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      w3[3] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      w3[2] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      w3[1] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      w3[0] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      w2[3] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w2[2] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w2[1] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w2[0] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w1[3] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w1[2] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w1[1] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w1[0] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w0[3] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w0[2] = hc_bytealign (w0[0],     0, offset_minus_4);
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w0[2] = w0[3];
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = 0;
      }

      break;

    case 3:
      c0[3] = hc_bytealign (    0, w3[3], offset_minus_4);
      c0[2] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c0[1] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c0[0] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      w3[3] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      w3[2] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      w3[1] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      w3[0] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w2[3] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w2[2] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w2[1] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w2[0] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w1[3] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w1[2] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w1[1] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w1[0] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w0[3] = hc_bytealign (w0[0],     0, offset_minus_4);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = 0;
      }

      break;

    case 4:
      c1[0] = hc_bytealign (    0, w3[3], offset_minus_4);
      c0[3] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c0[2] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c0[1] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c0[0] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      w3[3] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      w3[2] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      w3[1] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w3[0] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w2[3] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w2[2] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w2[1] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w2[0] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w1[3] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w1[2] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w1[1] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w1[0] = hc_bytealign (w0[0],     0, offset_minus_4);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = 0;
      }

      break;

    case 5:
      c1[1] = hc_bytealign (    0, w3[3], offset_minus_4);
      c1[0] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c0[3] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c0[2] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c0[1] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c0[0] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      w3[3] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      w3[2] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w3[1] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w3[0] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w2[3] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w2[2] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w2[1] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w2[0] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w1[3] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w1[2] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w1[1] = hc_bytealign (w0[0],     0, offset_minus_4);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = 0;
      }

      break;

    case 6:
      c1[2] = hc_bytealign (    0, w3[3], offset_minus_4);
      c1[1] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c1[0] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c0[3] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c0[2] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c0[1] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c0[0] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      w3[3] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w3[2] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w3[1] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w3[0] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w2[3] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w2[2] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w2[1] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w2[0] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w1[3] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w1[2] = hc_bytealign (w0[0],     0, offset_minus_4);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = 0;
      }

      break;

    case 7:
      c1[3] = hc_bytealign (    0, w3[3], offset_minus_4);
      c1[2] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c1[1] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c1[0] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c0[3] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c0[2] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c0[1] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c0[0] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      w3[3] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w3[2] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w3[1] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w3[0] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w2[3] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w2[2] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w2[1] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w2[0] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w1[3] = hc_bytealign (w0[0],     0, offset_minus_4);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = 0;
      }

      break;

    case 8:
      c2[0] = hc_bytealign (    0, w3[3], offset_minus_4);
      c1[3] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c1[2] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c1[1] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c1[0] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c0[3] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c0[2] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c0[1] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c0[0] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      w3[3] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w3[2] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w3[1] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w3[0] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w2[3] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w2[2] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w2[1] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w2[0] = hc_bytealign (w0[0],     0, offset_minus_4);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = 0;
      }

      break;

    case 9:
      c2[1] = hc_bytealign (    0, w3[3], offset_minus_4);
      c2[0] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c1[3] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c1[2] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c1[1] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c1[0] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c0[3] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c0[2] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c0[1] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      c0[0] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      w3[3] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w3[2] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w3[1] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w3[0] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w2[3] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w2[2] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w2[1] = hc_bytealign (w0[0],     0, offset_minus_4);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = 0;
      }

      break;

    case 10:
      c2[2] = hc_bytealign (    0, w3[3], offset_minus_4);
      c2[1] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c2[0] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c1[3] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c1[2] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c1[1] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c1[0] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c0[3] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c0[2] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      c0[1] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      c0[0] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      w3[3] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w3[2] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w3[1] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w3[0] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w2[3] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w2[2] = hc_bytealign (w0[0],     0, offset_minus_4);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = 0;
      }

      break;

    case 11:
      c2[3] = hc_bytealign (    0, w3[3], offset_minus_4);
      c2[2] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c2[1] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c2[0] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c1[3] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c1[2] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c1[1] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c1[0] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c0[3] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      c0[2] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      c0[1] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      c0[0] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      w3[3] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w3[2] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w3[1] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w3[0] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w2[3] = hc_bytealign (w0[0],     0, offset_minus_4);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = 0;
      }

      break;

    case 12:
      c3[0] = hc_bytealign (    0, w3[3], offset_minus_4);
      c2[3] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c2[2] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c2[1] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c2[0] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c1[3] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c1[2] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c1[1] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c1[0] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      c0[3] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      c0[2] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      c0[1] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      c0[0] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      w3[3] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w3[2] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w3[1] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w3[0] = hc_bytealign (w0[0],     0, offset_minus_4);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = 0;
      }

      break;

    case 13:
      c3[1] = hc_bytealign (    0, w3[3], offset_minus_4);
      c3[0] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c2[3] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c2[2] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c2[1] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c2[0] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c1[3] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c1[2] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c1[1] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      c1[0] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      c0[3] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      c0[2] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      c0[1] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      c0[0] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      w3[3] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w3[2] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w3[1] = hc_bytealign (w0[0],     0, offset_minus_4);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = c3[1];
        c3[1] = 0;
      }

      break;

    case 14:
      c3[2] = hc_bytealign (    0, w3[3], offset_minus_4);
      c3[1] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c3[0] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c2[3] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c2[2] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c2[1] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c2[0] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c1[3] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c1[2] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      c1[1] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      c1[0] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      c0[3] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      c0[2] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      c0[1] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      c0[0] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      w3[3] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w3[2] = hc_bytealign (w0[0],     0, offset_minus_4);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = c3[1];
        c3[1] = c3[2];
        c3[2] = 0;
      }

      break;

    case 15:
      c3[3] = hc_bytealign (    0, w3[3], offset_minus_4);
      c3[2] = hc_bytealign (w3[3], w3[2], offset_minus_4);
      c3[1] = hc_bytealign (w3[2], w3[1], offset_minus_4);
      c3[0] = hc_bytealign (w3[1], w3[0], offset_minus_4);
      c2[3] = hc_bytealign (w3[0], w2[3], offset_minus_4);
      c2[2] = hc_bytealign (w2[3], w2[2], offset_minus_4);
      c2[1] = hc_bytealign (w2[2], w2[1], offset_minus_4);
      c2[0] = hc_bytealign (w2[1], w2[0], offset_minus_4);
      c1[3] = hc_bytealign (w2[0], w1[3], offset_minus_4);
      c1[2] = hc_bytealign (w1[3], w1[2], offset_minus_4);
      c1[1] = hc_bytealign (w1[2], w1[1], offset_minus_4);
      c1[0] = hc_bytealign (w1[1], w1[0], offset_minus_4);
      c0[3] = hc_bytealign (w1[0], w0[3], offset_minus_4);
      c0[2] = hc_bytealign (w0[3], w0[2], offset_minus_4);
      c0[1] = hc_bytealign (w0[2], w0[1], offset_minus_4);
      c0[0] = hc_bytealign (w0[1], w0[0], offset_minus_4);
      w3[3] = hc_bytealign (w0[0],     0, offset_minus_4);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = c3[1];
        c3[1] = c3[2];
        c3[2] = c3[3];
        c3[3] = 0;
      }

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *c0, u32x *c1, u32x *c2, u32x *c3, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign (w3[3],     0, offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign (w3[3],     0, offset);
      c0[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign (w3[3],     0, offset);
      c0[1] = hc_bytealign (w3[2], w3[3], offset);
      c0[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign (w3[3],     0, offset);
      c0[2] = hc_bytealign (w3[2], w3[3], offset);
      c0[1] = hc_bytealign (w3[1], w3[2], offset);
      c0[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign (w3[3],     0, offset);
      c0[3] = hc_bytealign (w3[2], w3[3], offset);
      c0[2] = hc_bytealign (w3[1], w3[2], offset);
      c0[1] = hc_bytealign (w3[0], w3[1], offset);
      c0[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign (w3[3],     0, offset);
      c1[0] = hc_bytealign (w3[2], w3[3], offset);
      c0[3] = hc_bytealign (w3[1], w3[2], offset);
      c0[2] = hc_bytealign (w3[0], w3[1], offset);
      c0[1] = hc_bytealign (w2[3], w3[0], offset);
      c0[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign (w3[3],     0, offset);
      c1[1] = hc_bytealign (w3[2], w3[3], offset);
      c1[0] = hc_bytealign (w3[1], w3[2], offset);
      c0[3] = hc_bytealign (w3[0], w3[1], offset);
      c0[2] = hc_bytealign (w2[3], w3[0], offset);
      c0[1] = hc_bytealign (w2[2], w2[3], offset);
      c0[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign (w3[3],     0, offset);
      c1[2] = hc_bytealign (w3[2], w3[3], offset);
      c1[1] = hc_bytealign (w3[1], w3[2], offset);
      c1[0] = hc_bytealign (w3[0], w3[1], offset);
      c0[3] = hc_bytealign (w2[3], w3[0], offset);
      c0[2] = hc_bytealign (w2[2], w2[3], offset);
      c0[1] = hc_bytealign (w2[1], w2[2], offset);
      c0[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign (w3[3],     0, offset);
      c1[3] = hc_bytealign (w3[2], w3[3], offset);
      c1[2] = hc_bytealign (w3[1], w3[2], offset);
      c1[1] = hc_bytealign (w3[0], w3[1], offset);
      c1[0] = hc_bytealign (w2[3], w3[0], offset);
      c0[3] = hc_bytealign (w2[2], w2[3], offset);
      c0[2] = hc_bytealign (w2[1], w2[2], offset);
      c0[1] = hc_bytealign (w2[0], w2[1], offset);
      c0[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign (w3[3],     0, offset);
      c2[0] = hc_bytealign (w3[2], w3[3], offset);
      c1[3] = hc_bytealign (w3[1], w3[2], offset);
      c1[2] = hc_bytealign (w3[0], w3[1], offset);
      c1[1] = hc_bytealign (w2[3], w3[0], offset);
      c1[0] = hc_bytealign (w2[2], w2[3], offset);
      c0[3] = hc_bytealign (w2[1], w2[2], offset);
      c0[2] = hc_bytealign (w2[0], w2[1], offset);
      c0[1] = hc_bytealign (w1[3], w2[0], offset);
      c0[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign (w3[3],     0, offset);
      c2[1] = hc_bytealign (w3[2], w3[3], offset);
      c2[0] = hc_bytealign (w3[1], w3[2], offset);
      c1[3] = hc_bytealign (w3[0], w3[1], offset);
      c1[2] = hc_bytealign (w2[3], w3[0], offset);
      c1[1] = hc_bytealign (w2[2], w2[3], offset);
      c1[0] = hc_bytealign (w2[1], w2[2], offset);
      c0[3] = hc_bytealign (w2[0], w2[1], offset);
      c0[2] = hc_bytealign (w1[3], w2[0], offset);
      c0[1] = hc_bytealign (w1[2], w1[3], offset);
      c0[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign (w3[3],     0, offset);
      c2[2] = hc_bytealign (w3[2], w3[3], offset);
      c2[1] = hc_bytealign (w3[1], w3[2], offset);
      c2[0] = hc_bytealign (w3[0], w3[1], offset);
      c1[3] = hc_bytealign (w2[3], w3[0], offset);
      c1[2] = hc_bytealign (w2[2], w2[3], offset);
      c1[1] = hc_bytealign (w2[1], w2[2], offset);
      c1[0] = hc_bytealign (w2[0], w2[1], offset);
      c0[3] = hc_bytealign (w1[3], w2[0], offset);
      c0[2] = hc_bytealign (w1[2], w1[3], offset);
      c0[1] = hc_bytealign (w1[1], w1[2], offset);
      c0[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign (w3[3],     0, offset);
      c2[3] = hc_bytealign (w3[2], w3[3], offset);
      c2[2] = hc_bytealign (w3[1], w3[2], offset);
      c2[1] = hc_bytealign (w3[0], w3[1], offset);
      c2[0] = hc_bytealign (w2[3], w3[0], offset);
      c1[3] = hc_bytealign (w2[2], w2[3], offset);
      c1[2] = hc_bytealign (w2[1], w2[2], offset);
      c1[1] = hc_bytealign (w2[0], w2[1], offset);
      c1[0] = hc_bytealign (w1[3], w2[0], offset);
      c0[3] = hc_bytealign (w1[2], w1[3], offset);
      c0[2] = hc_bytealign (w1[1], w1[2], offset);
      c0[1] = hc_bytealign (w1[0], w1[1], offset);
      c0[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign (w3[3],     0, offset);
      c3[0] = hc_bytealign (w3[2], w3[3], offset);
      c2[3] = hc_bytealign (w3[1], w3[2], offset);
      c2[2] = hc_bytealign (w3[0], w3[1], offset);
      c2[1] = hc_bytealign (w2[3], w3[0], offset);
      c2[0] = hc_bytealign (w2[2], w2[3], offset);
      c1[3] = hc_bytealign (w2[1], w2[2], offset);
      c1[2] = hc_bytealign (w2[0], w2[1], offset);
      c1[1] = hc_bytealign (w1[3], w2[0], offset);
      c1[0] = hc_bytealign (w1[2], w1[3], offset);
      c0[3] = hc_bytealign (w1[1], w1[2], offset);
      c0[2] = hc_bytealign (w1[0], w1[1], offset);
      c0[1] = hc_bytealign (w0[3], w1[0], offset);
      c0[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign (w3[3],     0, offset);
      c3[1] = hc_bytealign (w3[2], w3[3], offset);
      c3[0] = hc_bytealign (w3[1], w3[2], offset);
      c2[3] = hc_bytealign (w3[0], w3[1], offset);
      c2[2] = hc_bytealign (w2[3], w3[0], offset);
      c2[1] = hc_bytealign (w2[2], w2[3], offset);
      c2[0] = hc_bytealign (w2[1], w2[2], offset);
      c1[3] = hc_bytealign (w2[0], w2[1], offset);
      c1[2] = hc_bytealign (w1[3], w2[0], offset);
      c1[1] = hc_bytealign (w1[2], w1[3], offset);
      c1[0] = hc_bytealign (w1[1], w1[2], offset);
      c0[3] = hc_bytealign (w1[0], w1[1], offset);
      c0[2] = hc_bytealign (w0[3], w1[0], offset);
      c0[1] = hc_bytealign (w0[2], w0[3], offset);
      c0[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign (w3[3],     0, offset);
      c3[2] = hc_bytealign (w3[2], w3[3], offset);
      c3[1] = hc_bytealign (w3[1], w3[2], offset);
      c3[0] = hc_bytealign (w3[0], w3[1], offset);
      c2[3] = hc_bytealign (w2[3], w3[0], offset);
      c2[2] = hc_bytealign (w2[2], w2[3], offset);
      c2[1] = hc_bytealign (w2[1], w2[2], offset);
      c2[0] = hc_bytealign (w2[0], w2[1], offset);
      c1[3] = hc_bytealign (w1[3], w2[0], offset);
      c1[2] = hc_bytealign (w1[2], w1[3], offset);
      c1[1] = hc_bytealign (w1[1], w1[2], offset);
      c1[0] = hc_bytealign (w1[0], w1[1], offset);
      c0[3] = hc_bytealign (w0[3], w1[0], offset);
      c0[2] = hc_bytealign (w0[2], w0[3], offset);
      c0[1] = hc_bytealign (w0[1], w0[2], offset);
      c0[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_byte_perm (    0, w3[3], selector);
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      c0[1] = hc_byte_perm (    0, w3[3], selector);
      c0[0] = hc_byte_perm (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_byte_perm (    0, w3[3], selector);
      c0[1] = hc_byte_perm (w3[3], w3[2], selector);
      c0[0] = hc_byte_perm (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_byte_perm (    0, w3[3], selector);
      c0[2] = hc_byte_perm (w3[3], w3[2], selector);
      c0[1] = hc_byte_perm (w3[2], w3[1], selector);
      c0[0] = hc_byte_perm (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_byte_perm (    0, w3[3], selector);
      c0[3] = hc_byte_perm (w3[3], w3[2], selector);
      c0[2] = hc_byte_perm (w3[2], w3[1], selector);
      c0[1] = hc_byte_perm (w3[1], w3[0], selector);
      c0[0] = hc_byte_perm (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_byte_perm (    0, w3[3], selector);
      c1[0] = hc_byte_perm (w3[3], w3[2], selector);
      c0[3] = hc_byte_perm (w3[2], w3[1], selector);
      c0[2] = hc_byte_perm (w3[1], w3[0], selector);
      c0[1] = hc_byte_perm (w3[0], w2[3], selector);
      c0[0] = hc_byte_perm (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_byte_perm (    0, w3[3], selector);
      c1[1] = hc_byte_perm (w3[3], w3[2], selector);
      c1[0] = hc_byte_perm (w3[2], w3[1], selector);
      c0[3] = hc_byte_perm (w3[1], w3[0], selector);
      c0[2] = hc_byte_perm (w3[0], w2[3], selector);
      c0[1] = hc_byte_perm (w2[3], w2[2], selector);
      c0[0] = hc_byte_perm (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_byte_perm (    0, w3[3], selector);
      c1[2] = hc_byte_perm (w3[3], w3[2], selector);
      c1[1] = hc_byte_perm (w3[2], w3[1], selector);
      c1[0] = hc_byte_perm (w3[1], w3[0], selector);
      c0[3] = hc_byte_perm (w3[0], w2[3], selector);
      c0[2] = hc_byte_perm (w2[3], w2[2], selector);
      c0[1] = hc_byte_perm (w2[2], w2[1], selector);
      c0[0] = hc_byte_perm (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_byte_perm (    0, w3[3], selector);
      c1[3] = hc_byte_perm (w3[3], w3[2], selector);
      c1[2] = hc_byte_perm (w3[2], w3[1], selector);
      c1[1] = hc_byte_perm (w3[1], w3[0], selector);
      c1[0] = hc_byte_perm (w3[0], w2[3], selector);
      c0[3] = hc_byte_perm (w2[3], w2[2], selector);
      c0[2] = hc_byte_perm (w2[2], w2[1], selector);
      c0[1] = hc_byte_perm (w2[1], w2[0], selector);
      c0[0] = hc_byte_perm (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_byte_perm (    0, w3[3], selector);
      c2[0] = hc_byte_perm (w3[3], w3[2], selector);
      c1[3] = hc_byte_perm (w3[2], w3[1], selector);
      c1[2] = hc_byte_perm (w3[1], w3[0], selector);
      c1[1] = hc_byte_perm (w3[0], w2[3], selector);
      c1[0] = hc_byte_perm (w2[3], w2[2], selector);
      c0[3] = hc_byte_perm (w2[2], w2[1], selector);
      c0[2] = hc_byte_perm (w2[1], w2[0], selector);
      c0[1] = hc_byte_perm (w2[0], w1[3], selector);
      c0[0] = hc_byte_perm (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_byte_perm (    0, w3[3], selector);
      c2[1] = hc_byte_perm (w3[3], w3[2], selector);
      c2[0] = hc_byte_perm (w3[2], w3[1], selector);
      c1[3] = hc_byte_perm (w3[1], w3[0], selector);
      c1[2] = hc_byte_perm (w3[0], w2[3], selector);
      c1[1] = hc_byte_perm (w2[3], w2[2], selector);
      c1[0] = hc_byte_perm (w2[2], w2[1], selector);
      c0[3] = hc_byte_perm (w2[1], w2[0], selector);
      c0[2] = hc_byte_perm (w2[0], w1[3], selector);
      c0[1] = hc_byte_perm (w1[3], w1[2], selector);
      c0[0] = hc_byte_perm (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_byte_perm (    0, w3[3], selector);
      c2[2] = hc_byte_perm (w3[3], w3[2], selector);
      c2[1] = hc_byte_perm (w3[2], w3[1], selector);
      c2[0] = hc_byte_perm (w3[1], w3[0], selector);
      c1[3] = hc_byte_perm (w3[0], w2[3], selector);
      c1[2] = hc_byte_perm (w2[3], w2[2], selector);
      c1[1] = hc_byte_perm (w2[2], w2[1], selector);
      c1[0] = hc_byte_perm (w2[1], w2[0], selector);
      c0[3] = hc_byte_perm (w2[0], w1[3], selector);
      c0[2] = hc_byte_perm (w1[3], w1[2], selector);
      c0[1] = hc_byte_perm (w1[2], w1[1], selector);
      c0[0] = hc_byte_perm (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_byte_perm (    0, w3[3], selector);
      c2[3] = hc_byte_perm (w3[3], w3[2], selector);
      c2[2] = hc_byte_perm (w3[2], w3[1], selector);
      c2[1] = hc_byte_perm (w3[1], w3[0], selector);
      c2[0] = hc_byte_perm (w3[0], w2[3], selector);
      c1[3] = hc_byte_perm (w2[3], w2[2], selector);
      c1[2] = hc_byte_perm (w2[2], w2[1], selector);
      c1[1] = hc_byte_perm (w2[1], w2[0], selector);
      c1[0] = hc_byte_perm (w2[0], w1[3], selector);
      c0[3] = hc_byte_perm (w1[3], w1[2], selector);
      c0[2] = hc_byte_perm (w1[2], w1[1], selector);
      c0[1] = hc_byte_perm (w1[1], w1[0], selector);
      c0[0] = hc_byte_perm (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_byte_perm (    0, w3[3], selector);
      c3[0] = hc_byte_perm (w3[3], w3[2], selector);
      c2[3] = hc_byte_perm (w3[2], w3[1], selector);
      c2[2] = hc_byte_perm (w3[1], w3[0], selector);
      c2[1] = hc_byte_perm (w3[0], w2[3], selector);
      c2[0] = hc_byte_perm (w2[3], w2[2], selector);
      c1[3] = hc_byte_perm (w2[2], w2[1], selector);
      c1[2] = hc_byte_perm (w2[1], w2[0], selector);
      c1[1] = hc_byte_perm (w2[0], w1[3], selector);
      c1[0] = hc_byte_perm (w1[3], w1[2], selector);
      c0[3] = hc_byte_perm (w1[2], w1[1], selector);
      c0[2] = hc_byte_perm (w1[1], w1[0], selector);
      c0[1] = hc_byte_perm (w1[0], w0[3], selector);
      c0[0] = hc_byte_perm (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_byte_perm (    0, w3[3], selector);
      c3[1] = hc_byte_perm (w3[3], w3[2], selector);
      c3[0] = hc_byte_perm (w3[2], w3[1], selector);
      c2[3] = hc_byte_perm (w3[1], w3[0], selector);
      c2[2] = hc_byte_perm (w3[0], w2[3], selector);
      c2[1] = hc_byte_perm (w2[3], w2[2], selector);
      c2[0] = hc_byte_perm (w2[2], w2[1], selector);
      c1[3] = hc_byte_perm (w2[1], w2[0], selector);
      c1[2] = hc_byte_perm (w2[0], w1[3], selector);
      c1[1] = hc_byte_perm (w1[3], w1[2], selector);
      c1[0] = hc_byte_perm (w1[2], w1[1], selector);
      c0[3] = hc_byte_perm (w1[1], w1[0], selector);
      c0[2] = hc_byte_perm (w1[0], w0[3], selector);
      c0[1] = hc_byte_perm (w0[3], w0[2], selector);
      c0[0] = hc_byte_perm (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_byte_perm (    0, w3[3], selector);
      c3[2] = hc_byte_perm (w3[3], w3[2], selector);
      c3[1] = hc_byte_perm (w3[2], w3[1], selector);
      c3[0] = hc_byte_perm (w3[1], w3[0], selector);
      c2[3] = hc_byte_perm (w3[0], w2[3], selector);
      c2[2] = hc_byte_perm (w2[3], w2[2], selector);
      c2[1] = hc_byte_perm (w2[2], w2[1], selector);
      c2[0] = hc_byte_perm (w2[1], w2[0], selector);
      c1[3] = hc_byte_perm (w2[0], w1[3], selector);
      c1[2] = hc_byte_perm (w1[3], w1[2], selector);
      c1[1] = hc_byte_perm (w1[2], w1[1], selector);
      c1[0] = hc_byte_perm (w1[1], w1[0], selector);
      c0[3] = hc_byte_perm (w1[0], w0[3], selector);
      c0[2] = hc_byte_perm (w0[3], w0[2], selector);
      c0[1] = hc_byte_perm (w0[2], w0[1], selector);
      c0[0] = hc_byte_perm (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);
  w4[0] = swap32 (w4[0]);
  w4[1] = swap32 (w4[1]);
  w4[2] = swap32 (w4[2]);
  w4[3] = swap32 (w4[3]);
  w5[0] = swap32 (w5[0]);
  w5[1] = swap32 (w5[1]);
  w5[2] = swap32 (w5[2]);
  w5[3] = swap32 (w5[3]);
  w6[0] = swap32 (w6[0]);
  w6[1] = swap32 (w6[1]);
  w6[2] = swap32 (w6[2]);
  w6[3] = swap32 (w6[3]);
  w7[0] = swap32 (w7[0]);
  w7[1] = swap32 (w7[1]);
  w7[2] = swap32 (w7[2]);
  w7[3] = swap32 (w7[3]);

  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_bytealign (w7[2], w7[3], offset);
      w7[2] = hc_bytealign (w7[1], w7[2], offset);
      w7[1] = hc_bytealign (w7[0], w7[1], offset);
      w7[0] = hc_bytealign (w6[3], w7[0], offset);
      w6[3] = hc_bytealign (w6[2], w6[3], offset);
      w6[2] = hc_bytealign (w6[1], w6[2], offset);
      w6[1] = hc_bytealign (w6[0], w6[1], offset);
      w6[0] = hc_bytealign (w5[3], w6[0], offset);
      w5[3] = hc_bytealign (w5[2], w5[3], offset);
      w5[2] = hc_bytealign (w5[1], w5[2], offset);
      w5[1] = hc_bytealign (w5[0], w5[1], offset);
      w5[0] = hc_bytealign (w4[3], w5[0], offset);
      w4[3] = hc_bytealign (w4[2], w4[3], offset);
      w4[2] = hc_bytealign (w4[1], w4[2], offset);
      w4[1] = hc_bytealign (w4[0], w4[1], offset);
      w4[0] = hc_bytealign (w3[3], w4[0], offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      w7[3] = hc_bytealign (w7[1], w7[2], offset);
      w7[2] = hc_bytealign (w7[0], w7[1], offset);
      w7[1] = hc_bytealign (w6[3], w7[0], offset);
      w7[0] = hc_bytealign (w6[2], w6[3], offset);
      w6[3] = hc_bytealign (w6[1], w6[2], offset);
      w6[2] = hc_bytealign (w6[0], w6[1], offset);
      w6[1] = hc_bytealign (w5[3], w6[0], offset);
      w6[0] = hc_bytealign (w5[2], w5[3], offset);
      w5[3] = hc_bytealign (w5[1], w5[2], offset);
      w5[2] = hc_bytealign (w5[0], w5[1], offset);
      w5[1] = hc_bytealign (w4[3], w5[0], offset);
      w5[0] = hc_bytealign (w4[2], w4[3], offset);
      w4[3] = hc_bytealign (w4[1], w4[2], offset);
      w4[2] = hc_bytealign (w4[0], w4[1], offset);
      w4[1] = hc_bytealign (w3[3], w4[0], offset);
      w4[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_bytealign (w7[0], w7[1], offset);
      w7[2] = hc_bytealign (w6[3], w7[0], offset);
      w7[1] = hc_bytealign (w6[2], w6[3], offset);
      w7[0] = hc_bytealign (w6[1], w6[2], offset);
      w6[3] = hc_bytealign (w6[0], w6[1], offset);
      w6[2] = hc_bytealign (w5[3], w6[0], offset);
      w6[1] = hc_bytealign (w5[2], w5[3], offset);
      w6[0] = hc_bytealign (w5[1], w5[2], offset);
      w5[3] = hc_bytealign (w5[0], w5[1], offset);
      w5[2] = hc_bytealign (w4[3], w5[0], offset);
      w5[1] = hc_bytealign (w4[2], w4[3], offset);
      w5[0] = hc_bytealign (w4[1], w4[2], offset);
      w4[3] = hc_bytealign (w4[0], w4[1], offset);
      w4[2] = hc_bytealign (w3[3], w4[0], offset);
      w4[1] = hc_bytealign (w3[2], w3[3], offset);
      w4[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_bytealign (w6[3], w7[0], offset);
      w7[2] = hc_bytealign (w6[2], w6[3], offset);
      w7[1] = hc_bytealign (w6[1], w6[2], offset);
      w7[0] = hc_bytealign (w6[0], w6[1], offset);
      w6[3] = hc_bytealign (w5[3], w6[0], offset);
      w6[2] = hc_bytealign (w5[2], w5[3], offset);
      w6[1] = hc_bytealign (w5[1], w5[2], offset);
      w6[0] = hc_bytealign (w5[0], w5[1], offset);
      w5[3] = hc_bytealign (w4[3], w5[0], offset);
      w5[2] = hc_bytealign (w4[2], w4[3], offset);
      w5[1] = hc_bytealign (w4[1], w4[2], offset);
      w5[0] = hc_bytealign (w4[0], w4[1], offset);
      w4[3] = hc_bytealign (w3[3], w4[0], offset);
      w4[2] = hc_bytealign (w3[2], w3[3], offset);
      w4[1] = hc_bytealign (w3[1], w3[2], offset);
      w4[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_bytealign (w6[2], w6[3], offset);
      w7[2] = hc_bytealign (w6[1], w6[2], offset);
      w7[1] = hc_bytealign (w6[0], w6[1], offset);
      w7[0] = hc_bytealign (w5[3], w6[0], offset);
      w6[3] = hc_bytealign (w5[2], w5[3], offset);
      w6[2] = hc_bytealign (w5[1], w5[2], offset);
      w6[1] = hc_bytealign (w5[0], w5[1], offset);
      w6[0] = hc_bytealign (w4[3], w5[0], offset);
      w5[3] = hc_bytealign (w4[2], w4[3], offset);
      w5[2] = hc_bytealign (w4[1], w4[2], offset);
      w5[1] = hc_bytealign (w4[0], w4[1], offset);
      w5[0] = hc_bytealign (w3[3], w4[0], offset);
      w4[3] = hc_bytealign (w3[2], w3[3], offset);
      w4[2] = hc_bytealign (w3[1], w3[2], offset);
      w4[1] = hc_bytealign (w3[0], w3[1], offset);
      w4[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_bytealign (w6[1], w6[2], offset);
      w7[2] = hc_bytealign (w6[0], w6[1], offset);
      w7[1] = hc_bytealign (w5[3], w6[0], offset);
      w7[0] = hc_bytealign (w5[2], w5[3], offset);
      w6[3] = hc_bytealign (w5[1], w5[2], offset);
      w6[2] = hc_bytealign (w5[0], w5[1], offset);
      w6[1] = hc_bytealign (w4[3], w5[0], offset);
      w6[0] = hc_bytealign (w4[2], w4[3], offset);
      w5[3] = hc_bytealign (w4[1], w4[2], offset);
      w5[2] = hc_bytealign (w4[0], w4[1], offset);
      w5[1] = hc_bytealign (w3[3], w4[0], offset);
      w5[0] = hc_bytealign (w3[2], w3[3], offset);
      w4[3] = hc_bytealign (w3[1], w3[2], offset);
      w4[2] = hc_bytealign (w3[0], w3[1], offset);
      w4[1] = hc_bytealign (w2[3], w3[0], offset);
      w4[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_bytealign (w6[0], w6[1], offset);
      w7[2] = hc_bytealign (w5[3], w6[0], offset);
      w7[1] = hc_bytealign (w5[2], w5[3], offset);
      w7[0] = hc_bytealign (w5[1], w5[2], offset);
      w6[3] = hc_bytealign (w5[0], w5[1], offset);
      w6[2] = hc_bytealign (w4[3], w5[0], offset);
      w6[1] = hc_bytealign (w4[2], w4[3], offset);
      w6[0] = hc_bytealign (w4[1], w4[2], offset);
      w5[3] = hc_bytealign (w4[0], w4[1], offset);
      w5[2] = hc_bytealign (w3[3], w4[0], offset);
      w5[1] = hc_bytealign (w3[2], w3[3], offset);
      w5[0] = hc_bytealign (w3[1], w3[2], offset);
      w4[3] = hc_bytealign (w3[0], w3[1], offset);
      w4[2] = hc_bytealign (w2[3], w3[0], offset);
      w4[1] = hc_bytealign (w2[2], w2[3], offset);
      w4[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_bytealign (w5[3], w6[0], offset);
      w7[2] = hc_bytealign (w5[2], w5[3], offset);
      w7[1] = hc_bytealign (w5[1], w5[2], offset);
      w7[0] = hc_bytealign (w5[0], w5[1], offset);
      w6[3] = hc_bytealign (w4[3], w5[0], offset);
      w6[2] = hc_bytealign (w4[2], w4[3], offset);
      w6[1] = hc_bytealign (w4[1], w4[2], offset);
      w6[0] = hc_bytealign (w4[0], w4[1], offset);
      w5[3] = hc_bytealign (w3[3], w4[0], offset);
      w5[2] = hc_bytealign (w3[2], w3[3], offset);
      w5[1] = hc_bytealign (w3[1], w3[2], offset);
      w5[0] = hc_bytealign (w3[0], w3[1], offset);
      w4[3] = hc_bytealign (w2[3], w3[0], offset);
      w4[2] = hc_bytealign (w2[2], w2[3], offset);
      w4[1] = hc_bytealign (w2[1], w2[2], offset);
      w4[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_bytealign (w5[2], w5[3], offset);
      w7[2] = hc_bytealign (w5[1], w5[2], offset);
      w7[1] = hc_bytealign (w5[0], w5[1], offset);
      w7[0] = hc_bytealign (w4[3], w5[0], offset);
      w6[3] = hc_bytealign (w4[2], w4[3], offset);
      w6[2] = hc_bytealign (w4[1], w4[2], offset);
      w6[1] = hc_bytealign (w4[0], w4[1], offset);
      w6[0] = hc_bytealign (w3[3], w4[0], offset);
      w5[3] = hc_bytealign (w3[2], w3[3], offset);
      w5[2] = hc_bytealign (w3[1], w3[2], offset);
      w5[1] = hc_bytealign (w3[0], w3[1], offset);
      w5[0] = hc_bytealign (w2[3], w3[0], offset);
      w4[3] = hc_bytealign (w2[2], w2[3], offset);
      w4[2] = hc_bytealign (w2[1], w2[2], offset);
      w4[1] = hc_bytealign (w2[0], w2[1], offset);
      w4[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_bytealign (w5[1], w5[2], offset);
      w7[2] = hc_bytealign (w5[0], w5[1], offset);
      w7[1] = hc_bytealign (w4[3], w5[0], offset);
      w7[0] = hc_bytealign (w4[2], w4[3], offset);
      w6[3] = hc_bytealign (w4[1], w4[2], offset);
      w6[2] = hc_bytealign (w4[0], w4[1], offset);
      w6[1] = hc_bytealign (w3[3], w4[0], offset);
      w6[0] = hc_bytealign (w3[2], w3[3], offset);
      w5[3] = hc_bytealign (w3[1], w3[2], offset);
      w5[2] = hc_bytealign (w3[0], w3[1], offset);
      w5[1] = hc_bytealign (w2[3], w3[0], offset);
      w5[0] = hc_bytealign (w2[2], w2[3], offset);
      w4[3] = hc_bytealign (w2[1], w2[2], offset);
      w4[2] = hc_bytealign (w2[0], w2[1], offset);
      w4[1] = hc_bytealign (w1[3], w2[0], offset);
      w4[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_bytealign (w5[0], w5[1], offset);
      w7[2] = hc_bytealign (w4[3], w5[0], offset);
      w7[1] = hc_bytealign (w4[2], w4[3], offset);
      w7[0] = hc_bytealign (w4[1], w4[2], offset);
      w6[3] = hc_bytealign (w4[0], w4[1], offset);
      w6[2] = hc_bytealign (w3[3], w4[0], offset);
      w6[1] = hc_bytealign (w3[2], w3[3], offset);
      w6[0] = hc_bytealign (w3[1], w3[2], offset);
      w5[3] = hc_bytealign (w3[0], w3[1], offset);
      w5[2] = hc_bytealign (w2[3], w3[0], offset);
      w5[1] = hc_bytealign (w2[2], w2[3], offset);
      w5[0] = hc_bytealign (w2[1], w2[2], offset);
      w4[3] = hc_bytealign (w2[0], w2[1], offset);
      w4[2] = hc_bytealign (w1[3], w2[0], offset);
      w4[1] = hc_bytealign (w1[2], w1[3], offset);
      w4[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_bytealign (w4[3], w5[0], offset);
      w7[2] = hc_bytealign (w4[2], w4[3], offset);
      w7[1] = hc_bytealign (w4[1], w4[2], offset);
      w7[0] = hc_bytealign (w4[0], w4[1], offset);
      w6[3] = hc_bytealign (w3[3], w4[0], offset);
      w6[2] = hc_bytealign (w3[2], w3[3], offset);
      w6[1] = hc_bytealign (w3[1], w3[2], offset);
      w6[0] = hc_bytealign (w3[0], w3[1], offset);
      w5[3] = hc_bytealign (w2[3], w3[0], offset);
      w5[2] = hc_bytealign (w2[2], w2[3], offset);
      w5[1] = hc_bytealign (w2[1], w2[2], offset);
      w5[0] = hc_bytealign (w2[0], w2[1], offset);
      w4[3] = hc_bytealign (w1[3], w2[0], offset);
      w4[2] = hc_bytealign (w1[2], w1[3], offset);
      w4[1] = hc_bytealign (w1[1], w1[2], offset);
      w4[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_bytealign (w4[2], w4[3], offset);
      w7[2] = hc_bytealign (w4[1], w4[2], offset);
      w7[1] = hc_bytealign (w4[0], w4[1], offset);
      w7[0] = hc_bytealign (w3[3], w4[0], offset);
      w6[3] = hc_bytealign (w3[2], w3[3], offset);
      w6[2] = hc_bytealign (w3[1], w3[2], offset);
      w6[1] = hc_bytealign (w3[0], w3[1], offset);
      w6[0] = hc_bytealign (w2[3], w3[0], offset);
      w5[3] = hc_bytealign (w2[2], w2[3], offset);
      w5[2] = hc_bytealign (w2[1], w2[2], offset);
      w5[1] = hc_bytealign (w2[0], w2[1], offset);
      w5[0] = hc_bytealign (w1[3], w2[0], offset);
      w4[3] = hc_bytealign (w1[2], w1[3], offset);
      w4[2] = hc_bytealign (w1[1], w1[2], offset);
      w4[1] = hc_bytealign (w1[0], w1[1], offset);
      w4[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_bytealign (w4[1], w4[2], offset);
      w7[2] = hc_bytealign (w4[0], w4[1], offset);
      w7[1] = hc_bytealign (w3[3], w4[0], offset);
      w7[0] = hc_bytealign (w3[2], w3[3], offset);
      w6[3] = hc_bytealign (w3[1], w3[2], offset);
      w6[2] = hc_bytealign (w3[0], w3[1], offset);
      w6[1] = hc_bytealign (w2[3], w3[0], offset);
      w6[0] = hc_bytealign (w2[2], w2[3], offset);
      w5[3] = hc_bytealign (w2[1], w2[2], offset);
      w5[2] = hc_bytealign (w2[0], w2[1], offset);
      w5[1] = hc_bytealign (w1[3], w2[0], offset);
      w5[0] = hc_bytealign (w1[2], w1[3], offset);
      w4[3] = hc_bytealign (w1[1], w1[2], offset);
      w4[2] = hc_bytealign (w1[0], w1[1], offset);
      w4[1] = hc_bytealign (w0[3], w1[0], offset);
      w4[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_bytealign (w4[0], w4[1], offset);
      w7[2] = hc_bytealign (w3[3], w4[0], offset);
      w7[1] = hc_bytealign (w3[2], w3[3], offset);
      w7[0] = hc_bytealign (w3[1], w3[2], offset);
      w6[3] = hc_bytealign (w3[0], w3[1], offset);
      w6[2] = hc_bytealign (w2[3], w3[0], offset);
      w6[1] = hc_bytealign (w2[2], w2[3], offset);
      w6[0] = hc_bytealign (w2[1], w2[2], offset);
      w5[3] = hc_bytealign (w2[0], w2[1], offset);
      w5[2] = hc_bytealign (w1[3], w2[0], offset);
      w5[1] = hc_bytealign (w1[2], w1[3], offset);
      w5[0] = hc_bytealign (w1[1], w1[2], offset);
      w4[3] = hc_bytealign (w1[0], w1[1], offset);
      w4[2] = hc_bytealign (w0[3], w1[0], offset);
      w4[1] = hc_bytealign (w0[2], w0[3], offset);
      w4[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_bytealign (w3[3], w4[0], offset);
      w7[2] = hc_bytealign (w3[2], w3[3], offset);
      w7[1] = hc_bytealign (w3[1], w3[2], offset);
      w7[0] = hc_bytealign (w3[0], w3[1], offset);
      w6[3] = hc_bytealign (w2[3], w3[0], offset);
      w6[2] = hc_bytealign (w2[2], w2[3], offset);
      w6[1] = hc_bytealign (w2[1], w2[2], offset);
      w6[0] = hc_bytealign (w2[0], w2[1], offset);
      w5[3] = hc_bytealign (w1[3], w2[0], offset);
      w5[2] = hc_bytealign (w1[2], w1[3], offset);
      w5[1] = hc_bytealign (w1[1], w1[2], offset);
      w5[0] = hc_bytealign (w1[0], w1[1], offset);
      w4[3] = hc_bytealign (w0[3], w1[0], offset);
      w4[2] = hc_bytealign (w0[2], w0[3], offset);
      w4[1] = hc_bytealign (w0[1], w0[2], offset);
      w4[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_bytealign (w3[2], w3[3], offset);
      w7[2] = hc_bytealign (w3[1], w3[2], offset);
      w7[1] = hc_bytealign (w3[0], w3[1], offset);
      w7[0] = hc_bytealign (w2[3], w3[0], offset);
      w6[3] = hc_bytealign (w2[2], w2[3], offset);
      w6[2] = hc_bytealign (w2[1], w2[2], offset);
      w6[1] = hc_bytealign (w2[0], w2[1], offset);
      w6[0] = hc_bytealign (w1[3], w2[0], offset);
      w5[3] = hc_bytealign (w1[2], w1[3], offset);
      w5[2] = hc_bytealign (w1[1], w1[2], offset);
      w5[1] = hc_bytealign (w1[0], w1[1], offset);
      w5[0] = hc_bytealign (w0[3], w1[0], offset);
      w4[3] = hc_bytealign (w0[2], w0[3], offset);
      w4[2] = hc_bytealign (w0[1], w0[2], offset);
      w4[1] = hc_bytealign (w0[0], w0[1], offset);
      w4[0] = hc_bytealign (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_bytealign (w3[1], w3[2], offset);
      w7[2] = hc_bytealign (w3[0], w3[1], offset);
      w7[1] = hc_bytealign (w2[3], w3[0], offset);
      w7[0] = hc_bytealign (w2[2], w2[3], offset);
      w6[3] = hc_bytealign (w2[1], w2[2], offset);
      w6[2] = hc_bytealign (w2[0], w2[1], offset);
      w6[1] = hc_bytealign (w1[3], w2[0], offset);
      w6[0] = hc_bytealign (w1[2], w1[3], offset);
      w5[3] = hc_bytealign (w1[1], w1[2], offset);
      w5[2] = hc_bytealign (w1[0], w1[1], offset);
      w5[1] = hc_bytealign (w0[3], w1[0], offset);
      w5[0] = hc_bytealign (w0[2], w0[3], offset);
      w4[3] = hc_bytealign (w0[1], w0[2], offset);
      w4[2] = hc_bytealign (w0[0], w0[1], offset);
      w4[1] = hc_bytealign (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_bytealign (w3[0], w3[1], offset);
      w7[2] = hc_bytealign (w2[3], w3[0], offset);
      w7[1] = hc_bytealign (w2[2], w2[3], offset);
      w7[0] = hc_bytealign (w2[1], w2[2], offset);
      w6[3] = hc_bytealign (w2[0], w2[1], offset);
      w6[2] = hc_bytealign (w1[3], w2[0], offset);
      w6[1] = hc_bytealign (w1[2], w1[3], offset);
      w6[0] = hc_bytealign (w1[1], w1[2], offset);
      w5[3] = hc_bytealign (w1[0], w1[1], offset);
      w5[2] = hc_bytealign (w0[3], w1[0], offset);
      w5[1] = hc_bytealign (w0[2], w0[3], offset);
      w5[0] = hc_bytealign (w0[1], w0[2], offset);
      w4[3] = hc_bytealign (w0[0], w0[1], offset);
      w4[2] = hc_bytealign (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_bytealign (w2[3], w3[0], offset);
      w7[2] = hc_bytealign (w2[2], w2[3], offset);
      w7[1] = hc_bytealign (w2[1], w2[2], offset);
      w7[0] = hc_bytealign (w2[0], w2[1], offset);
      w6[3] = hc_bytealign (w1[3], w2[0], offset);
      w6[2] = hc_bytealign (w1[2], w1[3], offset);
      w6[1] = hc_bytealign (w1[1], w1[2], offset);
      w6[0] = hc_bytealign (w1[0], w1[1], offset);
      w5[3] = hc_bytealign (w0[3], w1[0], offset);
      w5[2] = hc_bytealign (w0[2], w0[3], offset);
      w5[1] = hc_bytealign (w0[1], w0[2], offset);
      w5[0] = hc_bytealign (w0[0], w0[1], offset);
      w4[3] = hc_bytealign (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_bytealign (w2[2], w2[3], offset);
      w7[2] = hc_bytealign (w2[1], w2[2], offset);
      w7[1] = hc_bytealign (w2[0], w2[1], offset);
      w7[0] = hc_bytealign (w1[3], w2[0], offset);
      w6[3] = hc_bytealign (w1[2], w1[3], offset);
      w6[2] = hc_bytealign (w1[1], w1[2], offset);
      w6[1] = hc_bytealign (w1[0], w1[1], offset);
      w6[0] = hc_bytealign (w0[3], w1[0], offset);
      w5[3] = hc_bytealign (w0[2], w0[3], offset);
      w5[2] = hc_bytealign (w0[1], w0[2], offset);
      w5[1] = hc_bytealign (w0[0], w0[1], offset);
      w5[0] = hc_bytealign (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_bytealign (w2[1], w2[2], offset);
      w7[2] = hc_bytealign (w2[0], w2[1], offset);
      w7[1] = hc_bytealign (w1[3], w2[0], offset);
      w7[0] = hc_bytealign (w1[2], w1[3], offset);
      w6[3] = hc_bytealign (w1[1], w1[2], offset);
      w6[2] = hc_bytealign (w1[0], w1[1], offset);
      w6[1] = hc_bytealign (w0[3], w1[0], offset);
      w6[0] = hc_bytealign (w0[2], w0[3], offset);
      w5[3] = hc_bytealign (w0[1], w0[2], offset);
      w5[2] = hc_bytealign (w0[0], w0[1], offset);
      w5[1] = hc_bytealign (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_bytealign (w2[0], w2[1], offset);
      w7[2] = hc_bytealign (w1[3], w2[0], offset);
      w7[1] = hc_bytealign (w1[2], w1[3], offset);
      w7[0] = hc_bytealign (w1[1], w1[2], offset);
      w6[3] = hc_bytealign (w1[0], w1[1], offset);
      w6[2] = hc_bytealign (w0[3], w1[0], offset);
      w6[1] = hc_bytealign (w0[2], w0[3], offset);
      w6[0] = hc_bytealign (w0[1], w0[2], offset);
      w5[3] = hc_bytealign (w0[0], w0[1], offset);
      w5[2] = hc_bytealign (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_bytealign (w1[3], w2[0], offset);
      w7[2] = hc_bytealign (w1[2], w1[3], offset);
      w7[1] = hc_bytealign (w1[1], w1[2], offset);
      w7[0] = hc_bytealign (w1[0], w1[1], offset);
      w6[3] = hc_bytealign (w0[3], w1[0], offset);
      w6[2] = hc_bytealign (w0[2], w0[3], offset);
      w6[1] = hc_bytealign (w0[1], w0[2], offset);
      w6[0] = hc_bytealign (w0[0], w0[1], offset);
      w5[3] = hc_bytealign (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_bytealign (w1[2], w1[3], offset);
      w7[2] = hc_bytealign (w1[1], w1[2], offset);
      w7[1] = hc_bytealign (w1[0], w1[1], offset);
      w7[0] = hc_bytealign (w0[3], w1[0], offset);
      w6[3] = hc_bytealign (w0[2], w0[3], offset);
      w6[2] = hc_bytealign (w0[1], w0[2], offset);
      w6[1] = hc_bytealign (w0[0], w0[1], offset);
      w6[0] = hc_bytealign (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_bytealign (w1[1], w1[2], offset);
      w7[2] = hc_bytealign (w1[0], w1[1], offset);
      w7[1] = hc_bytealign (w0[3], w1[0], offset);
      w7[0] = hc_bytealign (w0[2], w0[3], offset);
      w6[3] = hc_bytealign (w0[1], w0[2], offset);
      w6[2] = hc_bytealign (w0[0], w0[1], offset);
      w6[1] = hc_bytealign (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_bytealign (w1[0], w1[1], offset);
      w7[2] = hc_bytealign (w0[3], w1[0], offset);
      w7[1] = hc_bytealign (w0[2], w0[3], offset);
      w7[0] = hc_bytealign (w0[1], w0[2], offset);
      w6[3] = hc_bytealign (w0[0], w0[1], offset);
      w6[2] = hc_bytealign (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_bytealign (w0[3], w1[0], offset);
      w7[2] = hc_bytealign (w0[2], w0[3], offset);
      w7[1] = hc_bytealign (w0[1], w0[2], offset);
      w7[0] = hc_bytealign (w0[0], w0[1], offset);
      w6[3] = hc_bytealign (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_bytealign (w0[2], w0[3], offset);
      w7[2] = hc_bytealign (w0[1], w0[2], offset);
      w7[1] = hc_bytealign (w0[0], w0[1], offset);
      w7[0] = hc_bytealign (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_bytealign (w0[1], w0[2], offset);
      w7[2] = hc_bytealign (w0[0], w0[1], offset);
      w7[1] = hc_bytealign (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_bytealign (w0[0], w0[1], offset);
      w7[2] = hc_bytealign (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_bytealign (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);
  w4[0] = swap32 (w4[0]);
  w4[1] = swap32 (w4[1]);
  w4[2] = swap32 (w4[2]);
  w4[3] = swap32 (w4[3]);
  w5[0] = swap32 (w5[0]);
  w5[1] = swap32 (w5[1]);
  w5[2] = swap32 (w5[2]);
  w5[3] = swap32 (w5[3]);
  w6[0] = swap32 (w6[0]);
  w6[1] = swap32 (w6[1]);
  w6[2] = swap32 (w6[2]);
  w6[3] = swap32 (w6[3]);
  w7[0] = swap32 (w7[0]);
  w7[1] = swap32 (w7[1]);
  w7[2] = swap32 (w7[2]);
  w7[3] = swap32 (w7[3]);
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
  #endif

  switch (offset_switch)
  {
    case 0:
      w7[3] = hc_byte_perm (w7[2], w7[3], selector);
      w7[2] = hc_byte_perm (w7[1], w7[2], selector);
      w7[1] = hc_byte_perm (w7[0], w7[1], selector);
      w7[0] = hc_byte_perm (w6[3], w7[0], selector);
      w6[3] = hc_byte_perm (w6[2], w6[3], selector);
      w6[2] = hc_byte_perm (w6[1], w6[2], selector);
      w6[1] = hc_byte_perm (w6[0], w6[1], selector);
      w6[0] = hc_byte_perm (w5[3], w6[0], selector);
      w5[3] = hc_byte_perm (w5[2], w5[3], selector);
      w5[2] = hc_byte_perm (w5[1], w5[2], selector);
      w5[1] = hc_byte_perm (w5[0], w5[1], selector);
      w5[0] = hc_byte_perm (w4[3], w5[0], selector);
      w4[3] = hc_byte_perm (w4[2], w4[3], selector);
      w4[2] = hc_byte_perm (w4[1], w4[2], selector);
      w4[1] = hc_byte_perm (w4[0], w4[1], selector);
      w4[0] = hc_byte_perm (w3[3], w4[0], selector);
      w3[3] = hc_byte_perm (w3[2], w3[3], selector);
      w3[2] = hc_byte_perm (w3[1], w3[2], selector);
      w3[1] = hc_byte_perm (w3[0], w3[1], selector);
      w3[0] = hc_byte_perm (w2[3], w3[0], selector);
      w2[3] = hc_byte_perm (w2[2], w2[3], selector);
      w2[2] = hc_byte_perm (w2[1], w2[2], selector);
      w2[1] = hc_byte_perm (w2[0], w2[1], selector);
      w2[0] = hc_byte_perm (w1[3], w2[0], selector);
      w1[3] = hc_byte_perm (w1[2], w1[3], selector);
      w1[2] = hc_byte_perm (w1[1], w1[2], selector);
      w1[1] = hc_byte_perm (w1[0], w1[1], selector);
      w1[0] = hc_byte_perm (w0[3], w1[0], selector);
      w0[3] = hc_byte_perm (w0[2], w0[3], selector);
      w0[2] = hc_byte_perm (w0[1], w0[2], selector);
      w0[1] = hc_byte_perm (w0[0], w0[1], selector);
      w0[0] = hc_byte_perm (    0, w0[0], selector);
      break;

    case 1:
      w7[3] = hc_byte_perm (w7[1], w7[2], selector);
      w7[2] = hc_byte_perm (w7[0], w7[1], selector);
      w7[1] = hc_byte_perm (w6[3], w7[0], selector);
      w7[0] = hc_byte_perm (w6[2], w6[3], selector);
      w6[3] = hc_byte_perm (w6[1], w6[2], selector);
      w6[2] = hc_byte_perm (w6[0], w6[1], selector);
      w6[1] = hc_byte_perm (w5[3], w6[0], selector);
      w6[0] = hc_byte_perm (w5[2], w5[3], selector);
      w5[3] = hc_byte_perm (w5[1], w5[2], selector);
      w5[2] = hc_byte_perm (w5[0], w5[1], selector);
      w5[1] = hc_byte_perm (w4[3], w5[0], selector);
      w5[0] = hc_byte_perm (w4[2], w4[3], selector);
      w4[3] = hc_byte_perm (w4[1], w4[2], selector);
      w4[2] = hc_byte_perm (w4[0], w4[1], selector);
      w4[1] = hc_byte_perm (w3[3], w4[0], selector);
      w4[0] = hc_byte_perm (w3[2], w3[3], selector);
      w3[3] = hc_byte_perm (w3[1], w3[2], selector);
      w3[2] = hc_byte_perm (w3[0], w3[1], selector);
      w3[1] = hc_byte_perm (w2[3], w3[0], selector);
      w3[0] = hc_byte_perm (w2[2], w2[3], selector);
      w2[3] = hc_byte_perm (w2[1], w2[2], selector);
      w2[2] = hc_byte_perm (w2[0], w2[1], selector);
      w2[1] = hc_byte_perm (w1[3], w2[0], selector);
      w2[0] = hc_byte_perm (w1[2], w1[3], selector);
      w1[3] = hc_byte_perm (w1[1], w1[2], selector);
      w1[2] = hc_byte_perm (w1[0], w1[1], selector);
      w1[1] = hc_byte_perm (w0[3], w1[0], selector);
      w1[0] = hc_byte_perm (w0[2], w0[3], selector);
      w0[3] = hc_byte_perm (w0[1], w0[2], selector);
      w0[2] = hc_byte_perm (w0[0], w0[1], selector);
      w0[1] = hc_byte_perm (    0, w0[0], selector);
      w0[0] = 0;
      break;

    case 2:
      w7[3] = hc_byte_perm (w7[0], w7[1], selector);
      w7[2] = hc_byte_perm (w6[3], w7[0], selector);
      w7[1] = hc_byte_perm (w6[2], w6[3], selector);
      w7[0] = hc_byte_perm (w6[1], w6[2], selector);
      w6[3] = hc_byte_perm (w6[0], w6[1], selector);
      w6[2] = hc_byte_perm (w5[3], w6[0], selector);
      w6[1] = hc_byte_perm (w5[2], w5[3], selector);
      w6[0] = hc_byte_perm (w5[1], w5[2], selector);
      w5[3] = hc_byte_perm (w5[0], w5[1], selector);
      w5[2] = hc_byte_perm (w4[3], w5[0], selector);
      w5[1] = hc_byte_perm (w4[2], w4[3], selector);
      w5[0] = hc_byte_perm (w4[1], w4[2], selector);
      w4[3] = hc_byte_perm (w4[0], w4[1], selector);
      w4[2] = hc_byte_perm (w3[3], w4[0], selector);
      w4[1] = hc_byte_perm (w3[2], w3[3], selector);
      w4[0] = hc_byte_perm (w3[1], w3[2], selector);
      w3[3] = hc_byte_perm (w3[0], w3[1], selector);
      w3[2] = hc_byte_perm (w2[3], w3[0], selector);
      w3[1] = hc_byte_perm (w2[2], w2[3], selector);
      w3[0] = hc_byte_perm (w2[1], w2[2], selector);
      w2[3] = hc_byte_perm (w2[0], w2[1], selector);
      w2[2] = hc_byte_perm (w1[3], w2[0], selector);
      w2[1] = hc_byte_perm (w1[2], w1[3], selector);
      w2[0] = hc_byte_perm (w1[1], w1[2], selector);
      w1[3] = hc_byte_perm (w1[0], w1[1], selector);
      w1[2] = hc_byte_perm (w0[3], w1[0], selector);
      w1[1] = hc_byte_perm (w0[2], w0[3], selector);
      w1[0] = hc_byte_perm (w0[1], w0[2], selector);
      w0[3] = hc_byte_perm (w0[0], w0[1], selector);
      w0[2] = hc_byte_perm (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 3:
      w7[3] = hc_byte_perm (w6[3], w7[0], selector);
      w7[2] = hc_byte_perm (w6[2], w6[3], selector);
      w7[1] = hc_byte_perm (w6[1], w6[2], selector);
      w7[0] = hc_byte_perm (w6[0], w6[1], selector);
      w6[3] = hc_byte_perm (w5[3], w6[0], selector);
      w6[2] = hc_byte_perm (w5[2], w5[3], selector);
      w6[1] = hc_byte_perm (w5[1], w5[2], selector);
      w6[0] = hc_byte_perm (w5[0], w5[1], selector);
      w5[3] = hc_byte_perm (w4[3], w5[0], selector);
      w5[2] = hc_byte_perm (w4[2], w4[3], selector);
      w5[1] = hc_byte_perm (w4[1], w4[2], selector);
      w5[0] = hc_byte_perm (w4[0], w4[1], selector);
      w4[3] = hc_byte_perm (w3[3], w4[0], selector);
      w4[2] = hc_byte_perm (w3[2], w3[3], selector);
      w4[1] = hc_byte_perm (w3[1], w3[2], selector);
      w4[0] = hc_byte_perm (w3[0], w3[1], selector);
      w3[3] = hc_byte_perm (w2[3], w3[0], selector);
      w3[2] = hc_byte_perm (w2[2], w2[3], selector);
      w3[1] = hc_byte_perm (w2[1], w2[2], selector);
      w3[0] = hc_byte_perm (w2[0], w2[1], selector);
      w2[3] = hc_byte_perm (w1[3], w2[0], selector);
      w2[2] = hc_byte_perm (w1[2], w1[3], selector);
      w2[1] = hc_byte_perm (w1[1], w1[2], selector);
      w2[0] = hc_byte_perm (w1[0], w1[1], selector);
      w1[3] = hc_byte_perm (w0[3], w1[0], selector);
      w1[2] = hc_byte_perm (w0[2], w0[3], selector);
      w1[1] = hc_byte_perm (w0[1], w0[2], selector);
      w1[0] = hc_byte_perm (w0[0], w0[1], selector);
      w0[3] = hc_byte_perm (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 4:
      w7[3] = hc_byte_perm (w6[2], w6[3], selector);
      w7[2] = hc_byte_perm (w6[1], w6[2], selector);
      w7[1] = hc_byte_perm (w6[0], w6[1], selector);
      w7[0] = hc_byte_perm (w5[3], w6[0], selector);
      w6[3] = hc_byte_perm (w5[2], w5[3], selector);
      w6[2] = hc_byte_perm (w5[1], w5[2], selector);
      w6[1] = hc_byte_perm (w5[0], w5[1], selector);
      w6[0] = hc_byte_perm (w4[3], w5[0], selector);
      w5[3] = hc_byte_perm (w4[2], w4[3], selector);
      w5[2] = hc_byte_perm (w4[1], w4[2], selector);
      w5[1] = hc_byte_perm (w4[0], w4[1], selector);
      w5[0] = hc_byte_perm (w3[3], w4[0], selector);
      w4[3] = hc_byte_perm (w3[2], w3[3], selector);
      w4[2] = hc_byte_perm (w3[1], w3[2], selector);
      w4[1] = hc_byte_perm (w3[0], w3[1], selector);
      w4[0] = hc_byte_perm (w2[3], w3[0], selector);
      w3[3] = hc_byte_perm (w2[2], w2[3], selector);
      w3[2] = hc_byte_perm (w2[1], w2[2], selector);
      w3[1] = hc_byte_perm (w2[0], w2[1], selector);
      w3[0] = hc_byte_perm (w1[3], w2[0], selector);
      w2[3] = hc_byte_perm (w1[2], w1[3], selector);
      w2[2] = hc_byte_perm (w1[1], w1[2], selector);
      w2[1] = hc_byte_perm (w1[0], w1[1], selector);
      w2[0] = hc_byte_perm (w0[3], w1[0], selector);
      w1[3] = hc_byte_perm (w0[2], w0[3], selector);
      w1[2] = hc_byte_perm (w0[1], w0[2], selector);
      w1[1] = hc_byte_perm (w0[0], w0[1], selector);
      w1[0] = hc_byte_perm (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 5:
      w7[3] = hc_byte_perm (w6[1], w6[2], selector);
      w7[2] = hc_byte_perm (w6[0], w6[1], selector);
      w7[1] = hc_byte_perm (w5[3], w6[0], selector);
      w7[0] = hc_byte_perm (w5[2], w5[3], selector);
      w6[3] = hc_byte_perm (w5[1], w5[2], selector);
      w6[2] = hc_byte_perm (w5[0], w5[1], selector);
      w6[1] = hc_byte_perm (w4[3], w5[0], selector);
      w6[0] = hc_byte_perm (w4[2], w4[3], selector);
      w5[3] = hc_byte_perm (w4[1], w4[2], selector);
      w5[2] = hc_byte_perm (w4[0], w4[1], selector);
      w5[1] = hc_byte_perm (w3[3], w4[0], selector);
      w5[0] = hc_byte_perm (w3[2], w3[3], selector);
      w4[3] = hc_byte_perm (w3[1], w3[2], selector);
      w4[2] = hc_byte_perm (w3[0], w3[1], selector);
      w4[1] = hc_byte_perm (w2[3], w3[0], selector);
      w4[0] = hc_byte_perm (w2[2], w2[3], selector);
      w3[3] = hc_byte_perm (w2[1], w2[2], selector);
      w3[2] = hc_byte_perm (w2[0], w2[1], selector);
      w3[1] = hc_byte_perm (w1[3], w2[0], selector);
      w3[0] = hc_byte_perm (w1[2], w1[3], selector);
      w2[3] = hc_byte_perm (w1[1], w1[2], selector);
      w2[2] = hc_byte_perm (w1[0], w1[1], selector);
      w2[1] = hc_byte_perm (w0[3], w1[0], selector);
      w2[0] = hc_byte_perm (w0[2], w0[3], selector);
      w1[3] = hc_byte_perm (w0[1], w0[2], selector);
      w1[2] = hc_byte_perm (w0[0], w0[1], selector);
      w1[1] = hc_byte_perm (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 6:
      w7[3] = hc_byte_perm (w6[0], w6[1], selector);
      w7[2] = hc_byte_perm (w5[3], w6[0], selector);
      w7[1] = hc_byte_perm (w5[2], w5[3], selector);
      w7[0] = hc_byte_perm (w5[1], w5[2], selector);
      w6[3] = hc_byte_perm (w5[0], w5[1], selector);
      w6[2] = hc_byte_perm (w4[3], w5[0], selector);
      w6[1] = hc_byte_perm (w4[2], w4[3], selector);
      w6[0] = hc_byte_perm (w4[1], w4[2], selector);
      w5[3] = hc_byte_perm (w4[0], w4[1], selector);
      w5[2] = hc_byte_perm (w3[3], w4[0], selector);
      w5[1] = hc_byte_perm (w3[2], w3[3], selector);
      w5[0] = hc_byte_perm (w3[1], w3[2], selector);
      w4[3] = hc_byte_perm (w3[0], w3[1], selector);
      w4[2] = hc_byte_perm (w2[3], w3[0], selector);
      w4[1] = hc_byte_perm (w2[2], w2[3], selector);
      w4[0] = hc_byte_perm (w2[1], w2[2], selector);
      w3[3] = hc_byte_perm (w2[0], w2[1], selector);
      w3[2] = hc_byte_perm (w1[3], w2[0], selector);
      w3[1] = hc_byte_perm (w1[2], w1[3], selector);
      w3[0] = hc_byte_perm (w1[1], w1[2], selector);
      w2[3] = hc_byte_perm (w1[0], w1[1], selector);
      w2[2] = hc_byte_perm (w0[3], w1[0], selector);
      w2[1] = hc_byte_perm (w0[2], w0[3], selector);
      w2[0] = hc_byte_perm (w0[1], w0[2], selector);
      w1[3] = hc_byte_perm (w0[0], w0[1], selector);
      w1[2] = hc_byte_perm (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 7:
      w7[3] = hc_byte_perm (w5[3], w6[0], selector);
      w7[2] = hc_byte_perm (w5[2], w5[3], selector);
      w7[1] = hc_byte_perm (w5[1], w5[2], selector);
      w7[0] = hc_byte_perm (w5[0], w5[1], selector);
      w6[3] = hc_byte_perm (w4[3], w5[0], selector);
      w6[2] = hc_byte_perm (w4[2], w4[3], selector);
      w6[1] = hc_byte_perm (w4[1], w4[2], selector);
      w6[0] = hc_byte_perm (w4[0], w4[1], selector);
      w5[3] = hc_byte_perm (w3[3], w4[0], selector);
      w5[2] = hc_byte_perm (w3[2], w3[3], selector);
      w5[1] = hc_byte_perm (w3[1], w3[2], selector);
      w5[0] = hc_byte_perm (w3[0], w3[1], selector);
      w4[3] = hc_byte_perm (w2[3], w3[0], selector);
      w4[2] = hc_byte_perm (w2[2], w2[3], selector);
      w4[1] = hc_byte_perm (w2[1], w2[2], selector);
      w4[0] = hc_byte_perm (w2[0], w2[1], selector);
      w3[3] = hc_byte_perm (w1[3], w2[0], selector);
      w3[2] = hc_byte_perm (w1[2], w1[3], selector);
      w3[1] = hc_byte_perm (w1[1], w1[2], selector);
      w3[0] = hc_byte_perm (w1[0], w1[1], selector);
      w2[3] = hc_byte_perm (w0[3], w1[0], selector);
      w2[2] = hc_byte_perm (w0[2], w0[3], selector);
      w2[1] = hc_byte_perm (w0[1], w0[2], selector);
      w2[0] = hc_byte_perm (w0[0], w0[1], selector);
      w1[3] = hc_byte_perm (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 8:
      w7[3] = hc_byte_perm (w5[2], w5[3], selector);
      w7[2] = hc_byte_perm (w5[1], w5[2], selector);
      w7[1] = hc_byte_perm (w5[0], w5[1], selector);
      w7[0] = hc_byte_perm (w4[3], w5[0], selector);
      w6[3] = hc_byte_perm (w4[2], w4[3], selector);
      w6[2] = hc_byte_perm (w4[1], w4[2], selector);
      w6[1] = hc_byte_perm (w4[0], w4[1], selector);
      w6[0] = hc_byte_perm (w3[3], w4[0], selector);
      w5[3] = hc_byte_perm (w3[2], w3[3], selector);
      w5[2] = hc_byte_perm (w3[1], w3[2], selector);
      w5[1] = hc_byte_perm (w3[0], w3[1], selector);
      w5[0] = hc_byte_perm (w2[3], w3[0], selector);
      w4[3] = hc_byte_perm (w2[2], w2[3], selector);
      w4[2] = hc_byte_perm (w2[1], w2[2], selector);
      w4[1] = hc_byte_perm (w2[0], w2[1], selector);
      w4[0] = hc_byte_perm (w1[3], w2[0], selector);
      w3[3] = hc_byte_perm (w1[2], w1[3], selector);
      w3[2] = hc_byte_perm (w1[1], w1[2], selector);
      w3[1] = hc_byte_perm (w1[0], w1[1], selector);
      w3[0] = hc_byte_perm (w0[3], w1[0], selector);
      w2[3] = hc_byte_perm (w0[2], w0[3], selector);
      w2[2] = hc_byte_perm (w0[1], w0[2], selector);
      w2[1] = hc_byte_perm (w0[0], w0[1], selector);
      w2[0] = hc_byte_perm (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 9:
      w7[3] = hc_byte_perm (w5[1], w5[2], selector);
      w7[2] = hc_byte_perm (w5[0], w5[1], selector);
      w7[1] = hc_byte_perm (w4[3], w5[0], selector);
      w7[0] = hc_byte_perm (w4[2], w4[3], selector);
      w6[3] = hc_byte_perm (w4[1], w4[2], selector);
      w6[2] = hc_byte_perm (w4[0], w4[1], selector);
      w6[1] = hc_byte_perm (w3[3], w4[0], selector);
      w6[0] = hc_byte_perm (w3[2], w3[3], selector);
      w5[3] = hc_byte_perm (w3[1], w3[2], selector);
      w5[2] = hc_byte_perm (w3[0], w3[1], selector);
      w5[1] = hc_byte_perm (w2[3], w3[0], selector);
      w5[0] = hc_byte_perm (w2[2], w2[3], selector);
      w4[3] = hc_byte_perm (w2[1], w2[2], selector);
      w4[2] = hc_byte_perm (w2[0], w2[1], selector);
      w4[1] = hc_byte_perm (w1[3], w2[0], selector);
      w4[0] = hc_byte_perm (w1[2], w1[3], selector);
      w3[3] = hc_byte_perm (w1[1], w1[2], selector);
      w3[2] = hc_byte_perm (w1[0], w1[1], selector);
      w3[1] = hc_byte_perm (w0[3], w1[0], selector);
      w3[0] = hc_byte_perm (w0[2], w0[3], selector);
      w2[3] = hc_byte_perm (w0[1], w0[2], selector);
      w2[2] = hc_byte_perm (w0[0], w0[1], selector);
      w2[1] = hc_byte_perm (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 10:
      w7[3] = hc_byte_perm (w5[0], w5[1], selector);
      w7[2] = hc_byte_perm (w4[3], w5[0], selector);
      w7[1] = hc_byte_perm (w4[2], w4[3], selector);
      w7[0] = hc_byte_perm (w4[1], w4[2], selector);
      w6[3] = hc_byte_perm (w4[0], w4[1], selector);
      w6[2] = hc_byte_perm (w3[3], w4[0], selector);
      w6[1] = hc_byte_perm (w3[2], w3[3], selector);
      w6[0] = hc_byte_perm (w3[1], w3[2], selector);
      w5[3] = hc_byte_perm (w3[0], w3[1], selector);
      w5[2] = hc_byte_perm (w2[3], w3[0], selector);
      w5[1] = hc_byte_perm (w2[2], w2[3], selector);
      w5[0] = hc_byte_perm (w2[1], w2[2], selector);
      w4[3] = hc_byte_perm (w2[0], w2[1], selector);
      w4[2] = hc_byte_perm (w1[3], w2[0], selector);
      w4[1] = hc_byte_perm (w1[2], w1[3], selector);
      w4[0] = hc_byte_perm (w1[1], w1[2], selector);
      w3[3] = hc_byte_perm (w1[0], w1[1], selector);
      w3[2] = hc_byte_perm (w0[3], w1[0], selector);
      w3[1] = hc_byte_perm (w0[2], w0[3], selector);
      w3[0] = hc_byte_perm (w0[1], w0[2], selector);
      w2[3] = hc_byte_perm (w0[0], w0[1], selector);
      w2[2] = hc_byte_perm (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 11:
      w7[3] = hc_byte_perm (w4[3], w5[0], selector);
      w7[2] = hc_byte_perm (w4[2], w4[3], selector);
      w7[1] = hc_byte_perm (w4[1], w4[2], selector);
      w7[0] = hc_byte_perm (w4[0], w4[1], selector);
      w6[3] = hc_byte_perm (w3[3], w4[0], selector);
      w6[2] = hc_byte_perm (w3[2], w3[3], selector);
      w6[1] = hc_byte_perm (w3[1], w3[2], selector);
      w6[0] = hc_byte_perm (w3[0], w3[1], selector);
      w5[3] = hc_byte_perm (w2[3], w3[0], selector);
      w5[2] = hc_byte_perm (w2[2], w2[3], selector);
      w5[1] = hc_byte_perm (w2[1], w2[2], selector);
      w5[0] = hc_byte_perm (w2[0], w2[1], selector);
      w4[3] = hc_byte_perm (w1[3], w2[0], selector);
      w4[2] = hc_byte_perm (w1[2], w1[3], selector);
      w4[1] = hc_byte_perm (w1[1], w1[2], selector);
      w4[0] = hc_byte_perm (w1[0], w1[1], selector);
      w3[3] = hc_byte_perm (w0[3], w1[0], selector);
      w3[2] = hc_byte_perm (w0[2], w0[3], selector);
      w3[1] = hc_byte_perm (w0[1], w0[2], selector);
      w3[0] = hc_byte_perm (w0[0], w0[1], selector);
      w2[3] = hc_byte_perm (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 12:
      w7[3] = hc_byte_perm (w4[2], w4[3], selector);
      w7[2] = hc_byte_perm (w4[1], w4[2], selector);
      w7[1] = hc_byte_perm (w4[0], w4[1], selector);
      w7[0] = hc_byte_perm (w3[3], w4[0], selector);
      w6[3] = hc_byte_perm (w3[2], w3[3], selector);
      w6[2] = hc_byte_perm (w3[1], w3[2], selector);
      w6[1] = hc_byte_perm (w3[0], w3[1], selector);
      w6[0] = hc_byte_perm (w2[3], w3[0], selector);
      w5[3] = hc_byte_perm (w2[2], w2[3], selector);
      w5[2] = hc_byte_perm (w2[1], w2[2], selector);
      w5[1] = hc_byte_perm (w2[0], w2[1], selector);
      w5[0] = hc_byte_perm (w1[3], w2[0], selector);
      w4[3] = hc_byte_perm (w1[2], w1[3], selector);
      w4[2] = hc_byte_perm (w1[1], w1[2], selector);
      w4[1] = hc_byte_perm (w1[0], w1[1], selector);
      w4[0] = hc_byte_perm (w0[3], w1[0], selector);
      w3[3] = hc_byte_perm (w0[2], w0[3], selector);
      w3[2] = hc_byte_perm (w0[1], w0[2], selector);
      w3[1] = hc_byte_perm (w0[0], w0[1], selector);
      w3[0] = hc_byte_perm (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 13:
      w7[3] = hc_byte_perm (w4[1], w4[2], selector);
      w7[2] = hc_byte_perm (w4[0], w4[1], selector);
      w7[1] = hc_byte_perm (w3[3], w4[0], selector);
      w7[0] = hc_byte_perm (w3[2], w3[3], selector);
      w6[3] = hc_byte_perm (w3[1], w3[2], selector);
      w6[2] = hc_byte_perm (w3[0], w3[1], selector);
      w6[1] = hc_byte_perm (w2[3], w3[0], selector);
      w6[0] = hc_byte_perm (w2[2], w2[3], selector);
      w5[3] = hc_byte_perm (w2[1], w2[2], selector);
      w5[2] = hc_byte_perm (w2[0], w2[1], selector);
      w5[1] = hc_byte_perm (w1[3], w2[0], selector);
      w5[0] = hc_byte_perm (w1[2], w1[3], selector);
      w4[3] = hc_byte_perm (w1[1], w1[2], selector);
      w4[2] = hc_byte_perm (w1[0], w1[1], selector);
      w4[1] = hc_byte_perm (w0[3], w1[0], selector);
      w4[0] = hc_byte_perm (w0[2], w0[3], selector);
      w3[3] = hc_byte_perm (w0[1], w0[2], selector);
      w3[2] = hc_byte_perm (w0[0], w0[1], selector);
      w3[1] = hc_byte_perm (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 14:
      w7[3] = hc_byte_perm (w4[0], w4[1], selector);
      w7[2] = hc_byte_perm (w3[3], w4[0], selector);
      w7[1] = hc_byte_perm (w3[2], w3[3], selector);
      w7[0] = hc_byte_perm (w3[1], w3[2], selector);
      w6[3] = hc_byte_perm (w3[0], w3[1], selector);
      w6[2] = hc_byte_perm (w2[3], w3[0], selector);
      w6[1] = hc_byte_perm (w2[2], w2[3], selector);
      w6[0] = hc_byte_perm (w2[1], w2[2], selector);
      w5[3] = hc_byte_perm (w2[0], w2[1], selector);
      w5[2] = hc_byte_perm (w1[3], w2[0], selector);
      w5[1] = hc_byte_perm (w1[2], w1[3], selector);
      w5[0] = hc_byte_perm (w1[1], w1[2], selector);
      w4[3] = hc_byte_perm (w1[0], w1[1], selector);
      w4[2] = hc_byte_perm (w0[3], w1[0], selector);
      w4[1] = hc_byte_perm (w0[2], w0[3], selector);
      w4[0] = hc_byte_perm (w0[1], w0[2], selector);
      w3[3] = hc_byte_perm (w0[0], w0[1], selector);
      w3[2] = hc_byte_perm (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 15:
      w7[3] = hc_byte_perm (w3[3], w4[0], selector);
      w7[2] = hc_byte_perm (w3[2], w3[3], selector);
      w7[1] = hc_byte_perm (w3[1], w3[2], selector);
      w7[0] = hc_byte_perm (w3[0], w3[1], selector);
      w6[3] = hc_byte_perm (w2[3], w3[0], selector);
      w6[2] = hc_byte_perm (w2[2], w2[3], selector);
      w6[1] = hc_byte_perm (w2[1], w2[2], selector);
      w6[0] = hc_byte_perm (w2[0], w2[1], selector);
      w5[3] = hc_byte_perm (w1[3], w2[0], selector);
      w5[2] = hc_byte_perm (w1[2], w1[3], selector);
      w5[1] = hc_byte_perm (w1[1], w1[2], selector);
      w5[0] = hc_byte_perm (w1[0], w1[1], selector);
      w4[3] = hc_byte_perm (w0[3], w1[0], selector);
      w4[2] = hc_byte_perm (w0[2], w0[3], selector);
      w4[1] = hc_byte_perm (w0[1], w0[2], selector);
      w4[0] = hc_byte_perm (w0[0], w0[1], selector);
      w3[3] = hc_byte_perm (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_bytealign (w7[2], w7[3], offset);
      w7[2] = hc_bytealign (w7[1], w7[2], offset);
      w7[1] = hc_bytealign (w7[0], w7[1], offset);
      w7[0] = hc_bytealign (w6[3], w7[0], offset);
      w6[3] = hc_bytealign (w6[2], w6[3], offset);
      w6[2] = hc_bytealign (w6[1], w6[2], offset);
      w6[1] = hc_bytealign (w6[0], w6[1], offset);
      w6[0] = hc_bytealign (w5[3], w6[0], offset);
      w5[3] = hc_bytealign (w5[2], w5[3], offset);
      w5[2] = hc_bytealign (w5[1], w5[2], offset);
      w5[1] = hc_bytealign (w5[0], w5[1], offset);
      w5[0] = hc_bytealign (w4[3], w5[0], offset);
      w4[3] = hc_bytealign (w4[2], w4[3], offset);
      w4[2] = hc_bytealign (w4[1], w4[2], offset);
      w4[1] = hc_bytealign (w4[0], w4[1], offset);
      w4[0] = hc_bytealign (w3[3], w4[0], offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      w7[3] = hc_bytealign (w7[1], w7[2], offset);
      w7[2] = hc_bytealign (w7[0], w7[1], offset);
      w7[1] = hc_bytealign (w6[3], w7[0], offset);
      w7[0] = hc_bytealign (w6[2], w6[3], offset);
      w6[3] = hc_bytealign (w6[1], w6[2], offset);
      w6[2] = hc_bytealign (w6[0], w6[1], offset);
      w6[1] = hc_bytealign (w5[3], w6[0], offset);
      w6[0] = hc_bytealign (w5[2], w5[3], offset);
      w5[3] = hc_bytealign (w5[1], w5[2], offset);
      w5[2] = hc_bytealign (w5[0], w5[1], offset);
      w5[1] = hc_bytealign (w4[3], w5[0], offset);
      w5[0] = hc_bytealign (w4[2], w4[3], offset);
      w4[3] = hc_bytealign (w4[1], w4[2], offset);
      w4[2] = hc_bytealign (w4[0], w4[1], offset);
      w4[1] = hc_bytealign (w3[3], w4[0], offset);
      w4[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_bytealign (w7[0], w7[1], offset);
      w7[2] = hc_bytealign (w6[3], w7[0], offset);
      w7[1] = hc_bytealign (w6[2], w6[3], offset);
      w7[0] = hc_bytealign (w6[1], w6[2], offset);
      w6[3] = hc_bytealign (w6[0], w6[1], offset);
      w6[2] = hc_bytealign (w5[3], w6[0], offset);
      w6[1] = hc_bytealign (w5[2], w5[3], offset);
      w6[0] = hc_bytealign (w5[1], w5[2], offset);
      w5[3] = hc_bytealign (w5[0], w5[1], offset);
      w5[2] = hc_bytealign (w4[3], w5[0], offset);
      w5[1] = hc_bytealign (w4[2], w4[3], offset);
      w5[0] = hc_bytealign (w4[1], w4[2], offset);
      w4[3] = hc_bytealign (w4[0], w4[1], offset);
      w4[2] = hc_bytealign (w3[3], w4[0], offset);
      w4[1] = hc_bytealign (w3[2], w3[3], offset);
      w4[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_bytealign (w6[3], w7[0], offset);
      w7[2] = hc_bytealign (w6[2], w6[3], offset);
      w7[1] = hc_bytealign (w6[1], w6[2], offset);
      w7[0] = hc_bytealign (w6[0], w6[1], offset);
      w6[3] = hc_bytealign (w5[3], w6[0], offset);
      w6[2] = hc_bytealign (w5[2], w5[3], offset);
      w6[1] = hc_bytealign (w5[1], w5[2], offset);
      w6[0] = hc_bytealign (w5[0], w5[1], offset);
      w5[3] = hc_bytealign (w4[3], w5[0], offset);
      w5[2] = hc_bytealign (w4[2], w4[3], offset);
      w5[1] = hc_bytealign (w4[1], w4[2], offset);
      w5[0] = hc_bytealign (w4[0], w4[1], offset);
      w4[3] = hc_bytealign (w3[3], w4[0], offset);
      w4[2] = hc_bytealign (w3[2], w3[3], offset);
      w4[1] = hc_bytealign (w3[1], w3[2], offset);
      w4[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_bytealign (w6[2], w6[3], offset);
      w7[2] = hc_bytealign (w6[1], w6[2], offset);
      w7[1] = hc_bytealign (w6[0], w6[1], offset);
      w7[0] = hc_bytealign (w5[3], w6[0], offset);
      w6[3] = hc_bytealign (w5[2], w5[3], offset);
      w6[2] = hc_bytealign (w5[1], w5[2], offset);
      w6[1] = hc_bytealign (w5[0], w5[1], offset);
      w6[0] = hc_bytealign (w4[3], w5[0], offset);
      w5[3] = hc_bytealign (w4[2], w4[3], offset);
      w5[2] = hc_bytealign (w4[1], w4[2], offset);
      w5[1] = hc_bytealign (w4[0], w4[1], offset);
      w5[0] = hc_bytealign (w3[3], w4[0], offset);
      w4[3] = hc_bytealign (w3[2], w3[3], offset);
      w4[2] = hc_bytealign (w3[1], w3[2], offset);
      w4[1] = hc_bytealign (w3[0], w3[1], offset);
      w4[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_bytealign (w6[1], w6[2], offset);
      w7[2] = hc_bytealign (w6[0], w6[1], offset);
      w7[1] = hc_bytealign (w5[3], w6[0], offset);
      w7[0] = hc_bytealign (w5[2], w5[3], offset);
      w6[3] = hc_bytealign (w5[1], w5[2], offset);
      w6[2] = hc_bytealign (w5[0], w5[1], offset);
      w6[1] = hc_bytealign (w4[3], w5[0], offset);
      w6[0] = hc_bytealign (w4[2], w4[3], offset);
      w5[3] = hc_bytealign (w4[1], w4[2], offset);
      w5[2] = hc_bytealign (w4[0], w4[1], offset);
      w5[1] = hc_bytealign (w3[3], w4[0], offset);
      w5[0] = hc_bytealign (w3[2], w3[3], offset);
      w4[3] = hc_bytealign (w3[1], w3[2], offset);
      w4[2] = hc_bytealign (w3[0], w3[1], offset);
      w4[1] = hc_bytealign (w2[3], w3[0], offset);
      w4[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_bytealign (w6[0], w6[1], offset);
      w7[2] = hc_bytealign (w5[3], w6[0], offset);
      w7[1] = hc_bytealign (w5[2], w5[3], offset);
      w7[0] = hc_bytealign (w5[1], w5[2], offset);
      w6[3] = hc_bytealign (w5[0], w5[1], offset);
      w6[2] = hc_bytealign (w4[3], w5[0], offset);
      w6[1] = hc_bytealign (w4[2], w4[3], offset);
      w6[0] = hc_bytealign (w4[1], w4[2], offset);
      w5[3] = hc_bytealign (w4[0], w4[1], offset);
      w5[2] = hc_bytealign (w3[3], w4[0], offset);
      w5[1] = hc_bytealign (w3[2], w3[3], offset);
      w5[0] = hc_bytealign (w3[1], w3[2], offset);
      w4[3] = hc_bytealign (w3[0], w3[1], offset);
      w4[2] = hc_bytealign (w2[3], w3[0], offset);
      w4[1] = hc_bytealign (w2[2], w2[3], offset);
      w4[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_bytealign (w5[3], w6[0], offset);
      w7[2] = hc_bytealign (w5[2], w5[3], offset);
      w7[1] = hc_bytealign (w5[1], w5[2], offset);
      w7[0] = hc_bytealign (w5[0], w5[1], offset);
      w6[3] = hc_bytealign (w4[3], w5[0], offset);
      w6[2] = hc_bytealign (w4[2], w4[3], offset);
      w6[1] = hc_bytealign (w4[1], w4[2], offset);
      w6[0] = hc_bytealign (w4[0], w4[1], offset);
      w5[3] = hc_bytealign (w3[3], w4[0], offset);
      w5[2] = hc_bytealign (w3[2], w3[3], offset);
      w5[1] = hc_bytealign (w3[1], w3[2], offset);
      w5[0] = hc_bytealign (w3[0], w3[1], offset);
      w4[3] = hc_bytealign (w2[3], w3[0], offset);
      w4[2] = hc_bytealign (w2[2], w2[3], offset);
      w4[1] = hc_bytealign (w2[1], w2[2], offset);
      w4[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_bytealign (w5[2], w5[3], offset);
      w7[2] = hc_bytealign (w5[1], w5[2], offset);
      w7[1] = hc_bytealign (w5[0], w5[1], offset);
      w7[0] = hc_bytealign (w4[3], w5[0], offset);
      w6[3] = hc_bytealign (w4[2], w4[3], offset);
      w6[2] = hc_bytealign (w4[1], w4[2], offset);
      w6[1] = hc_bytealign (w4[0], w4[1], offset);
      w6[0] = hc_bytealign (w3[3], w4[0], offset);
      w5[3] = hc_bytealign (w3[2], w3[3], offset);
      w5[2] = hc_bytealign (w3[1], w3[2], offset);
      w5[1] = hc_bytealign (w3[0], w3[1], offset);
      w5[0] = hc_bytealign (w2[3], w3[0], offset);
      w4[3] = hc_bytealign (w2[2], w2[3], offset);
      w4[2] = hc_bytealign (w2[1], w2[2], offset);
      w4[1] = hc_bytealign (w2[0], w2[1], offset);
      w4[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_bytealign (w5[1], w5[2], offset);
      w7[2] = hc_bytealign (w5[0], w5[1], offset);
      w7[1] = hc_bytealign (w4[3], w5[0], offset);
      w7[0] = hc_bytealign (w4[2], w4[3], offset);
      w6[3] = hc_bytealign (w4[1], w4[2], offset);
      w6[2] = hc_bytealign (w4[0], w4[1], offset);
      w6[1] = hc_bytealign (w3[3], w4[0], offset);
      w6[0] = hc_bytealign (w3[2], w3[3], offset);
      w5[3] = hc_bytealign (w3[1], w3[2], offset);
      w5[2] = hc_bytealign (w3[0], w3[1], offset);
      w5[1] = hc_bytealign (w2[3], w3[0], offset);
      w5[0] = hc_bytealign (w2[2], w2[3], offset);
      w4[3] = hc_bytealign (w2[1], w2[2], offset);
      w4[2] = hc_bytealign (w2[0], w2[1], offset);
      w4[1] = hc_bytealign (w1[3], w2[0], offset);
      w4[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_bytealign (w5[0], w5[1], offset);
      w7[2] = hc_bytealign (w4[3], w5[0], offset);
      w7[1] = hc_bytealign (w4[2], w4[3], offset);
      w7[0] = hc_bytealign (w4[1], w4[2], offset);
      w6[3] = hc_bytealign (w4[0], w4[1], offset);
      w6[2] = hc_bytealign (w3[3], w4[0], offset);
      w6[1] = hc_bytealign (w3[2], w3[3], offset);
      w6[0] = hc_bytealign (w3[1], w3[2], offset);
      w5[3] = hc_bytealign (w3[0], w3[1], offset);
      w5[2] = hc_bytealign (w2[3], w3[0], offset);
      w5[1] = hc_bytealign (w2[2], w2[3], offset);
      w5[0] = hc_bytealign (w2[1], w2[2], offset);
      w4[3] = hc_bytealign (w2[0], w2[1], offset);
      w4[2] = hc_bytealign (w1[3], w2[0], offset);
      w4[1] = hc_bytealign (w1[2], w1[3], offset);
      w4[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_bytealign (w4[3], w5[0], offset);
      w7[2] = hc_bytealign (w4[2], w4[3], offset);
      w7[1] = hc_bytealign (w4[1], w4[2], offset);
      w7[0] = hc_bytealign (w4[0], w4[1], offset);
      w6[3] = hc_bytealign (w3[3], w4[0], offset);
      w6[2] = hc_bytealign (w3[2], w3[3], offset);
      w6[1] = hc_bytealign (w3[1], w3[2], offset);
      w6[0] = hc_bytealign (w3[0], w3[1], offset);
      w5[3] = hc_bytealign (w2[3], w3[0], offset);
      w5[2] = hc_bytealign (w2[2], w2[3], offset);
      w5[1] = hc_bytealign (w2[1], w2[2], offset);
      w5[0] = hc_bytealign (w2[0], w2[1], offset);
      w4[3] = hc_bytealign (w1[3], w2[0], offset);
      w4[2] = hc_bytealign (w1[2], w1[3], offset);
      w4[1] = hc_bytealign (w1[1], w1[2], offset);
      w4[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_bytealign (w4[2], w4[3], offset);
      w7[2] = hc_bytealign (w4[1], w4[2], offset);
      w7[1] = hc_bytealign (w4[0], w4[1], offset);
      w7[0] = hc_bytealign (w3[3], w4[0], offset);
      w6[3] = hc_bytealign (w3[2], w3[3], offset);
      w6[2] = hc_bytealign (w3[1], w3[2], offset);
      w6[1] = hc_bytealign (w3[0], w3[1], offset);
      w6[0] = hc_bytealign (w2[3], w3[0], offset);
      w5[3] = hc_bytealign (w2[2], w2[3], offset);
      w5[2] = hc_bytealign (w2[1], w2[2], offset);
      w5[1] = hc_bytealign (w2[0], w2[1], offset);
      w5[0] = hc_bytealign (w1[3], w2[0], offset);
      w4[3] = hc_bytealign (w1[2], w1[3], offset);
      w4[2] = hc_bytealign (w1[1], w1[2], offset);
      w4[1] = hc_bytealign (w1[0], w1[1], offset);
      w4[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_bytealign (w4[1], w4[2], offset);
      w7[2] = hc_bytealign (w4[0], w4[1], offset);
      w7[1] = hc_bytealign (w3[3], w4[0], offset);
      w7[0] = hc_bytealign (w3[2], w3[3], offset);
      w6[3] = hc_bytealign (w3[1], w3[2], offset);
      w6[2] = hc_bytealign (w3[0], w3[1], offset);
      w6[1] = hc_bytealign (w2[3], w3[0], offset);
      w6[0] = hc_bytealign (w2[2], w2[3], offset);
      w5[3] = hc_bytealign (w2[1], w2[2], offset);
      w5[2] = hc_bytealign (w2[0], w2[1], offset);
      w5[1] = hc_bytealign (w1[3], w2[0], offset);
      w5[0] = hc_bytealign (w1[2], w1[3], offset);
      w4[3] = hc_bytealign (w1[1], w1[2], offset);
      w4[2] = hc_bytealign (w1[0], w1[1], offset);
      w4[1] = hc_bytealign (w0[3], w1[0], offset);
      w4[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_bytealign (w4[0], w4[1], offset);
      w7[2] = hc_bytealign (w3[3], w4[0], offset);
      w7[1] = hc_bytealign (w3[2], w3[3], offset);
      w7[0] = hc_bytealign (w3[1], w3[2], offset);
      w6[3] = hc_bytealign (w3[0], w3[1], offset);
      w6[2] = hc_bytealign (w2[3], w3[0], offset);
      w6[1] = hc_bytealign (w2[2], w2[3], offset);
      w6[0] = hc_bytealign (w2[1], w2[2], offset);
      w5[3] = hc_bytealign (w2[0], w2[1], offset);
      w5[2] = hc_bytealign (w1[3], w2[0], offset);
      w5[1] = hc_bytealign (w1[2], w1[3], offset);
      w5[0] = hc_bytealign (w1[1], w1[2], offset);
      w4[3] = hc_bytealign (w1[0], w1[1], offset);
      w4[2] = hc_bytealign (w0[3], w1[0], offset);
      w4[1] = hc_bytealign (w0[2], w0[3], offset);
      w4[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_bytealign (w3[3], w4[0], offset);
      w7[2] = hc_bytealign (w3[2], w3[3], offset);
      w7[1] = hc_bytealign (w3[1], w3[2], offset);
      w7[0] = hc_bytealign (w3[0], w3[1], offset);
      w6[3] = hc_bytealign (w2[3], w3[0], offset);
      w6[2] = hc_bytealign (w2[2], w2[3], offset);
      w6[1] = hc_bytealign (w2[1], w2[2], offset);
      w6[0] = hc_bytealign (w2[0], w2[1], offset);
      w5[3] = hc_bytealign (w1[3], w2[0], offset);
      w5[2] = hc_bytealign (w1[2], w1[3], offset);
      w5[1] = hc_bytealign (w1[1], w1[2], offset);
      w5[0] = hc_bytealign (w1[0], w1[1], offset);
      w4[3] = hc_bytealign (w0[3], w1[0], offset);
      w4[2] = hc_bytealign (w0[2], w0[3], offset);
      w4[1] = hc_bytealign (w0[1], w0[2], offset);
      w4[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_bytealign (w3[2], w3[3], offset);
      w7[2] = hc_bytealign (w3[1], w3[2], offset);
      w7[1] = hc_bytealign (w3[0], w3[1], offset);
      w7[0] = hc_bytealign (w2[3], w3[0], offset);
      w6[3] = hc_bytealign (w2[2], w2[3], offset);
      w6[2] = hc_bytealign (w2[1], w2[2], offset);
      w6[1] = hc_bytealign (w2[0], w2[1], offset);
      w6[0] = hc_bytealign (w1[3], w2[0], offset);
      w5[3] = hc_bytealign (w1[2], w1[3], offset);
      w5[2] = hc_bytealign (w1[1], w1[2], offset);
      w5[1] = hc_bytealign (w1[0], w1[1], offset);
      w5[0] = hc_bytealign (w0[3], w1[0], offset);
      w4[3] = hc_bytealign (w0[2], w0[3], offset);
      w4[2] = hc_bytealign (w0[1], w0[2], offset);
      w4[1] = hc_bytealign (w0[0], w0[1], offset);
      w4[0] = hc_bytealign (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_bytealign (w3[1], w3[2], offset);
      w7[2] = hc_bytealign (w3[0], w3[1], offset);
      w7[1] = hc_bytealign (w2[3], w3[0], offset);
      w7[0] = hc_bytealign (w2[2], w2[3], offset);
      w6[3] = hc_bytealign (w2[1], w2[2], offset);
      w6[2] = hc_bytealign (w2[0], w2[1], offset);
      w6[1] = hc_bytealign (w1[3], w2[0], offset);
      w6[0] = hc_bytealign (w1[2], w1[3], offset);
      w5[3] = hc_bytealign (w1[1], w1[2], offset);
      w5[2] = hc_bytealign (w1[0], w1[1], offset);
      w5[1] = hc_bytealign (w0[3], w1[0], offset);
      w5[0] = hc_bytealign (w0[2], w0[3], offset);
      w4[3] = hc_bytealign (w0[1], w0[2], offset);
      w4[2] = hc_bytealign (w0[0], w0[1], offset);
      w4[1] = hc_bytealign (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_bytealign (w3[0], w3[1], offset);
      w7[2] = hc_bytealign (w2[3], w3[0], offset);
      w7[1] = hc_bytealign (w2[2], w2[3], offset);
      w7[0] = hc_bytealign (w2[1], w2[2], offset);
      w6[3] = hc_bytealign (w2[0], w2[1], offset);
      w6[2] = hc_bytealign (w1[3], w2[0], offset);
      w6[1] = hc_bytealign (w1[2], w1[3], offset);
      w6[0] = hc_bytealign (w1[1], w1[2], offset);
      w5[3] = hc_bytealign (w1[0], w1[1], offset);
      w5[2] = hc_bytealign (w0[3], w1[0], offset);
      w5[1] = hc_bytealign (w0[2], w0[3], offset);
      w5[0] = hc_bytealign (w0[1], w0[2], offset);
      w4[3] = hc_bytealign (w0[0], w0[1], offset);
      w4[2] = hc_bytealign (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_bytealign (w2[3], w3[0], offset);
      w7[2] = hc_bytealign (w2[2], w2[3], offset);
      w7[1] = hc_bytealign (w2[1], w2[2], offset);
      w7[0] = hc_bytealign (w2[0], w2[1], offset);
      w6[3] = hc_bytealign (w1[3], w2[0], offset);
      w6[2] = hc_bytealign (w1[2], w1[3], offset);
      w6[1] = hc_bytealign (w1[1], w1[2], offset);
      w6[0] = hc_bytealign (w1[0], w1[1], offset);
      w5[3] = hc_bytealign (w0[3], w1[0], offset);
      w5[2] = hc_bytealign (w0[2], w0[3], offset);
      w5[1] = hc_bytealign (w0[1], w0[2], offset);
      w5[0] = hc_bytealign (w0[0], w0[1], offset);
      w4[3] = hc_bytealign (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_bytealign (w2[2], w2[3], offset);
      w7[2] = hc_bytealign (w2[1], w2[2], offset);
      w7[1] = hc_bytealign (w2[0], w2[1], offset);
      w7[0] = hc_bytealign (w1[3], w2[0], offset);
      w6[3] = hc_bytealign (w1[2], w1[3], offset);
      w6[2] = hc_bytealign (w1[1], w1[2], offset);
      w6[1] = hc_bytealign (w1[0], w1[1], offset);
      w6[0] = hc_bytealign (w0[3], w1[0], offset);
      w5[3] = hc_bytealign (w0[2], w0[3], offset);
      w5[2] = hc_bytealign (w0[1], w0[2], offset);
      w5[1] = hc_bytealign (w0[0], w0[1], offset);
      w5[0] = hc_bytealign (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_bytealign (w2[1], w2[2], offset);
      w7[2] = hc_bytealign (w2[0], w2[1], offset);
      w7[1] = hc_bytealign (w1[3], w2[0], offset);
      w7[0] = hc_bytealign (w1[2], w1[3], offset);
      w6[3] = hc_bytealign (w1[1], w1[2], offset);
      w6[2] = hc_bytealign (w1[0], w1[1], offset);
      w6[1] = hc_bytealign (w0[3], w1[0], offset);
      w6[0] = hc_bytealign (w0[2], w0[3], offset);
      w5[3] = hc_bytealign (w0[1], w0[2], offset);
      w5[2] = hc_bytealign (w0[0], w0[1], offset);
      w5[1] = hc_bytealign (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_bytealign (w2[0], w2[1], offset);
      w7[2] = hc_bytealign (w1[3], w2[0], offset);
      w7[1] = hc_bytealign (w1[2], w1[3], offset);
      w7[0] = hc_bytealign (w1[1], w1[2], offset);
      w6[3] = hc_bytealign (w1[0], w1[1], offset);
      w6[2] = hc_bytealign (w0[3], w1[0], offset);
      w6[1] = hc_bytealign (w0[2], w0[3], offset);
      w6[0] = hc_bytealign (w0[1], w0[2], offset);
      w5[3] = hc_bytealign (w0[0], w0[1], offset);
      w5[2] = hc_bytealign (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_bytealign (w1[3], w2[0], offset);
      w7[2] = hc_bytealign (w1[2], w1[3], offset);
      w7[1] = hc_bytealign (w1[1], w1[2], offset);
      w7[0] = hc_bytealign (w1[0], w1[1], offset);
      w6[3] = hc_bytealign (w0[3], w1[0], offset);
      w6[2] = hc_bytealign (w0[2], w0[3], offset);
      w6[1] = hc_bytealign (w0[1], w0[2], offset);
      w6[0] = hc_bytealign (w0[0], w0[1], offset);
      w5[3] = hc_bytealign (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_bytealign (w1[2], w1[3], offset);
      w7[2] = hc_bytealign (w1[1], w1[2], offset);
      w7[1] = hc_bytealign (w1[0], w1[1], offset);
      w7[0] = hc_bytealign (w0[3], w1[0], offset);
      w6[3] = hc_bytealign (w0[2], w0[3], offset);
      w6[2] = hc_bytealign (w0[1], w0[2], offset);
      w6[1] = hc_bytealign (w0[0], w0[1], offset);
      w6[0] = hc_bytealign (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_bytealign (w1[1], w1[2], offset);
      w7[2] = hc_bytealign (w1[0], w1[1], offset);
      w7[1] = hc_bytealign (w0[3], w1[0], offset);
      w7[0] = hc_bytealign (w0[2], w0[3], offset);
      w6[3] = hc_bytealign (w0[1], w0[2], offset);
      w6[2] = hc_bytealign (w0[0], w0[1], offset);
      w6[1] = hc_bytealign (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_bytealign (w1[0], w1[1], offset);
      w7[2] = hc_bytealign (w0[3], w1[0], offset);
      w7[1] = hc_bytealign (w0[2], w0[3], offset);
      w7[0] = hc_bytealign (w0[1], w0[2], offset);
      w6[3] = hc_bytealign (w0[0], w0[1], offset);
      w6[2] = hc_bytealign (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_bytealign (w0[3], w1[0], offset);
      w7[2] = hc_bytealign (w0[2], w0[3], offset);
      w7[1] = hc_bytealign (w0[1], w0[2], offset);
      w7[0] = hc_bytealign (w0[0], w0[1], offset);
      w6[3] = hc_bytealign (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_bytealign (w0[2], w0[3], offset);
      w7[2] = hc_bytealign (w0[1], w0[2], offset);
      w7[1] = hc_bytealign (w0[0], w0[1], offset);
      w7[0] = hc_bytealign (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_bytealign (w0[1], w0[2], offset);
      w7[2] = hc_bytealign (w0[0], w0[1], offset);
      w7[1] = hc_bytealign (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_bytealign (w0[0], w0[1], offset);
      w7[2] = hc_bytealign (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_bytealign (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_byte_perm (w7[3], w7[2], selector);
      w7[2] = hc_byte_perm (w7[2], w7[1], selector);
      w7[1] = hc_byte_perm (w7[1], w7[0], selector);
      w7[0] = hc_byte_perm (w7[0], w6[3], selector);
      w6[3] = hc_byte_perm (w6[3], w6[2], selector);
      w6[2] = hc_byte_perm (w6[2], w6[1], selector);
      w6[1] = hc_byte_perm (w6[1], w6[0], selector);
      w6[0] = hc_byte_perm (w6[0], w5[3], selector);
      w5[3] = hc_byte_perm (w5[3], w5[2], selector);
      w5[2] = hc_byte_perm (w5[2], w5[1], selector);
      w5[1] = hc_byte_perm (w5[1], w5[0], selector);
      w5[0] = hc_byte_perm (w5[0], w4[3], selector);
      w4[3] = hc_byte_perm (w4[3], w4[2], selector);
      w4[2] = hc_byte_perm (w4[2], w4[1], selector);
      w4[1] = hc_byte_perm (w4[1], w4[0], selector);
      w4[0] = hc_byte_perm (w4[0], w3[3], selector);
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      w7[3] = hc_byte_perm (w7[2], w7[1], selector);
      w7[2] = hc_byte_perm (w7[1], w7[0], selector);
      w7[1] = hc_byte_perm (w7[0], w6[3], selector);
      w7[0] = hc_byte_perm (w6[3], w6[2], selector);
      w6[3] = hc_byte_perm (w6[2], w6[1], selector);
      w6[2] = hc_byte_perm (w6[1], w6[0], selector);
      w6[1] = hc_byte_perm (w6[0], w5[3], selector);
      w6[0] = hc_byte_perm (w5[3], w5[2], selector);
      w5[3] = hc_byte_perm (w5[2], w5[1], selector);
      w5[2] = hc_byte_perm (w5[1], w5[0], selector);
      w5[1] = hc_byte_perm (w5[0], w4[3], selector);
      w5[0] = hc_byte_perm (w4[3], w4[2], selector);
      w4[3] = hc_byte_perm (w4[2], w4[1], selector);
      w4[2] = hc_byte_perm (w4[1], w4[0], selector);
      w4[1] = hc_byte_perm (w4[0], w3[3], selector);
      w4[0] = hc_byte_perm (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_byte_perm (w7[1], w7[0], selector);
      w7[2] = hc_byte_perm (w7[0], w6[3], selector);
      w7[1] = hc_byte_perm (w6[3], w6[2], selector);
      w7[0] = hc_byte_perm (w6[2], w6[1], selector);
      w6[3] = hc_byte_perm (w6[1], w6[0], selector);
      w6[2] = hc_byte_perm (w6[0], w5[3], selector);
      w6[1] = hc_byte_perm (w5[3], w5[2], selector);
      w6[0] = hc_byte_perm (w5[2], w5[1], selector);
      w5[3] = hc_byte_perm (w5[1], w5[0], selector);
      w5[2] = hc_byte_perm (w5[0], w4[3], selector);
      w5[1] = hc_byte_perm (w4[3], w4[2], selector);
      w5[0] = hc_byte_perm (w4[2], w4[1], selector);
      w4[3] = hc_byte_perm (w4[1], w4[0], selector);
      w4[2] = hc_byte_perm (w4[0], w3[3], selector);
      w4[1] = hc_byte_perm (w3[3], w3[2], selector);
      w4[0] = hc_byte_perm (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_byte_perm (w7[0], w6[3], selector);
      w7[2] = hc_byte_perm (w6[3], w6[2], selector);
      w7[1] = hc_byte_perm (w6[2], w6[1], selector);
      w7[0] = hc_byte_perm (w6[1], w6[0], selector);
      w6[3] = hc_byte_perm (w6[0], w5[3], selector);
      w6[2] = hc_byte_perm (w5[3], w5[2], selector);
      w6[1] = hc_byte_perm (w5[2], w5[1], selector);
      w6[0] = hc_byte_perm (w5[1], w5[0], selector);
      w5[3] = hc_byte_perm (w5[0], w4[3], selector);
      w5[2] = hc_byte_perm (w4[3], w4[2], selector);
      w5[1] = hc_byte_perm (w4[2], w4[1], selector);
      w5[0] = hc_byte_perm (w4[1], w4[0], selector);
      w4[3] = hc_byte_perm (w4[0], w3[3], selector);
      w4[2] = hc_byte_perm (w3[3], w3[2], selector);
      w4[1] = hc_byte_perm (w3[2], w3[1], selector);
      w4[0] = hc_byte_perm (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_byte_perm (w6[3], w6[2], selector);
      w7[2] = hc_byte_perm (w6[2], w6[1], selector);
      w7[1] = hc_byte_perm (w6[1], w6[0], selector);
      w7[0] = hc_byte_perm (w6[0], w5[3], selector);
      w6[3] = hc_byte_perm (w5[3], w5[2], selector);
      w6[2] = hc_byte_perm (w5[2], w5[1], selector);
      w6[1] = hc_byte_perm (w5[1], w5[0], selector);
      w6[0] = hc_byte_perm (w5[0], w4[3], selector);
      w5[3] = hc_byte_perm (w4[3], w4[2], selector);
      w5[2] = hc_byte_perm (w4[2], w4[1], selector);
      w5[1] = hc_byte_perm (w4[1], w4[0], selector);
      w5[0] = hc_byte_perm (w4[0], w3[3], selector);
      w4[3] = hc_byte_perm (w3[3], w3[2], selector);
      w4[2] = hc_byte_perm (w3[2], w3[1], selector);
      w4[1] = hc_byte_perm (w3[1], w3[0], selector);
      w4[0] = hc_byte_perm (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_byte_perm (w6[2], w6[1], selector);
      w7[2] = hc_byte_perm (w6[1], w6[0], selector);
      w7[1] = hc_byte_perm (w6[0], w5[3], selector);
      w7[0] = hc_byte_perm (w5[3], w5[2], selector);
      w6[3] = hc_byte_perm (w5[2], w5[1], selector);
      w6[2] = hc_byte_perm (w5[1], w5[0], selector);
      w6[1] = hc_byte_perm (w5[0], w4[3], selector);
      w6[0] = hc_byte_perm (w4[3], w4[2], selector);
      w5[3] = hc_byte_perm (w4[2], w4[1], selector);
      w5[2] = hc_byte_perm (w4[1], w4[0], selector);
      w5[1] = hc_byte_perm (w4[0], w3[3], selector);
      w5[0] = hc_byte_perm (w3[3], w3[2], selector);
      w4[3] = hc_byte_perm (w3[2], w3[1], selector);
      w4[2] = hc_byte_perm (w3[1], w3[0], selector);
      w4[1] = hc_byte_perm (w3[0], w2[3], selector);
      w4[0] = hc_byte_perm (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_byte_perm (w6[1], w6[0], selector);
      w7[2] = hc_byte_perm (w6[0], w5[3], selector);
      w7[1] = hc_byte_perm (w5[3], w5[2], selector);
      w7[0] = hc_byte_perm (w5[2], w5[1], selector);
      w6[3] = hc_byte_perm (w5[1], w5[0], selector);
      w6[2] = hc_byte_perm (w5[0], w4[3], selector);
      w6[1] = hc_byte_perm (w4[3], w4[2], selector);
      w6[0] = hc_byte_perm (w4[2], w4[1], selector);
      w5[3] = hc_byte_perm (w4[1], w4[0], selector);
      w5[2] = hc_byte_perm (w4[0], w3[3], selector);
      w5[1] = hc_byte_perm (w3[3], w3[2], selector);
      w5[0] = hc_byte_perm (w3[2], w3[1], selector);
      w4[3] = hc_byte_perm (w3[1], w3[0], selector);
      w4[2] = hc_byte_perm (w3[0], w2[3], selector);
      w4[1] = hc_byte_perm (w2[3], w2[2], selector);
      w4[0] = hc_byte_perm (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_byte_perm (w6[0], w5[3], selector);
      w7[2] = hc_byte_perm (w5[3], w5[2], selector);
      w7[1] = hc_byte_perm (w5[2], w5[1], selector);
      w7[0] = hc_byte_perm (w5[1], w5[0], selector);
      w6[3] = hc_byte_perm (w5[0], w4[3], selector);
      w6[2] = hc_byte_perm (w4[3], w4[2], selector);
      w6[1] = hc_byte_perm (w4[2], w4[1], selector);
      w6[0] = hc_byte_perm (w4[1], w4[0], selector);
      w5[3] = hc_byte_perm (w4[0], w3[3], selector);
      w5[2] = hc_byte_perm (w3[3], w3[2], selector);
      w5[1] = hc_byte_perm (w3[2], w3[1], selector);
      w5[0] = hc_byte_perm (w3[1], w3[0], selector);
      w4[3] = hc_byte_perm (w3[0], w2[3], selector);
      w4[2] = hc_byte_perm (w2[3], w2[2], selector);
      w4[1] = hc_byte_perm (w2[2], w2[1], selector);
      w4[0] = hc_byte_perm (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_byte_perm (w5[3], w5[2], selector);
      w7[2] = hc_byte_perm (w5[2], w5[1], selector);
      w7[1] = hc_byte_perm (w5[1], w5[0], selector);
      w7[0] = hc_byte_perm (w5[0], w4[3], selector);
      w6[3] = hc_byte_perm (w4[3], w4[2], selector);
      w6[2] = hc_byte_perm (w4[2], w4[1], selector);
      w6[1] = hc_byte_perm (w4[1], w4[0], selector);
      w6[0] = hc_byte_perm (w4[0], w3[3], selector);
      w5[3] = hc_byte_perm (w3[3], w3[2], selector);
      w5[2] = hc_byte_perm (w3[2], w3[1], selector);
      w5[1] = hc_byte_perm (w3[1], w3[0], selector);
      w5[0] = hc_byte_perm (w3[0], w2[3], selector);
      w4[3] = hc_byte_perm (w2[3], w2[2], selector);
      w4[2] = hc_byte_perm (w2[2], w2[1], selector);
      w4[1] = hc_byte_perm (w2[1], w2[0], selector);
      w4[0] = hc_byte_perm (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_byte_perm (w5[2], w5[1], selector);
      w7[2] = hc_byte_perm (w5[1], w5[0], selector);
      w7[1] = hc_byte_perm (w5[0], w4[3], selector);
      w7[0] = hc_byte_perm (w4[3], w4[2], selector);
      w6[3] = hc_byte_perm (w4[2], w4[1], selector);
      w6[2] = hc_byte_perm (w4[1], w4[0], selector);
      w6[1] = hc_byte_perm (w4[0], w3[3], selector);
      w6[0] = hc_byte_perm (w3[3], w3[2], selector);
      w5[3] = hc_byte_perm (w3[2], w3[1], selector);
      w5[2] = hc_byte_perm (w3[1], w3[0], selector);
      w5[1] = hc_byte_perm (w3[0], w2[3], selector);
      w5[0] = hc_byte_perm (w2[3], w2[2], selector);
      w4[3] = hc_byte_perm (w2[2], w2[1], selector);
      w4[2] = hc_byte_perm (w2[1], w2[0], selector);
      w4[1] = hc_byte_perm (w2[0], w1[3], selector);
      w4[0] = hc_byte_perm (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_byte_perm (w5[1], w5[0], selector);
      w7[2] = hc_byte_perm (w5[0], w4[3], selector);
      w7[1] = hc_byte_perm (w4[3], w4[2], selector);
      w7[0] = hc_byte_perm (w4[2], w4[1], selector);
      w6[3] = hc_byte_perm (w4[1], w4[0], selector);
      w6[2] = hc_byte_perm (w4[0], w3[3], selector);
      w6[1] = hc_byte_perm (w3[3], w3[2], selector);
      w6[0] = hc_byte_perm (w3[2], w3[1], selector);
      w5[3] = hc_byte_perm (w3[1], w3[0], selector);
      w5[2] = hc_byte_perm (w3[0], w2[3], selector);
      w5[1] = hc_byte_perm (w2[3], w2[2], selector);
      w5[0] = hc_byte_perm (w2[2], w2[1], selector);
      w4[3] = hc_byte_perm (w2[1], w2[0], selector);
      w4[2] = hc_byte_perm (w2[0], w1[3], selector);
      w4[1] = hc_byte_perm (w1[3], w1[2], selector);
      w4[0] = hc_byte_perm (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_byte_perm (w5[0], w4[3], selector);
      w7[2] = hc_byte_perm (w4[3], w4[2], selector);
      w7[1] = hc_byte_perm (w4[2], w4[1], selector);
      w7[0] = hc_byte_perm (w4[1], w4[0], selector);
      w6[3] = hc_byte_perm (w4[0], w3[3], selector);
      w6[2] = hc_byte_perm (w3[3], w3[2], selector);
      w6[1] = hc_byte_perm (w3[2], w3[1], selector);
      w6[0] = hc_byte_perm (w3[1], w3[0], selector);
      w5[3] = hc_byte_perm (w3[0], w2[3], selector);
      w5[2] = hc_byte_perm (w2[3], w2[2], selector);
      w5[1] = hc_byte_perm (w2[2], w2[1], selector);
      w5[0] = hc_byte_perm (w2[1], w2[0], selector);
      w4[3] = hc_byte_perm (w2[0], w1[3], selector);
      w4[2] = hc_byte_perm (w1[3], w1[2], selector);
      w4[1] = hc_byte_perm (w1[2], w1[1], selector);
      w4[0] = hc_byte_perm (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_byte_perm (w4[3], w4[2], selector);
      w7[2] = hc_byte_perm (w4[2], w4[1], selector);
      w7[1] = hc_byte_perm (w4[1], w4[0], selector);
      w7[0] = hc_byte_perm (w4[0], w3[3], selector);
      w6[3] = hc_byte_perm (w3[3], w3[2], selector);
      w6[2] = hc_byte_perm (w3[2], w3[1], selector);
      w6[1] = hc_byte_perm (w3[1], w3[0], selector);
      w6[0] = hc_byte_perm (w3[0], w2[3], selector);
      w5[3] = hc_byte_perm (w2[3], w2[2], selector);
      w5[2] = hc_byte_perm (w2[2], w2[1], selector);
      w5[1] = hc_byte_perm (w2[1], w2[0], selector);
      w5[0] = hc_byte_perm (w2[0], w1[3], selector);
      w4[3] = hc_byte_perm (w1[3], w1[2], selector);
      w4[2] = hc_byte_perm (w1[2], w1[1], selector);
      w4[1] = hc_byte_perm (w1[1], w1[0], selector);
      w4[0] = hc_byte_perm (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_byte_perm (w4[2], w4[1], selector);
      w7[2] = hc_byte_perm (w4[1], w4[0], selector);
      w7[1] = hc_byte_perm (w4[0], w3[3], selector);
      w7[0] = hc_byte_perm (w3[3], w3[2], selector);
      w6[3] = hc_byte_perm (w3[2], w3[1], selector);
      w6[2] = hc_byte_perm (w3[1], w3[0], selector);
      w6[1] = hc_byte_perm (w3[0], w2[3], selector);
      w6[0] = hc_byte_perm (w2[3], w2[2], selector);
      w5[3] = hc_byte_perm (w2[2], w2[1], selector);
      w5[2] = hc_byte_perm (w2[1], w2[0], selector);
      w5[1] = hc_byte_perm (w2[0], w1[3], selector);
      w5[0] = hc_byte_perm (w1[3], w1[2], selector);
      w4[3] = hc_byte_perm (w1[2], w1[1], selector);
      w4[2] = hc_byte_perm (w1[1], w1[0], selector);
      w4[1] = hc_byte_perm (w1[0], w0[3], selector);
      w4[0] = hc_byte_perm (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_byte_perm (w4[1], w4[0], selector);
      w7[2] = hc_byte_perm (w4[0], w3[3], selector);
      w7[1] = hc_byte_perm (w3[3], w3[2], selector);
      w7[0] = hc_byte_perm (w3[2], w3[1], selector);
      w6[3] = hc_byte_perm (w3[1], w3[0], selector);
      w6[2] = hc_byte_perm (w3[0], w2[3], selector);
      w6[1] = hc_byte_perm (w2[3], w2[2], selector);
      w6[0] = hc_byte_perm (w2[2], w2[1], selector);
      w5[3] = hc_byte_perm (w2[1], w2[0], selector);
      w5[2] = hc_byte_perm (w2[0], w1[3], selector);
      w5[1] = hc_byte_perm (w1[3], w1[2], selector);
      w5[0] = hc_byte_perm (w1[2], w1[1], selector);
      w4[3] = hc_byte_perm (w1[1], w1[0], selector);
      w4[2] = hc_byte_perm (w1[0], w0[3], selector);
      w4[1] = hc_byte_perm (w0[3], w0[2], selector);
      w4[0] = hc_byte_perm (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_byte_perm (w4[0], w3[3], selector);
      w7[2] = hc_byte_perm (w3[3], w3[2], selector);
      w7[1] = hc_byte_perm (w3[2], w3[1], selector);
      w7[0] = hc_byte_perm (w3[1], w3[0], selector);
      w6[3] = hc_byte_perm (w3[0], w2[3], selector);
      w6[2] = hc_byte_perm (w2[3], w2[2], selector);
      w6[1] = hc_byte_perm (w2[2], w2[1], selector);
      w6[0] = hc_byte_perm (w2[1], w2[0], selector);
      w5[3] = hc_byte_perm (w2[0], w1[3], selector);
      w5[2] = hc_byte_perm (w1[3], w1[2], selector);
      w5[1] = hc_byte_perm (w1[2], w1[1], selector);
      w5[0] = hc_byte_perm (w1[1], w1[0], selector);
      w4[3] = hc_byte_perm (w1[0], w0[3], selector);
      w4[2] = hc_byte_perm (w0[3], w0[2], selector);
      w4[1] = hc_byte_perm (w0[2], w0[1], selector);
      w4[0] = hc_byte_perm (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_byte_perm (w3[3], w3[2], selector);
      w7[2] = hc_byte_perm (w3[2], w3[1], selector);
      w7[1] = hc_byte_perm (w3[1], w3[0], selector);
      w7[0] = hc_byte_perm (w3[0], w2[3], selector);
      w6[3] = hc_byte_perm (w2[3], w2[2], selector);
      w6[2] = hc_byte_perm (w2[2], w2[1], selector);
      w6[1] = hc_byte_perm (w2[1], w2[0], selector);
      w6[0] = hc_byte_perm (w2[0], w1[3], selector);
      w5[3] = hc_byte_perm (w1[3], w1[2], selector);
      w5[2] = hc_byte_perm (w1[2], w1[1], selector);
      w5[1] = hc_byte_perm (w1[1], w1[0], selector);
      w5[0] = hc_byte_perm (w1[0], w0[3], selector);
      w4[3] = hc_byte_perm (w0[3], w0[2], selector);
      w4[2] = hc_byte_perm (w0[2], w0[1], selector);
      w4[1] = hc_byte_perm (w0[1], w0[0], selector);
      w4[0] = hc_byte_perm (w0[0],     0, selector);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_byte_perm (w3[2], w3[1], selector);
      w7[2] = hc_byte_perm (w3[1], w3[0], selector);
      w7[1] = hc_byte_perm (w3[0], w2[3], selector);
      w7[0] = hc_byte_perm (w2[3], w2[2], selector);
      w6[3] = hc_byte_perm (w2[2], w2[1], selector);
      w6[2] = hc_byte_perm (w2[1], w2[0], selector);
      w6[1] = hc_byte_perm (w2[0], w1[3], selector);
      w6[0] = hc_byte_perm (w1[3], w1[2], selector);
      w5[3] = hc_byte_perm (w1[2], w1[1], selector);
      w5[2] = hc_byte_perm (w1[1], w1[0], selector);
      w5[1] = hc_byte_perm (w1[0], w0[3], selector);
      w5[0] = hc_byte_perm (w0[3], w0[2], selector);
      w4[3] = hc_byte_perm (w0[2], w0[1], selector);
      w4[2] = hc_byte_perm (w0[1], w0[0], selector);
      w4[1] = hc_byte_perm (w0[0],     0, selector);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_byte_perm (w3[1], w3[0], selector);
      w7[2] = hc_byte_perm (w3[0], w2[3], selector);
      w7[1] = hc_byte_perm (w2[3], w2[2], selector);
      w7[0] = hc_byte_perm (w2[2], w2[1], selector);
      w6[3] = hc_byte_perm (w2[1], w2[0], selector);
      w6[2] = hc_byte_perm (w2[0], w1[3], selector);
      w6[1] = hc_byte_perm (w1[3], w1[2], selector);
      w6[0] = hc_byte_perm (w1[2], w1[1], selector);
      w5[3] = hc_byte_perm (w1[1], w1[0], selector);
      w5[2] = hc_byte_perm (w1[0], w0[3], selector);
      w5[1] = hc_byte_perm (w0[3], w0[2], selector);
      w5[0] = hc_byte_perm (w0[2], w0[1], selector);
      w4[3] = hc_byte_perm (w0[1], w0[0], selector);
      w4[2] = hc_byte_perm (w0[0],     0, selector);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_byte_perm (w3[0], w2[3], selector);
      w7[2] = hc_byte_perm (w2[3], w2[2], selector);
      w7[1] = hc_byte_perm (w2[2], w2[1], selector);
      w7[0] = hc_byte_perm (w2[1], w2[0], selector);
      w6[3] = hc_byte_perm (w2[0], w1[3], selector);
      w6[2] = hc_byte_perm (w1[3], w1[2], selector);
      w6[1] = hc_byte_perm (w1[2], w1[1], selector);
      w6[0] = hc_byte_perm (w1[1], w1[0], selector);
      w5[3] = hc_byte_perm (w1[0], w0[3], selector);
      w5[2] = hc_byte_perm (w0[3], w0[2], selector);
      w5[1] = hc_byte_perm (w0[2], w0[1], selector);
      w5[0] = hc_byte_perm (w0[1], w0[0], selector);
      w4[3] = hc_byte_perm (w0[0],     0, selector);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_byte_perm (w2[3], w2[2], selector);
      w7[2] = hc_byte_perm (w2[2], w2[1], selector);
      w7[1] = hc_byte_perm (w2[1], w2[0], selector);
      w7[0] = hc_byte_perm (w2[0], w1[3], selector);
      w6[3] = hc_byte_perm (w1[3], w1[2], selector);
      w6[2] = hc_byte_perm (w1[2], w1[1], selector);
      w6[1] = hc_byte_perm (w1[1], w1[0], selector);
      w6[0] = hc_byte_perm (w1[0], w0[3], selector);
      w5[3] = hc_byte_perm (w0[3], w0[2], selector);
      w5[2] = hc_byte_perm (w0[2], w0[1], selector);
      w5[1] = hc_byte_perm (w0[1], w0[0], selector);
      w5[0] = hc_byte_perm (w0[0],     0, selector);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_byte_perm (w2[2], w2[1], selector);
      w7[2] = hc_byte_perm (w2[1], w2[0], selector);
      w7[1] = hc_byte_perm (w2[0], w1[3], selector);
      w7[0] = hc_byte_perm (w1[3], w1[2], selector);
      w6[3] = hc_byte_perm (w1[2], w1[1], selector);
      w6[2] = hc_byte_perm (w1[1], w1[0], selector);
      w6[1] = hc_byte_perm (w1[0], w0[3], selector);
      w6[0] = hc_byte_perm (w0[3], w0[2], selector);
      w5[3] = hc_byte_perm (w0[2], w0[1], selector);
      w5[2] = hc_byte_perm (w0[1], w0[0], selector);
      w5[1] = hc_byte_perm (w0[0],     0, selector);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_byte_perm (w2[1], w2[0], selector);
      w7[2] = hc_byte_perm (w2[0], w1[3], selector);
      w7[1] = hc_byte_perm (w1[3], w1[2], selector);
      w7[0] = hc_byte_perm (w1[2], w1[1], selector);
      w6[3] = hc_byte_perm (w1[1], w1[0], selector);
      w6[2] = hc_byte_perm (w1[0], w0[3], selector);
      w6[1] = hc_byte_perm (w0[3], w0[2], selector);
      w6[0] = hc_byte_perm (w0[2], w0[1], selector);
      w5[3] = hc_byte_perm (w0[1], w0[0], selector);
      w5[2] = hc_byte_perm (w0[0],     0, selector);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_byte_perm (w2[0], w1[3], selector);
      w7[2] = hc_byte_perm (w1[3], w1[2], selector);
      w7[1] = hc_byte_perm (w1[2], w1[1], selector);
      w7[0] = hc_byte_perm (w1[1], w1[0], selector);
      w6[3] = hc_byte_perm (w1[0], w0[3], selector);
      w6[2] = hc_byte_perm (w0[3], w0[2], selector);
      w6[1] = hc_byte_perm (w0[2], w0[1], selector);
      w6[0] = hc_byte_perm (w0[1], w0[0], selector);
      w5[3] = hc_byte_perm (w0[0],     0, selector);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_byte_perm (w1[3], w1[2], selector);
      w7[2] = hc_byte_perm (w1[2], w1[1], selector);
      w7[1] = hc_byte_perm (w1[1], w1[0], selector);
      w7[0] = hc_byte_perm (w1[0], w0[3], selector);
      w6[3] = hc_byte_perm (w0[3], w0[2], selector);
      w6[2] = hc_byte_perm (w0[2], w0[1], selector);
      w6[1] = hc_byte_perm (w0[1], w0[0], selector);
      w6[0] = hc_byte_perm (w0[0],     0, selector);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_byte_perm (w1[2], w1[1], selector);
      w7[2] = hc_byte_perm (w1[1], w1[0], selector);
      w7[1] = hc_byte_perm (w1[0], w0[3], selector);
      w7[0] = hc_byte_perm (w0[3], w0[2], selector);
      w6[3] = hc_byte_perm (w0[2], w0[1], selector);
      w6[2] = hc_byte_perm (w0[1], w0[0], selector);
      w6[1] = hc_byte_perm (w0[0],     0, selector);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_byte_perm (w1[1], w1[0], selector);
      w7[2] = hc_byte_perm (w1[0], w0[3], selector);
      w7[1] = hc_byte_perm (w0[3], w0[2], selector);
      w7[0] = hc_byte_perm (w0[2], w0[1], selector);
      w6[3] = hc_byte_perm (w0[1], w0[0], selector);
      w6[2] = hc_byte_perm (w0[0],     0, selector);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_byte_perm (w1[0], w0[3], selector);
      w7[2] = hc_byte_perm (w0[3], w0[2], selector);
      w7[1] = hc_byte_perm (w0[2], w0[1], selector);
      w7[0] = hc_byte_perm (w0[1], w0[0], selector);
      w6[3] = hc_byte_perm (w0[0],     0, selector);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_byte_perm (w0[3], w0[2], selector);
      w7[2] = hc_byte_perm (w0[2], w0[1], selector);
      w7[1] = hc_byte_perm (w0[1], w0[0], selector);
      w7[0] = hc_byte_perm (w0[0],     0, selector);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_byte_perm (w0[2], w0[1], selector);
      w7[2] = hc_byte_perm (w0[1], w0[0], selector);
      w7[1] = hc_byte_perm (w0[0],     0, selector);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_byte_perm (w0[1], w0[0], selector);
      w7[2] = hc_byte_perm (w0[0],     0, selector);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_byte_perm (w0[0],     0, selector);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign (w7[3],     0, offset);
      w7[3] = hc_bytealign (w7[2], w7[3], offset);
      w7[2] = hc_bytealign (w7[1], w7[2], offset);
      w7[1] = hc_bytealign (w7[0], w7[1], offset);
      w7[0] = hc_bytealign (w6[3], w7[0], offset);
      w6[3] = hc_bytealign (w6[2], w6[3], offset);
      w6[2] = hc_bytealign (w6[1], w6[2], offset);
      w6[1] = hc_bytealign (w6[0], w6[1], offset);
      w6[0] = hc_bytealign (w5[3], w6[0], offset);
      w5[3] = hc_bytealign (w5[2], w5[3], offset);
      w5[2] = hc_bytealign (w5[1], w5[2], offset);
      w5[1] = hc_bytealign (w5[0], w5[1], offset);
      w5[0] = hc_bytealign (w4[3], w5[0], offset);
      w4[3] = hc_bytealign (w4[2], w4[3], offset);
      w4[2] = hc_bytealign (w4[1], w4[2], offset);
      w4[1] = hc_bytealign (w4[0], w4[1], offset);
      w4[0] = hc_bytealign (w3[3], w4[0], offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign (w7[3],     0, offset);
      c0[0] = hc_bytealign (w7[2], w7[3], offset);
      w7[3] = hc_bytealign (w7[1], w7[2], offset);
      w7[2] = hc_bytealign (w7[0], w7[1], offset);
      w7[1] = hc_bytealign (w6[3], w7[0], offset);
      w7[0] = hc_bytealign (w6[2], w6[3], offset);
      w6[3] = hc_bytealign (w6[1], w6[2], offset);
      w6[2] = hc_bytealign (w6[0], w6[1], offset);
      w6[1] = hc_bytealign (w5[3], w6[0], offset);
      w6[0] = hc_bytealign (w5[2], w5[3], offset);
      w5[3] = hc_bytealign (w5[1], w5[2], offset);
      w5[2] = hc_bytealign (w5[0], w5[1], offset);
      w5[1] = hc_bytealign (w4[3], w5[0], offset);
      w5[0] = hc_bytealign (w4[2], w4[3], offset);
      w4[3] = hc_bytealign (w4[1], w4[2], offset);
      w4[2] = hc_bytealign (w4[0], w4[1], offset);
      w4[1] = hc_bytealign (w3[3], w4[0], offset);
      w4[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign (w7[3],     0, offset);
      c0[1] = hc_bytealign (w7[2], w7[3], offset);
      c0[0] = hc_bytealign (w7[1], w7[2], offset);
      w7[3] = hc_bytealign (w7[0], w7[1], offset);
      w7[2] = hc_bytealign (w6[3], w7[0], offset);
      w7[1] = hc_bytealign (w6[2], w6[3], offset);
      w7[0] = hc_bytealign (w6[1], w6[2], offset);
      w6[3] = hc_bytealign (w6[0], w6[1], offset);
      w6[2] = hc_bytealign (w5[3], w6[0], offset);
      w6[1] = hc_bytealign (w5[2], w5[3], offset);
      w6[0] = hc_bytealign (w5[1], w5[2], offset);
      w5[3] = hc_bytealign (w5[0], w5[1], offset);
      w5[2] = hc_bytealign (w4[3], w5[0], offset);
      w5[1] = hc_bytealign (w4[2], w4[3], offset);
      w5[0] = hc_bytealign (w4[1], w4[2], offset);
      w4[3] = hc_bytealign (w4[0], w4[1], offset);
      w4[2] = hc_bytealign (w3[3], w4[0], offset);
      w4[1] = hc_bytealign (w3[2], w3[3], offset);
      w4[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign (w7[3],     0, offset);
      c0[2] = hc_bytealign (w7[2], w7[3], offset);
      c0[1] = hc_bytealign (w7[1], w7[2], offset);
      c0[0] = hc_bytealign (w7[0], w7[1], offset);
      w7[3] = hc_bytealign (w6[3], w7[0], offset);
      w7[2] = hc_bytealign (w6[2], w6[3], offset);
      w7[1] = hc_bytealign (w6[1], w6[2], offset);
      w7[0] = hc_bytealign (w6[0], w6[1], offset);
      w6[3] = hc_bytealign (w5[3], w6[0], offset);
      w6[2] = hc_bytealign (w5[2], w5[3], offset);
      w6[1] = hc_bytealign (w5[1], w5[2], offset);
      w6[0] = hc_bytealign (w5[0], w5[1], offset);
      w5[3] = hc_bytealign (w4[3], w5[0], offset);
      w5[2] = hc_bytealign (w4[2], w4[3], offset);
      w5[1] = hc_bytealign (w4[1], w4[2], offset);
      w5[0] = hc_bytealign (w4[0], w4[1], offset);
      w4[3] = hc_bytealign (w3[3], w4[0], offset);
      w4[2] = hc_bytealign (w3[2], w3[3], offset);
      w4[1] = hc_bytealign (w3[1], w3[2], offset);
      w4[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign (w7[3],     0, offset);
      c0[3] = hc_bytealign (w7[2], w7[3], offset);
      c0[2] = hc_bytealign (w7[1], w7[2], offset);
      c0[1] = hc_bytealign (w7[0], w7[1], offset);
      c0[0] = hc_bytealign (w6[3], w7[0], offset);
      w7[3] = hc_bytealign (w6[2], w6[3], offset);
      w7[2] = hc_bytealign (w6[1], w6[2], offset);
      w7[1] = hc_bytealign (w6[0], w6[1], offset);
      w7[0] = hc_bytealign (w5[3], w6[0], offset);
      w6[3] = hc_bytealign (w5[2], w5[3], offset);
      w6[2] = hc_bytealign (w5[1], w5[2], offset);
      w6[1] = hc_bytealign (w5[0], w5[1], offset);
      w6[0] = hc_bytealign (w4[3], w5[0], offset);
      w5[3] = hc_bytealign (w4[2], w4[3], offset);
      w5[2] = hc_bytealign (w4[1], w4[2], offset);
      w5[1] = hc_bytealign (w4[0], w4[1], offset);
      w5[0] = hc_bytealign (w3[3], w4[0], offset);
      w4[3] = hc_bytealign (w3[2], w3[3], offset);
      w4[2] = hc_bytealign (w3[1], w3[2], offset);
      w4[1] = hc_bytealign (w3[0], w3[1], offset);
      w4[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign (w7[3],     0, offset);
      c1[0] = hc_bytealign (w7[2], w7[3], offset);
      c0[3] = hc_bytealign (w7[1], w7[2], offset);
      c0[2] = hc_bytealign (w7[0], w7[1], offset);
      c0[1] = hc_bytealign (w6[3], w7[0], offset);
      c0[0] = hc_bytealign (w6[2], w6[3], offset);
      w7[3] = hc_bytealign (w6[1], w6[2], offset);
      w7[2] = hc_bytealign (w6[0], w6[1], offset);
      w7[1] = hc_bytealign (w5[3], w6[0], offset);
      w7[0] = hc_bytealign (w5[2], w5[3], offset);
      w6[3] = hc_bytealign (w5[1], w5[2], offset);
      w6[2] = hc_bytealign (w5[0], w5[1], offset);
      w6[1] = hc_bytealign (w4[3], w5[0], offset);
      w6[0] = hc_bytealign (w4[2], w4[3], offset);
      w5[3] = hc_bytealign (w4[1], w4[2], offset);
      w5[2] = hc_bytealign (w4[0], w4[1], offset);
      w5[1] = hc_bytealign (w3[3], w4[0], offset);
      w5[0] = hc_bytealign (w3[2], w3[3], offset);
      w4[3] = hc_bytealign (w3[1], w3[2], offset);
      w4[2] = hc_bytealign (w3[0], w3[1], offset);
      w4[1] = hc_bytealign (w2[3], w3[0], offset);
      w4[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign (w7[3],     0, offset);
      c1[1] = hc_bytealign (w7[2], w7[3], offset);
      c1[0] = hc_bytealign (w7[1], w7[2], offset);
      c0[3] = hc_bytealign (w7[0], w7[1], offset);
      c0[2] = hc_bytealign (w6[3], w7[0], offset);
      c0[1] = hc_bytealign (w6[2], w6[3], offset);
      c0[0] = hc_bytealign (w6[1], w6[2], offset);
      w7[3] = hc_bytealign (w6[0], w6[1], offset);
      w7[2] = hc_bytealign (w5[3], w6[0], offset);
      w7[1] = hc_bytealign (w5[2], w5[3], offset);
      w7[0] = hc_bytealign (w5[1], w5[2], offset);
      w6[3] = hc_bytealign (w5[0], w5[1], offset);
      w6[2] = hc_bytealign (w4[3], w5[0], offset);
      w6[1] = hc_bytealign (w4[2], w4[3], offset);
      w6[0] = hc_bytealign (w4[1], w4[2], offset);
      w5[3] = hc_bytealign (w4[0], w4[1], offset);
      w5[2] = hc_bytealign (w3[3], w4[0], offset);
      w5[1] = hc_bytealign (w3[2], w3[3], offset);
      w5[0] = hc_bytealign (w3[1], w3[2], offset);
      w4[3] = hc_bytealign (w3[0], w3[1], offset);
      w4[2] = hc_bytealign (w2[3], w3[0], offset);
      w4[1] = hc_bytealign (w2[2], w2[3], offset);
      w4[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign (w7[3],     0, offset);
      c1[2] = hc_bytealign (w7[2], w7[3], offset);
      c1[1] = hc_bytealign (w7[1], w7[2], offset);
      c1[0] = hc_bytealign (w7[0], w7[1], offset);
      c0[3] = hc_bytealign (w6[3], w7[0], offset);
      c0[2] = hc_bytealign (w6[2], w6[3], offset);
      c0[1] = hc_bytealign (w6[1], w6[2], offset);
      c0[0] = hc_bytealign (w6[0], w6[1], offset);
      w7[3] = hc_bytealign (w5[3], w6[0], offset);
      w7[2] = hc_bytealign (w5[2], w5[3], offset);
      w7[1] = hc_bytealign (w5[1], w5[2], offset);
      w7[0] = hc_bytealign (w5[0], w5[1], offset);
      w6[3] = hc_bytealign (w4[3], w5[0], offset);
      w6[2] = hc_bytealign (w4[2], w4[3], offset);
      w6[1] = hc_bytealign (w4[1], w4[2], offset);
      w6[0] = hc_bytealign (w4[0], w4[1], offset);
      w5[3] = hc_bytealign (w3[3], w4[0], offset);
      w5[2] = hc_bytealign (w3[2], w3[3], offset);
      w5[1] = hc_bytealign (w3[1], w3[2], offset);
      w5[0] = hc_bytealign (w3[0], w3[1], offset);
      w4[3] = hc_bytealign (w2[3], w3[0], offset);
      w4[2] = hc_bytealign (w2[2], w2[3], offset);
      w4[1] = hc_bytealign (w2[1], w2[2], offset);
      w4[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign (w7[3],     0, offset);
      c1[3] = hc_bytealign (w7[2], w7[3], offset);
      c1[2] = hc_bytealign (w7[1], w7[2], offset);
      c1[1] = hc_bytealign (w7[0], w7[1], offset);
      c1[0] = hc_bytealign (w6[3], w7[0], offset);
      c0[3] = hc_bytealign (w6[2], w6[3], offset);
      c0[2] = hc_bytealign (w6[1], w6[2], offset);
      c0[1] = hc_bytealign (w6[0], w6[1], offset);
      c0[0] = hc_bytealign (w5[3], w6[0], offset);
      w7[3] = hc_bytealign (w5[2], w5[3], offset);
      w7[2] = hc_bytealign (w5[1], w5[2], offset);
      w7[1] = hc_bytealign (w5[0], w5[1], offset);
      w7[0] = hc_bytealign (w4[3], w5[0], offset);
      w6[3] = hc_bytealign (w4[2], w4[3], offset);
      w6[2] = hc_bytealign (w4[1], w4[2], offset);
      w6[1] = hc_bytealign (w4[0], w4[1], offset);
      w6[0] = hc_bytealign (w3[3], w4[0], offset);
      w5[3] = hc_bytealign (w3[2], w3[3], offset);
      w5[2] = hc_bytealign (w3[1], w3[2], offset);
      w5[1] = hc_bytealign (w3[0], w3[1], offset);
      w5[0] = hc_bytealign (w2[3], w3[0], offset);
      w4[3] = hc_bytealign (w2[2], w2[3], offset);
      w4[2] = hc_bytealign (w2[1], w2[2], offset);
      w4[1] = hc_bytealign (w2[0], w2[1], offset);
      w4[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign (w7[3],     0, offset);
      c2[0] = hc_bytealign (w7[2], w7[3], offset);
      c1[3] = hc_bytealign (w7[1], w7[2], offset);
      c1[2] = hc_bytealign (w7[0], w7[1], offset);
      c1[1] = hc_bytealign (w6[3], w7[0], offset);
      c1[0] = hc_bytealign (w6[2], w6[3], offset);
      c0[3] = hc_bytealign (w6[1], w6[2], offset);
      c0[2] = hc_bytealign (w6[0], w6[1], offset);
      c0[1] = hc_bytealign (w5[3], w6[0], offset);
      c0[0] = hc_bytealign (w5[2], w5[3], offset);
      w7[3] = hc_bytealign (w5[1], w5[2], offset);
      w7[2] = hc_bytealign (w5[0], w5[1], offset);
      w7[1] = hc_bytealign (w4[3], w5[0], offset);
      w7[0] = hc_bytealign (w4[2], w4[3], offset);
      w6[3] = hc_bytealign (w4[1], w4[2], offset);
      w6[2] = hc_bytealign (w4[0], w4[1], offset);
      w6[1] = hc_bytealign (w3[3], w4[0], offset);
      w6[0] = hc_bytealign (w3[2], w3[3], offset);
      w5[3] = hc_bytealign (w3[1], w3[2], offset);
      w5[2] = hc_bytealign (w3[0], w3[1], offset);
      w5[1] = hc_bytealign (w2[3], w3[0], offset);
      w5[0] = hc_bytealign (w2[2], w2[3], offset);
      w4[3] = hc_bytealign (w2[1], w2[2], offset);
      w4[2] = hc_bytealign (w2[0], w2[1], offset);
      w4[1] = hc_bytealign (w1[3], w2[0], offset);
      w4[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign (w7[3],     0, offset);
      c2[1] = hc_bytealign (w7[2], w7[3], offset);
      c2[0] = hc_bytealign (w7[1], w7[2], offset);
      c1[3] = hc_bytealign (w7[0], w7[1], offset);
      c1[2] = hc_bytealign (w6[3], w7[0], offset);
      c1[1] = hc_bytealign (w6[2], w6[3], offset);
      c1[0] = hc_bytealign (w6[1], w6[2], offset);
      c0[3] = hc_bytealign (w6[0], w6[1], offset);
      c0[2] = hc_bytealign (w5[3], w6[0], offset);
      c0[1] = hc_bytealign (w5[2], w5[3], offset);
      c0[0] = hc_bytealign (w5[1], w5[2], offset);
      w7[3] = hc_bytealign (w5[0], w5[1], offset);
      w7[2] = hc_bytealign (w4[3], w5[0], offset);
      w7[1] = hc_bytealign (w4[2], w4[3], offset);
      w7[0] = hc_bytealign (w4[1], w4[2], offset);
      w6[3] = hc_bytealign (w4[0], w4[1], offset);
      w6[2] = hc_bytealign (w3[3], w4[0], offset);
      w6[1] = hc_bytealign (w3[2], w3[3], offset);
      w6[0] = hc_bytealign (w3[1], w3[2], offset);
      w5[3] = hc_bytealign (w3[0], w3[1], offset);
      w5[2] = hc_bytealign (w2[3], w3[0], offset);
      w5[1] = hc_bytealign (w2[2], w2[3], offset);
      w5[0] = hc_bytealign (w2[1], w2[2], offset);
      w4[3] = hc_bytealign (w2[0], w2[1], offset);
      w4[2] = hc_bytealign (w1[3], w2[0], offset);
      w4[1] = hc_bytealign (w1[2], w1[3], offset);
      w4[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign (w7[3],     0, offset);
      c2[2] = hc_bytealign (w7[2], w7[3], offset);
      c2[1] = hc_bytealign (w7[1], w7[2], offset);
      c2[0] = hc_bytealign (w7[0], w7[1], offset);
      c1[3] = hc_bytealign (w6[3], w7[0], offset);
      c1[2] = hc_bytealign (w6[2], w6[3], offset);
      c1[1] = hc_bytealign (w6[1], w6[2], offset);
      c1[0] = hc_bytealign (w6[0], w6[1], offset);
      c0[3] = hc_bytealign (w5[3], w6[0], offset);
      c0[2] = hc_bytealign (w5[2], w5[3], offset);
      c0[1] = hc_bytealign (w5[1], w5[2], offset);
      c0[0] = hc_bytealign (w5[0], w5[1], offset);
      w7[3] = hc_bytealign (w4[3], w5[0], offset);
      w7[2] = hc_bytealign (w4[2], w4[3], offset);
      w7[1] = hc_bytealign (w4[1], w4[2], offset);
      w7[0] = hc_bytealign (w4[0], w4[1], offset);
      w6[3] = hc_bytealign (w3[3], w4[0], offset);
      w6[2] = hc_bytealign (w3[2], w3[3], offset);
      w6[1] = hc_bytealign (w3[1], w3[2], offset);
      w6[0] = hc_bytealign (w3[0], w3[1], offset);
      w5[3] = hc_bytealign (w2[3], w3[0], offset);
      w5[2] = hc_bytealign (w2[2], w2[3], offset);
      w5[1] = hc_bytealign (w2[1], w2[2], offset);
      w5[0] = hc_bytealign (w2[0], w2[1], offset);
      w4[3] = hc_bytealign (w1[3], w2[0], offset);
      w4[2] = hc_bytealign (w1[2], w1[3], offset);
      w4[1] = hc_bytealign (w1[1], w1[2], offset);
      w4[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign (w7[3],     0, offset);
      c2[3] = hc_bytealign (w7[2], w7[3], offset);
      c2[2] = hc_bytealign (w7[1], w7[2], offset);
      c2[1] = hc_bytealign (w7[0], w7[1], offset);
      c2[0] = hc_bytealign (w6[3], w7[0], offset);
      c1[3] = hc_bytealign (w6[2], w6[3], offset);
      c1[2] = hc_bytealign (w6[1], w6[2], offset);
      c1[1] = hc_bytealign (w6[0], w6[1], offset);
      c1[0] = hc_bytealign (w5[3], w6[0], offset);
      c0[3] = hc_bytealign (w5[2], w5[3], offset);
      c0[2] = hc_bytealign (w5[1], w5[2], offset);
      c0[1] = hc_bytealign (w5[0], w5[1], offset);
      c0[0] = hc_bytealign (w4[3], w5[0], offset);
      w7[3] = hc_bytealign (w4[2], w4[3], offset);
      w7[2] = hc_bytealign (w4[1], w4[2], offset);
      w7[1] = hc_bytealign (w4[0], w4[1], offset);
      w7[0] = hc_bytealign (w3[3], w4[0], offset);
      w6[3] = hc_bytealign (w3[2], w3[3], offset);
      w6[2] = hc_bytealign (w3[1], w3[2], offset);
      w6[1] = hc_bytealign (w3[0], w3[1], offset);
      w6[0] = hc_bytealign (w2[3], w3[0], offset);
      w5[3] = hc_bytealign (w2[2], w2[3], offset);
      w5[2] = hc_bytealign (w2[1], w2[2], offset);
      w5[1] = hc_bytealign (w2[0], w2[1], offset);
      w5[0] = hc_bytealign (w1[3], w2[0], offset);
      w4[3] = hc_bytealign (w1[2], w1[3], offset);
      w4[2] = hc_bytealign (w1[1], w1[2], offset);
      w4[1] = hc_bytealign (w1[0], w1[1], offset);
      w4[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign (w7[3],     0, offset);
      c3[0] = hc_bytealign (w7[2], w7[3], offset);
      c2[3] = hc_bytealign (w7[1], w7[2], offset);
      c2[2] = hc_bytealign (w7[0], w7[1], offset);
      c2[1] = hc_bytealign (w6[3], w7[0], offset);
      c2[0] = hc_bytealign (w6[2], w6[3], offset);
      c1[3] = hc_bytealign (w6[1], w6[2], offset);
      c1[2] = hc_bytealign (w6[0], w6[1], offset);
      c1[1] = hc_bytealign (w5[3], w6[0], offset);
      c1[0] = hc_bytealign (w5[2], w5[3], offset);
      c0[3] = hc_bytealign (w5[1], w5[2], offset);
      c0[2] = hc_bytealign (w5[0], w5[1], offset);
      c0[1] = hc_bytealign (w4[3], w5[0], offset);
      c0[0] = hc_bytealign (w4[2], w4[3], offset);
      w7[3] = hc_bytealign (w4[1], w4[2], offset);
      w7[2] = hc_bytealign (w4[0], w4[1], offset);
      w7[1] = hc_bytealign (w3[3], w4[0], offset);
      w7[0] = hc_bytealign (w3[2], w3[3], offset);
      w6[3] = hc_bytealign (w3[1], w3[2], offset);
      w6[2] = hc_bytealign (w3[0], w3[1], offset);
      w6[1] = hc_bytealign (w2[3], w3[0], offset);
      w6[0] = hc_bytealign (w2[2], w2[3], offset);
      w5[3] = hc_bytealign (w2[1], w2[2], offset);
      w5[2] = hc_bytealign (w2[0], w2[1], offset);
      w5[1] = hc_bytealign (w1[3], w2[0], offset);
      w5[0] = hc_bytealign (w1[2], w1[3], offset);
      w4[3] = hc_bytealign (w1[1], w1[2], offset);
      w4[2] = hc_bytealign (w1[0], w1[1], offset);
      w4[1] = hc_bytealign (w0[3], w1[0], offset);
      w4[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign (w7[3],     0, offset);
      c3[1] = hc_bytealign (w7[2], w7[3], offset);
      c3[0] = hc_bytealign (w7[1], w7[2], offset);
      c2[3] = hc_bytealign (w7[0], w7[1], offset);
      c2[2] = hc_bytealign (w6[3], w7[0], offset);
      c2[1] = hc_bytealign (w6[2], w6[3], offset);
      c2[0] = hc_bytealign (w6[1], w6[2], offset);
      c1[3] = hc_bytealign (w6[0], w6[1], offset);
      c1[2] = hc_bytealign (w5[3], w6[0], offset);
      c1[1] = hc_bytealign (w5[2], w5[3], offset);
      c1[0] = hc_bytealign (w5[1], w5[2], offset);
      c0[3] = hc_bytealign (w5[0], w5[1], offset);
      c0[2] = hc_bytealign (w4[3], w5[0], offset);
      c0[1] = hc_bytealign (w4[2], w4[3], offset);
      c0[0] = hc_bytealign (w4[1], w4[2], offset);
      w7[3] = hc_bytealign (w4[0], w4[1], offset);
      w7[2] = hc_bytealign (w3[3], w4[0], offset);
      w7[1] = hc_bytealign (w3[2], w3[3], offset);
      w7[0] = hc_bytealign (w3[1], w3[2], offset);
      w6[3] = hc_bytealign (w3[0], w3[1], offset);
      w6[2] = hc_bytealign (w2[3], w3[0], offset);
      w6[1] = hc_bytealign (w2[2], w2[3], offset);
      w6[0] = hc_bytealign (w2[1], w2[2], offset);
      w5[3] = hc_bytealign (w2[0], w2[1], offset);
      w5[2] = hc_bytealign (w1[3], w2[0], offset);
      w5[1] = hc_bytealign (w1[2], w1[3], offset);
      w5[0] = hc_bytealign (w1[1], w1[2], offset);
      w4[3] = hc_bytealign (w1[0], w1[1], offset);
      w4[2] = hc_bytealign (w0[3], w1[0], offset);
      w4[1] = hc_bytealign (w0[2], w0[3], offset);
      w4[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign (w7[3],     0, offset);
      c3[2] = hc_bytealign (w7[2], w7[3], offset);
      c3[1] = hc_bytealign (w7[1], w7[2], offset);
      c3[0] = hc_bytealign (w7[0], w7[1], offset);
      c2[3] = hc_bytealign (w6[3], w7[0], offset);
      c2[2] = hc_bytealign (w6[2], w6[3], offset);
      c2[1] = hc_bytealign (w6[1], w6[2], offset);
      c2[0] = hc_bytealign (w6[0], w6[1], offset);
      c1[3] = hc_bytealign (w5[3], w6[0], offset);
      c1[2] = hc_bytealign (w5[2], w5[3], offset);
      c1[1] = hc_bytealign (w5[1], w5[2], offset);
      c1[0] = hc_bytealign (w5[0], w5[1], offset);
      c0[3] = hc_bytealign (w4[3], w5[0], offset);
      c0[2] = hc_bytealign (w4[2], w4[3], offset);
      c0[1] = hc_bytealign (w4[1], w4[2], offset);
      c0[0] = hc_bytealign (w4[0], w4[1], offset);
      w7[3] = hc_bytealign (w3[3], w4[0], offset);
      w7[2] = hc_bytealign (w3[2], w3[3], offset);
      w7[1] = hc_bytealign (w3[1], w3[2], offset);
      w7[0] = hc_bytealign (w3[0], w3[1], offset);
      w6[3] = hc_bytealign (w2[3], w3[0], offset);
      w6[2] = hc_bytealign (w2[2], w2[3], offset);
      w6[1] = hc_bytealign (w2[1], w2[2], offset);
      w6[0] = hc_bytealign (w2[0], w2[1], offset);
      w5[3] = hc_bytealign (w1[3], w2[0], offset);
      w5[2] = hc_bytealign (w1[2], w1[3], offset);
      w5[1] = hc_bytealign (w1[1], w1[2], offset);
      w5[0] = hc_bytealign (w1[0], w1[1], offset);
      w4[3] = hc_bytealign (w0[3], w1[0], offset);
      w4[2] = hc_bytealign (w0[2], w0[3], offset);
      w4[1] = hc_bytealign (w0[1], w0[2], offset);
      w4[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_bytealign (w7[3],     0, offset);
      c3[3] = hc_bytealign (w7[2], w7[3], offset);
      c3[2] = hc_bytealign (w7[1], w7[2], offset);
      c3[1] = hc_bytealign (w7[0], w7[1], offset);
      c3[0] = hc_bytealign (w6[3], w7[0], offset);
      c2[3] = hc_bytealign (w6[2], w6[3], offset);
      c2[2] = hc_bytealign (w6[1], w6[2], offset);
      c2[1] = hc_bytealign (w6[0], w6[1], offset);
      c2[0] = hc_bytealign (w5[3], w6[0], offset);
      c1[3] = hc_bytealign (w5[2], w5[3], offset);
      c1[2] = hc_bytealign (w5[1], w5[2], offset);
      c1[1] = hc_bytealign (w5[0], w5[1], offset);
      c1[0] = hc_bytealign (w4[3], w5[0], offset);
      c0[3] = hc_bytealign (w4[2], w4[3], offset);
      c0[2] = hc_bytealign (w4[1], w4[2], offset);
      c0[1] = hc_bytealign (w4[0], w4[1], offset);
      c0[0] = hc_bytealign (w3[3], w4[0], offset);
      w7[3] = hc_bytealign (w3[2], w3[3], offset);
      w7[2] = hc_bytealign (w3[1], w3[2], offset);
      w7[1] = hc_bytealign (w3[0], w3[1], offset);
      w7[0] = hc_bytealign (w2[3], w3[0], offset);
      w6[3] = hc_bytealign (w2[2], w2[3], offset);
      w6[2] = hc_bytealign (w2[1], w2[2], offset);
      w6[1] = hc_bytealign (w2[0], w2[1], offset);
      w6[0] = hc_bytealign (w1[3], w2[0], offset);
      w5[3] = hc_bytealign (w1[2], w1[3], offset);
      w5[2] = hc_bytealign (w1[1], w1[2], offset);
      w5[1] = hc_bytealign (w1[0], w1[1], offset);
      w5[0] = hc_bytealign (w0[3], w1[0], offset);
      w4[3] = hc_bytealign (w0[2], w0[3], offset);
      w4[2] = hc_bytealign (w0[1], w0[2], offset);
      w4[1] = hc_bytealign (w0[0], w0[1], offset);
      w4[0] = hc_bytealign (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_bytealign (w7[3],     0, offset);
      c4[0] = hc_bytealign (w7[2], w7[3], offset);
      c3[3] = hc_bytealign (w7[1], w7[2], offset);
      c3[2] = hc_bytealign (w7[0], w7[1], offset);
      c3[1] = hc_bytealign (w6[3], w7[0], offset);
      c3[0] = hc_bytealign (w6[2], w6[3], offset);
      c2[3] = hc_bytealign (w6[1], w6[2], offset);
      c2[2] = hc_bytealign (w6[0], w6[1], offset);
      c2[1] = hc_bytealign (w5[3], w6[0], offset);
      c2[0] = hc_bytealign (w5[2], w5[3], offset);
      c1[3] = hc_bytealign (w5[1], w5[2], offset);
      c1[2] = hc_bytealign (w5[0], w5[1], offset);
      c1[1] = hc_bytealign (w4[3], w5[0], offset);
      c1[0] = hc_bytealign (w4[2], w4[3], offset);
      c0[3] = hc_bytealign (w4[1], w4[2], offset);
      c0[2] = hc_bytealign (w4[0], w4[1], offset);
      c0[1] = hc_bytealign (w3[3], w4[0], offset);
      c0[0] = hc_bytealign (w3[2], w3[3], offset);
      w7[3] = hc_bytealign (w3[1], w3[2], offset);
      w7[2] = hc_bytealign (w3[0], w3[1], offset);
      w7[1] = hc_bytealign (w2[3], w3[0], offset);
      w7[0] = hc_bytealign (w2[2], w2[3], offset);
      w6[3] = hc_bytealign (w2[1], w2[2], offset);
      w6[2] = hc_bytealign (w2[0], w2[1], offset);
      w6[1] = hc_bytealign (w1[3], w2[0], offset);
      w6[0] = hc_bytealign (w1[2], w1[3], offset);
      w5[3] = hc_bytealign (w1[1], w1[2], offset);
      w5[2] = hc_bytealign (w1[0], w1[1], offset);
      w5[1] = hc_bytealign (w0[3], w1[0], offset);
      w5[0] = hc_bytealign (w0[2], w0[3], offset);
      w4[3] = hc_bytealign (w0[1], w0[2], offset);
      w4[2] = hc_bytealign (w0[0], w0[1], offset);
      w4[1] = hc_bytealign (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_bytealign (w7[3],     0, offset);
      c4[1] = hc_bytealign (w7[2], w7[3], offset);
      c4[0] = hc_bytealign (w7[1], w7[2], offset);
      c3[3] = hc_bytealign (w7[0], w7[1], offset);
      c3[2] = hc_bytealign (w6[3], w7[0], offset);
      c3[1] = hc_bytealign (w6[2], w6[3], offset);
      c3[0] = hc_bytealign (w6[1], w6[2], offset);
      c2[3] = hc_bytealign (w6[0], w6[1], offset);
      c2[2] = hc_bytealign (w5[3], w6[0], offset);
      c2[1] = hc_bytealign (w5[2], w5[3], offset);
      c2[0] = hc_bytealign (w5[1], w5[2], offset);
      c1[3] = hc_bytealign (w5[0], w5[1], offset);
      c1[2] = hc_bytealign (w4[3], w5[0], offset);
      c1[1] = hc_bytealign (w4[2], w4[3], offset);
      c1[0] = hc_bytealign (w4[1], w4[2], offset);
      c0[3] = hc_bytealign (w4[0], w4[1], offset);
      c0[2] = hc_bytealign (w3[3], w4[0], offset);
      c0[1] = hc_bytealign (w3[2], w3[3], offset);
      c0[0] = hc_bytealign (w3[1], w3[2], offset);
      w7[3] = hc_bytealign (w3[0], w3[1], offset);
      w7[2] = hc_bytealign (w2[3], w3[0], offset);
      w7[1] = hc_bytealign (w2[2], w2[3], offset);
      w7[0] = hc_bytealign (w2[1], w2[2], offset);
      w6[3] = hc_bytealign (w2[0], w2[1], offset);
      w6[2] = hc_bytealign (w1[3], w2[0], offset);
      w6[1] = hc_bytealign (w1[2], w1[3], offset);
      w6[0] = hc_bytealign (w1[1], w1[2], offset);
      w5[3] = hc_bytealign (w1[0], w1[1], offset);
      w5[2] = hc_bytealign (w0[3], w1[0], offset);
      w5[1] = hc_bytealign (w0[2], w0[3], offset);
      w5[0] = hc_bytealign (w0[1], w0[2], offset);
      w4[3] = hc_bytealign (w0[0], w0[1], offset);
      w4[2] = hc_bytealign (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_bytealign (w7[3],     0, offset);
      c4[2] = hc_bytealign (w7[2], w7[3], offset);
      c4[1] = hc_bytealign (w7[1], w7[2], offset);
      c4[0] = hc_bytealign (w7[0], w7[1], offset);
      c3[3] = hc_bytealign (w6[3], w7[0], offset);
      c3[2] = hc_bytealign (w6[2], w6[3], offset);
      c3[1] = hc_bytealign (w6[1], w6[2], offset);
      c3[0] = hc_bytealign (w6[0], w6[1], offset);
      c2[3] = hc_bytealign (w5[3], w6[0], offset);
      c2[2] = hc_bytealign (w5[2], w5[3], offset);
      c2[1] = hc_bytealign (w5[1], w5[2], offset);
      c2[0] = hc_bytealign (w5[0], w5[1], offset);
      c1[3] = hc_bytealign (w4[3], w5[0], offset);
      c1[2] = hc_bytealign (w4[2], w4[3], offset);
      c1[1] = hc_bytealign (w4[1], w4[2], offset);
      c1[0] = hc_bytealign (w4[0], w4[1], offset);
      c0[3] = hc_bytealign (w3[3], w4[0], offset);
      c0[2] = hc_bytealign (w3[2], w3[3], offset);
      c0[1] = hc_bytealign (w3[1], w3[2], offset);
      c0[0] = hc_bytealign (w3[0], w3[1], offset);
      w7[3] = hc_bytealign (w2[3], w3[0], offset);
      w7[2] = hc_bytealign (w2[2], w2[3], offset);
      w7[1] = hc_bytealign (w2[1], w2[2], offset);
      w7[0] = hc_bytealign (w2[0], w2[1], offset);
      w6[3] = hc_bytealign (w1[3], w2[0], offset);
      w6[2] = hc_bytealign (w1[2], w1[3], offset);
      w6[1] = hc_bytealign (w1[1], w1[2], offset);
      w6[0] = hc_bytealign (w1[0], w1[1], offset);
      w5[3] = hc_bytealign (w0[3], w1[0], offset);
      w5[2] = hc_bytealign (w0[2], w0[3], offset);
      w5[1] = hc_bytealign (w0[1], w0[2], offset);
      w5[0] = hc_bytealign (w0[0], w0[1], offset);
      w4[3] = hc_bytealign (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_bytealign (w7[3],     0, offset);
      c4[3] = hc_bytealign (w7[2], w7[3], offset);
      c4[2] = hc_bytealign (w7[1], w7[2], offset);
      c4[1] = hc_bytealign (w7[0], w7[1], offset);
      c4[0] = hc_bytealign (w6[3], w7[0], offset);
      c3[3] = hc_bytealign (w6[2], w6[3], offset);
      c3[2] = hc_bytealign (w6[1], w6[2], offset);
      c3[1] = hc_bytealign (w6[0], w6[1], offset);
      c3[0] = hc_bytealign (w5[3], w6[0], offset);
      c2[3] = hc_bytealign (w5[2], w5[3], offset);
      c2[2] = hc_bytealign (w5[1], w5[2], offset);
      c2[1] = hc_bytealign (w5[0], w5[1], offset);
      c2[0] = hc_bytealign (w4[3], w5[0], offset);
      c1[3] = hc_bytealign (w4[2], w4[3], offset);
      c1[2] = hc_bytealign (w4[1], w4[2], offset);
      c1[1] = hc_bytealign (w4[0], w4[1], offset);
      c1[0] = hc_bytealign (w3[3], w4[0], offset);
      c0[3] = hc_bytealign (w3[2], w3[3], offset);
      c0[2] = hc_bytealign (w3[1], w3[2], offset);
      c0[1] = hc_bytealign (w3[0], w3[1], offset);
      c0[0] = hc_bytealign (w2[3], w3[0], offset);
      w7[3] = hc_bytealign (w2[2], w2[3], offset);
      w7[2] = hc_bytealign (w2[1], w2[2], offset);
      w7[1] = hc_bytealign (w2[0], w2[1], offset);
      w7[0] = hc_bytealign (w1[3], w2[0], offset);
      w6[3] = hc_bytealign (w1[2], w1[3], offset);
      w6[2] = hc_bytealign (w1[1], w1[2], offset);
      w6[1] = hc_bytealign (w1[0], w1[1], offset);
      w6[0] = hc_bytealign (w0[3], w1[0], offset);
      w5[3] = hc_bytealign (w0[2], w0[3], offset);
      w5[2] = hc_bytealign (w0[1], w0[2], offset);
      w5[1] = hc_bytealign (w0[0], w0[1], offset);
      w5[0] = hc_bytealign (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_bytealign (w7[3],     0, offset);
      c5[0] = hc_bytealign (w7[2], w7[3], offset);
      c4[3] = hc_bytealign (w7[1], w7[2], offset);
      c4[2] = hc_bytealign (w7[0], w7[1], offset);
      c4[1] = hc_bytealign (w6[3], w7[0], offset);
      c4[0] = hc_bytealign (w6[2], w6[3], offset);
      c3[3] = hc_bytealign (w6[1], w6[2], offset);
      c3[2] = hc_bytealign (w6[0], w6[1], offset);
      c3[1] = hc_bytealign (w5[3], w6[0], offset);
      c3[0] = hc_bytealign (w5[2], w5[3], offset);
      c2[3] = hc_bytealign (w5[1], w5[2], offset);
      c2[2] = hc_bytealign (w5[0], w5[1], offset);
      c2[1] = hc_bytealign (w4[3], w5[0], offset);
      c2[0] = hc_bytealign (w4[2], w4[3], offset);
      c1[3] = hc_bytealign (w4[1], w4[2], offset);
      c1[2] = hc_bytealign (w4[0], w4[1], offset);
      c1[1] = hc_bytealign (w3[3], w4[0], offset);
      c1[0] = hc_bytealign (w3[2], w3[3], offset);
      c0[3] = hc_bytealign (w3[1], w3[2], offset);
      c0[2] = hc_bytealign (w3[0], w3[1], offset);
      c0[1] = hc_bytealign (w2[3], w3[0], offset);
      c0[0] = hc_bytealign (w2[2], w2[3], offset);
      w7[3] = hc_bytealign (w2[1], w2[2], offset);
      w7[2] = hc_bytealign (w2[0], w2[1], offset);
      w7[1] = hc_bytealign (w1[3], w2[0], offset);
      w7[0] = hc_bytealign (w1[2], w1[3], offset);
      w6[3] = hc_bytealign (w1[1], w1[2], offset);
      w6[2] = hc_bytealign (w1[0], w1[1], offset);
      w6[1] = hc_bytealign (w0[3], w1[0], offset);
      w6[0] = hc_bytealign (w0[2], w0[3], offset);
      w5[3] = hc_bytealign (w0[1], w0[2], offset);
      w5[2] = hc_bytealign (w0[0], w0[1], offset);
      w5[1] = hc_bytealign (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_bytealign (w7[3],     0, offset);
      c5[1] = hc_bytealign (w7[2], w7[3], offset);
      c5[0] = hc_bytealign (w7[1], w7[2], offset);
      c4[3] = hc_bytealign (w7[0], w7[1], offset);
      c4[2] = hc_bytealign (w6[3], w7[0], offset);
      c4[1] = hc_bytealign (w6[2], w6[3], offset);
      c4[0] = hc_bytealign (w6[1], w6[2], offset);
      c3[3] = hc_bytealign (w6[0], w6[1], offset);
      c3[2] = hc_bytealign (w5[3], w6[0], offset);
      c3[1] = hc_bytealign (w5[2], w5[3], offset);
      c3[0] = hc_bytealign (w5[1], w5[2], offset);
      c2[3] = hc_bytealign (w5[0], w5[1], offset);
      c2[2] = hc_bytealign (w4[3], w5[0], offset);
      c2[1] = hc_bytealign (w4[2], w4[3], offset);
      c2[0] = hc_bytealign (w4[1], w4[2], offset);
      c1[3] = hc_bytealign (w4[0], w4[1], offset);
      c1[2] = hc_bytealign (w3[3], w4[0], offset);
      c1[1] = hc_bytealign (w3[2], w3[3], offset);
      c1[0] = hc_bytealign (w3[1], w3[2], offset);
      c0[3] = hc_bytealign (w3[0], w3[1], offset);
      c0[2] = hc_bytealign (w2[3], w3[0], offset);
      c0[1] = hc_bytealign (w2[2], w2[3], offset);
      c0[0] = hc_bytealign (w2[1], w2[2], offset);
      w7[3] = hc_bytealign (w2[0], w2[1], offset);
      w7[2] = hc_bytealign (w1[3], w2[0], offset);
      w7[1] = hc_bytealign (w1[2], w1[3], offset);
      w7[0] = hc_bytealign (w1[1], w1[2], offset);
      w6[3] = hc_bytealign (w1[0], w1[1], offset);
      w6[2] = hc_bytealign (w0[3], w1[0], offset);
      w6[1] = hc_bytealign (w0[2], w0[3], offset);
      w6[0] = hc_bytealign (w0[1], w0[2], offset);
      w5[3] = hc_bytealign (w0[0], w0[1], offset);
      w5[2] = hc_bytealign (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_bytealign (w7[3],     0, offset);
      c5[2] = hc_bytealign (w7[2], w7[3], offset);
      c5[1] = hc_bytealign (w7[1], w7[2], offset);
      c5[0] = hc_bytealign (w7[0], w7[1], offset);
      c4[3] = hc_bytealign (w6[3], w7[0], offset);
      c4[2] = hc_bytealign (w6[2], w6[3], offset);
      c4[1] = hc_bytealign (w6[1], w6[2], offset);
      c4[0] = hc_bytealign (w6[0], w6[1], offset);
      c3[3] = hc_bytealign (w5[3], w6[0], offset);
      c3[2] = hc_bytealign (w5[2], w5[3], offset);
      c3[1] = hc_bytealign (w5[1], w5[2], offset);
      c3[0] = hc_bytealign (w5[0], w5[1], offset);
      c2[3] = hc_bytealign (w4[3], w5[0], offset);
      c2[2] = hc_bytealign (w4[2], w4[3], offset);
      c2[1] = hc_bytealign (w4[1], w4[2], offset);
      c2[0] = hc_bytealign (w4[0], w4[1], offset);
      c1[3] = hc_bytealign (w3[3], w4[0], offset);
      c1[2] = hc_bytealign (w3[2], w3[3], offset);
      c1[1] = hc_bytealign (w3[1], w3[2], offset);
      c1[0] = hc_bytealign (w3[0], w3[1], offset);
      c0[3] = hc_bytealign (w2[3], w3[0], offset);
      c0[2] = hc_bytealign (w2[2], w2[3], offset);
      c0[1] = hc_bytealign (w2[1], w2[2], offset);
      c0[0] = hc_bytealign (w2[0], w2[1], offset);
      w7[3] = hc_bytealign (w1[3], w2[0], offset);
      w7[2] = hc_bytealign (w1[2], w1[3], offset);
      w7[1] = hc_bytealign (w1[1], w1[2], offset);
      w7[0] = hc_bytealign (w1[0], w1[1], offset);
      w6[3] = hc_bytealign (w0[3], w1[0], offset);
      w6[2] = hc_bytealign (w0[2], w0[3], offset);
      w6[1] = hc_bytealign (w0[1], w0[2], offset);
      w6[0] = hc_bytealign (w0[0], w0[1], offset);
      w5[3] = hc_bytealign (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_bytealign (w7[3],     0, offset);
      c5[3] = hc_bytealign (w7[2], w7[3], offset);
      c5[2] = hc_bytealign (w7[1], w7[2], offset);
      c5[1] = hc_bytealign (w7[0], w7[1], offset);
      c5[0] = hc_bytealign (w6[3], w7[0], offset);
      c4[3] = hc_bytealign (w6[2], w6[3], offset);
      c4[2] = hc_bytealign (w6[1], w6[2], offset);
      c4[1] = hc_bytealign (w6[0], w6[1], offset);
      c4[0] = hc_bytealign (w5[3], w6[0], offset);
      c3[3] = hc_bytealign (w5[2], w5[3], offset);
      c3[2] = hc_bytealign (w5[1], w5[2], offset);
      c3[1] = hc_bytealign (w5[0], w5[1], offset);
      c3[0] = hc_bytealign (w4[3], w5[0], offset);
      c2[3] = hc_bytealign (w4[2], w4[3], offset);
      c2[2] = hc_bytealign (w4[1], w4[2], offset);
      c2[1] = hc_bytealign (w4[0], w4[1], offset);
      c2[0] = hc_bytealign (w3[3], w4[0], offset);
      c1[3] = hc_bytealign (w3[2], w3[3], offset);
      c1[2] = hc_bytealign (w3[1], w3[2], offset);
      c1[1] = hc_bytealign (w3[0], w3[1], offset);
      c1[0] = hc_bytealign (w2[3], w3[0], offset);
      c0[3] = hc_bytealign (w2[2], w2[3], offset);
      c0[2] = hc_bytealign (w2[1], w2[2], offset);
      c0[1] = hc_bytealign (w2[0], w2[1], offset);
      c0[0] = hc_bytealign (w1[3], w2[0], offset);
      w7[3] = hc_bytealign (w1[2], w1[3], offset);
      w7[2] = hc_bytealign (w1[1], w1[2], offset);
      w7[1] = hc_bytealign (w1[0], w1[1], offset);
      w7[0] = hc_bytealign (w0[3], w1[0], offset);
      w6[3] = hc_bytealign (w0[2], w0[3], offset);
      w6[2] = hc_bytealign (w0[1], w0[2], offset);
      w6[1] = hc_bytealign (w0[0], w0[1], offset);
      w6[0] = hc_bytealign (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_bytealign (w7[3],     0, offset);
      c6[0] = hc_bytealign (w7[2], w7[3], offset);
      c5[3] = hc_bytealign (w7[1], w7[2], offset);
      c5[2] = hc_bytealign (w7[0], w7[1], offset);
      c5[1] = hc_bytealign (w6[3], w7[0], offset);
      c5[0] = hc_bytealign (w6[2], w6[3], offset);
      c4[3] = hc_bytealign (w6[1], w6[2], offset);
      c4[2] = hc_bytealign (w6[0], w6[1], offset);
      c4[1] = hc_bytealign (w5[3], w6[0], offset);
      c4[0] = hc_bytealign (w5[2], w5[3], offset);
      c3[3] = hc_bytealign (w5[1], w5[2], offset);
      c3[2] = hc_bytealign (w5[0], w5[1], offset);
      c3[1] = hc_bytealign (w4[3], w5[0], offset);
      c3[0] = hc_bytealign (w4[2], w4[3], offset);
      c2[3] = hc_bytealign (w4[1], w4[2], offset);
      c2[2] = hc_bytealign (w4[0], w4[1], offset);
      c2[1] = hc_bytealign (w3[3], w4[0], offset);
      c2[0] = hc_bytealign (w3[2], w3[3], offset);
      c1[3] = hc_bytealign (w3[1], w3[2], offset);
      c1[2] = hc_bytealign (w3[0], w3[1], offset);
      c1[1] = hc_bytealign (w2[3], w3[0], offset);
      c1[0] = hc_bytealign (w2[2], w2[3], offset);
      c0[3] = hc_bytealign (w2[1], w2[2], offset);
      c0[2] = hc_bytealign (w2[0], w2[1], offset);
      c0[1] = hc_bytealign (w1[3], w2[0], offset);
      c0[0] = hc_bytealign (w1[2], w1[3], offset);
      w7[3] = hc_bytealign (w1[1], w1[2], offset);
      w7[2] = hc_bytealign (w1[0], w1[1], offset);
      w7[1] = hc_bytealign (w0[3], w1[0], offset);
      w7[0] = hc_bytealign (w0[2], w0[3], offset);
      w6[3] = hc_bytealign (w0[1], w0[2], offset);
      w6[2] = hc_bytealign (w0[0], w0[1], offset);
      w6[1] = hc_bytealign (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_bytealign (w7[3],     0, offset);
      c6[1] = hc_bytealign (w7[2], w7[3], offset);
      c6[0] = hc_bytealign (w7[1], w7[2], offset);
      c5[3] = hc_bytealign (w7[0], w7[1], offset);
      c5[2] = hc_bytealign (w6[3], w7[0], offset);
      c5[1] = hc_bytealign (w6[2], w6[3], offset);
      c5[0] = hc_bytealign (w6[1], w6[2], offset);
      c4[3] = hc_bytealign (w6[0], w6[1], offset);
      c4[2] = hc_bytealign (w5[3], w6[0], offset);
      c4[1] = hc_bytealign (w5[2], w5[3], offset);
      c4[0] = hc_bytealign (w5[1], w5[2], offset);
      c3[3] = hc_bytealign (w5[0], w5[1], offset);
      c3[2] = hc_bytealign (w4[3], w5[0], offset);
      c3[1] = hc_bytealign (w4[2], w4[3], offset);
      c3[0] = hc_bytealign (w4[1], w4[2], offset);
      c2[3] = hc_bytealign (w4[0], w4[1], offset);
      c2[2] = hc_bytealign (w3[3], w4[0], offset);
      c2[1] = hc_bytealign (w3[2], w3[3], offset);
      c2[0] = hc_bytealign (w3[1], w3[2], offset);
      c1[3] = hc_bytealign (w3[0], w3[1], offset);
      c1[2] = hc_bytealign (w2[3], w3[0], offset);
      c1[1] = hc_bytealign (w2[2], w2[3], offset);
      c1[0] = hc_bytealign (w2[1], w2[2], offset);
      c0[3] = hc_bytealign (w2[0], w2[1], offset);
      c0[2] = hc_bytealign (w1[3], w2[0], offset);
      c0[1] = hc_bytealign (w1[2], w1[3], offset);
      c0[0] = hc_bytealign (w1[1], w1[2], offset);
      w7[3] = hc_bytealign (w1[0], w1[1], offset);
      w7[2] = hc_bytealign (w0[3], w1[0], offset);
      w7[1] = hc_bytealign (w0[2], w0[3], offset);
      w7[0] = hc_bytealign (w0[1], w0[2], offset);
      w6[3] = hc_bytealign (w0[0], w0[1], offset);
      w6[2] = hc_bytealign (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_bytealign (w7[3],     0, offset);
      c6[2] = hc_bytealign (w7[2], w7[3], offset);
      c6[1] = hc_bytealign (w7[1], w7[2], offset);
      c6[0] = hc_bytealign (w7[0], w7[1], offset);
      c5[3] = hc_bytealign (w6[3], w7[0], offset);
      c5[2] = hc_bytealign (w6[2], w6[3], offset);
      c5[1] = hc_bytealign (w6[1], w6[2], offset);
      c5[0] = hc_bytealign (w6[0], w6[1], offset);
      c4[3] = hc_bytealign (w5[3], w6[0], offset);
      c4[2] = hc_bytealign (w5[2], w5[3], offset);
      c4[1] = hc_bytealign (w5[1], w5[2], offset);
      c4[0] = hc_bytealign (w5[0], w5[1], offset);
      c3[3] = hc_bytealign (w4[3], w5[0], offset);
      c3[2] = hc_bytealign (w4[2], w4[3], offset);
      c3[1] = hc_bytealign (w4[1], w4[2], offset);
      c3[0] = hc_bytealign (w4[0], w4[1], offset);
      c2[3] = hc_bytealign (w3[3], w4[0], offset);
      c2[2] = hc_bytealign (w3[2], w3[3], offset);
      c2[1] = hc_bytealign (w3[1], w3[2], offset);
      c2[0] = hc_bytealign (w3[0], w3[1], offset);
      c1[3] = hc_bytealign (w2[3], w3[0], offset);
      c1[2] = hc_bytealign (w2[2], w2[3], offset);
      c1[1] = hc_bytealign (w2[1], w2[2], offset);
      c1[0] = hc_bytealign (w2[0], w2[1], offset);
      c0[3] = hc_bytealign (w1[3], w2[0], offset);
      c0[2] = hc_bytealign (w1[2], w1[3], offset);
      c0[1] = hc_bytealign (w1[1], w1[2], offset);
      c0[0] = hc_bytealign (w1[0], w1[1], offset);
      w7[3] = hc_bytealign (w0[3], w1[0], offset);
      w7[2] = hc_bytealign (w0[2], w0[3], offset);
      w7[1] = hc_bytealign (w0[1], w0[2], offset);
      w7[0] = hc_bytealign (w0[0], w0[1], offset);
      w6[3] = hc_bytealign (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_bytealign (w7[3],     0, offset);
      c6[3] = hc_bytealign (w7[2], w7[3], offset);
      c6[2] = hc_bytealign (w7[1], w7[2], offset);
      c6[1] = hc_bytealign (w7[0], w7[1], offset);
      c6[0] = hc_bytealign (w6[3], w7[0], offset);
      c5[3] = hc_bytealign (w6[2], w6[3], offset);
      c5[2] = hc_bytealign (w6[1], w6[2], offset);
      c5[1] = hc_bytealign (w6[0], w6[1], offset);
      c5[0] = hc_bytealign (w5[3], w6[0], offset);
      c4[3] = hc_bytealign (w5[2], w5[3], offset);
      c4[2] = hc_bytealign (w5[1], w5[2], offset);
      c4[1] = hc_bytealign (w5[0], w5[1], offset);
      c4[0] = hc_bytealign (w4[3], w5[0], offset);
      c3[3] = hc_bytealign (w4[2], w4[3], offset);
      c3[2] = hc_bytealign (w4[1], w4[2], offset);
      c3[1] = hc_bytealign (w4[0], w4[1], offset);
      c3[0] = hc_bytealign (w3[3], w4[0], offset);
      c2[3] = hc_bytealign (w3[2], w3[3], offset);
      c2[2] = hc_bytealign (w3[1], w3[2], offset);
      c2[1] = hc_bytealign (w3[0], w3[1], offset);
      c2[0] = hc_bytealign (w2[3], w3[0], offset);
      c1[3] = hc_bytealign (w2[2], w2[3], offset);
      c1[2] = hc_bytealign (w2[1], w2[2], offset);
      c1[1] = hc_bytealign (w2[0], w2[1], offset);
      c1[0] = hc_bytealign (w1[3], w2[0], offset);
      c0[3] = hc_bytealign (w1[2], w1[3], offset);
      c0[2] = hc_bytealign (w1[1], w1[2], offset);
      c0[1] = hc_bytealign (w1[0], w1[1], offset);
      c0[0] = hc_bytealign (w0[3], w1[0], offset);
      w7[3] = hc_bytealign (w0[2], w0[3], offset);
      w7[2] = hc_bytealign (w0[1], w0[2], offset);
      w7[1] = hc_bytealign (w0[0], w0[1], offset);
      w7[0] = hc_bytealign (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_bytealign (w7[3],     0, offset);
      c7[0] = hc_bytealign (w7[2], w7[3], offset);
      c6[3] = hc_bytealign (w7[1], w7[2], offset);
      c6[2] = hc_bytealign (w7[0], w7[1], offset);
      c6[1] = hc_bytealign (w6[3], w7[0], offset);
      c6[0] = hc_bytealign (w6[2], w6[3], offset);
      c5[3] = hc_bytealign (w6[1], w6[2], offset);
      c5[2] = hc_bytealign (w6[0], w6[1], offset);
      c5[1] = hc_bytealign (w5[3], w6[0], offset);
      c5[0] = hc_bytealign (w5[2], w5[3], offset);
      c4[3] = hc_bytealign (w5[1], w5[2], offset);
      c4[2] = hc_bytealign (w5[0], w5[1], offset);
      c4[1] = hc_bytealign (w4[3], w5[0], offset);
      c4[0] = hc_bytealign (w4[2], w4[3], offset);
      c3[3] = hc_bytealign (w4[1], w4[2], offset);
      c3[2] = hc_bytealign (w4[0], w4[1], offset);
      c3[1] = hc_bytealign (w3[3], w4[0], offset);
      c3[0] = hc_bytealign (w3[2], w3[3], offset);
      c2[3] = hc_bytealign (w3[1], w3[2], offset);
      c2[2] = hc_bytealign (w3[0], w3[1], offset);
      c2[1] = hc_bytealign (w2[3], w3[0], offset);
      c2[0] = hc_bytealign (w2[2], w2[3], offset);
      c1[3] = hc_bytealign (w2[1], w2[2], offset);
      c1[2] = hc_bytealign (w2[0], w2[1], offset);
      c1[1] = hc_bytealign (w1[3], w2[0], offset);
      c1[0] = hc_bytealign (w1[2], w1[3], offset);
      c0[3] = hc_bytealign (w1[1], w1[2], offset);
      c0[2] = hc_bytealign (w1[0], w1[1], offset);
      c0[1] = hc_bytealign (w0[3], w1[0], offset);
      c0[0] = hc_bytealign (w0[2], w0[3], offset);
      w7[3] = hc_bytealign (w0[1], w0[2], offset);
      w7[2] = hc_bytealign (w0[0], w0[1], offset);
      w7[1] = hc_bytealign (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_bytealign (w7[3],     0, offset);
      c7[1] = hc_bytealign (w7[2], w7[3], offset);
      c7[0] = hc_bytealign (w7[1], w7[2], offset);
      c6[3] = hc_bytealign (w7[0], w7[1], offset);
      c6[2] = hc_bytealign (w6[3], w7[0], offset);
      c6[1] = hc_bytealign (w6[2], w6[3], offset);
      c6[0] = hc_bytealign (w6[1], w6[2], offset);
      c5[3] = hc_bytealign (w6[0], w6[1], offset);
      c5[2] = hc_bytealign (w5[3], w6[0], offset);
      c5[1] = hc_bytealign (w5[2], w5[3], offset);
      c5[0] = hc_bytealign (w5[1], w5[2], offset);
      c4[3] = hc_bytealign (w5[0], w5[1], offset);
      c4[2] = hc_bytealign (w4[3], w5[0], offset);
      c4[1] = hc_bytealign (w4[2], w4[3], offset);
      c4[0] = hc_bytealign (w4[1], w4[2], offset);
      c3[3] = hc_bytealign (w4[0], w4[1], offset);
      c3[2] = hc_bytealign (w3[3], w4[0], offset);
      c3[1] = hc_bytealign (w3[2], w3[3], offset);
      c3[0] = hc_bytealign (w3[1], w3[2], offset);
      c2[3] = hc_bytealign (w3[0], w3[1], offset);
      c2[2] = hc_bytealign (w2[3], w3[0], offset);
      c2[1] = hc_bytealign (w2[2], w2[3], offset);
      c2[0] = hc_bytealign (w2[1], w2[2], offset);
      c1[3] = hc_bytealign (w2[0], w2[1], offset);
      c1[2] = hc_bytealign (w1[3], w2[0], offset);
      c1[1] = hc_bytealign (w1[2], w1[3], offset);
      c1[0] = hc_bytealign (w1[1], w1[2], offset);
      c0[3] = hc_bytealign (w1[0], w1[1], offset);
      c0[2] = hc_bytealign (w0[3], w1[0], offset);
      c0[1] = hc_bytealign (w0[2], w0[3], offset);
      c0[0] = hc_bytealign (w0[1], w0[2], offset);
      w7[3] = hc_bytealign (w0[0], w0[1], offset);
      w7[2] = hc_bytealign (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_bytealign (w7[3],     0, offset);
      c7[2] = hc_bytealign (w7[2], w7[3], offset);
      c7[1] = hc_bytealign (w7[1], w7[2], offset);
      c7[0] = hc_bytealign (w7[0], w7[1], offset);
      c6[3] = hc_bytealign (w6[3], w7[0], offset);
      c6[2] = hc_bytealign (w6[2], w6[3], offset);
      c6[1] = hc_bytealign (w6[1], w6[2], offset);
      c6[0] = hc_bytealign (w6[0], w6[1], offset);
      c5[3] = hc_bytealign (w5[3], w6[0], offset);
      c5[2] = hc_bytealign (w5[2], w5[3], offset);
      c5[1] = hc_bytealign (w5[1], w5[2], offset);
      c5[0] = hc_bytealign (w5[0], w5[1], offset);
      c4[3] = hc_bytealign (w4[3], w5[0], offset);
      c4[2] = hc_bytealign (w4[2], w4[3], offset);
      c4[1] = hc_bytealign (w4[1], w4[2], offset);
      c4[0] = hc_bytealign (w4[0], w4[1], offset);
      c3[3] = hc_bytealign (w3[3], w4[0], offset);
      c3[2] = hc_bytealign (w3[2], w3[3], offset);
      c3[1] = hc_bytealign (w3[1], w3[2], offset);
      c3[0] = hc_bytealign (w3[0], w3[1], offset);
      c2[3] = hc_bytealign (w2[3], w3[0], offset);
      c2[2] = hc_bytealign (w2[2], w2[3], offset);
      c2[1] = hc_bytealign (w2[1], w2[2], offset);
      c2[0] = hc_bytealign (w2[0], w2[1], offset);
      c1[3] = hc_bytealign (w1[3], w2[0], offset);
      c1[2] = hc_bytealign (w1[2], w1[3], offset);
      c1[1] = hc_bytealign (w1[1], w1[2], offset);
      c1[0] = hc_bytealign (w1[0], w1[1], offset);
      c0[3] = hc_bytealign (w0[3], w1[0], offset);
      c0[2] = hc_bytealign (w0[2], w0[3], offset);
      c0[1] = hc_bytealign (w0[1], w0[2], offset);
      c0[0] = hc_bytealign (w0[0], w0[1], offset);
      w7[3] = hc_bytealign (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_byte_perm (    0, w7[3], selector);
      w7[3] = hc_byte_perm (w7[3], w7[2], selector);
      w7[2] = hc_byte_perm (w7[2], w7[1], selector);
      w7[1] = hc_byte_perm (w7[1], w7[0], selector);
      w7[0] = hc_byte_perm (w7[0], w6[3], selector);
      w6[3] = hc_byte_perm (w6[3], w6[2], selector);
      w6[2] = hc_byte_perm (w6[2], w6[1], selector);
      w6[1] = hc_byte_perm (w6[1], w6[0], selector);
      w6[0] = hc_byte_perm (w6[0], w5[3], selector);
      w5[3] = hc_byte_perm (w5[3], w5[2], selector);
      w5[2] = hc_byte_perm (w5[2], w5[1], selector);
      w5[1] = hc_byte_perm (w5[1], w5[0], selector);
      w5[0] = hc_byte_perm (w5[0], w4[3], selector);
      w4[3] = hc_byte_perm (w4[3], w4[2], selector);
      w4[2] = hc_byte_perm (w4[2], w4[1], selector);
      w4[1] = hc_byte_perm (w4[1], w4[0], selector);
      w4[0] = hc_byte_perm (w4[0], w3[3], selector);
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      c0[1] = hc_byte_perm (    0, w7[3], selector);
      c0[0] = hc_byte_perm (w7[3], w7[2], selector);
      w7[3] = hc_byte_perm (w7[2], w7[1], selector);
      w7[2] = hc_byte_perm (w7[1], w7[0], selector);
      w7[1] = hc_byte_perm (w7[0], w6[3], selector);
      w7[0] = hc_byte_perm (w6[3], w6[2], selector);
      w6[3] = hc_byte_perm (w6[2], w6[1], selector);
      w6[2] = hc_byte_perm (w6[1], w6[0], selector);
      w6[1] = hc_byte_perm (w6[0], w5[3], selector);
      w6[0] = hc_byte_perm (w5[3], w5[2], selector);
      w5[3] = hc_byte_perm (w5[2], w5[1], selector);
      w5[2] = hc_byte_perm (w5[1], w5[0], selector);
      w5[1] = hc_byte_perm (w5[0], w4[3], selector);
      w5[0] = hc_byte_perm (w4[3], w4[2], selector);
      w4[3] = hc_byte_perm (w4[2], w4[1], selector);
      w4[2] = hc_byte_perm (w4[1], w4[0], selector);
      w4[1] = hc_byte_perm (w4[0], w3[3], selector);
      w4[0] = hc_byte_perm (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_byte_perm (    0, w7[3], selector);
      c0[1] = hc_byte_perm (w7[3], w7[2], selector);
      c0[0] = hc_byte_perm (w7[2], w7[1], selector);
      w7[3] = hc_byte_perm (w7[1], w7[0], selector);
      w7[2] = hc_byte_perm (w7[0], w6[3], selector);
      w7[1] = hc_byte_perm (w6[3], w6[2], selector);
      w7[0] = hc_byte_perm (w6[2], w6[1], selector);
      w6[3] = hc_byte_perm (w6[1], w6[0], selector);
      w6[2] = hc_byte_perm (w6[0], w5[3], selector);
      w6[1] = hc_byte_perm (w5[3], w5[2], selector);
      w6[0] = hc_byte_perm (w5[2], w5[1], selector);
      w5[3] = hc_byte_perm (w5[1], w5[0], selector);
      w5[2] = hc_byte_perm (w5[0], w4[3], selector);
      w5[1] = hc_byte_perm (w4[3], w4[2], selector);
      w5[0] = hc_byte_perm (w4[2], w4[1], selector);
      w4[3] = hc_byte_perm (w4[1], w4[0], selector);
      w4[2] = hc_byte_perm (w4[0], w3[3], selector);
      w4[1] = hc_byte_perm (w3[3], w3[2], selector);
      w4[0] = hc_byte_perm (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_byte_perm (    0, w7[3], selector);
      c0[2] = hc_byte_perm (w7[3], w7[2], selector);
      c0[1] = hc_byte_perm (w7[2], w7[1], selector);
      c0[0] = hc_byte_perm (w7[1], w7[0], selector);
      w7[3] = hc_byte_perm (w7[0], w6[3], selector);
      w7[2] = hc_byte_perm (w6[3], w6[2], selector);
      w7[1] = hc_byte_perm (w6[2], w6[1], selector);
      w7[0] = hc_byte_perm (w6[1], w6[0], selector);
      w6[3] = hc_byte_perm (w6[0], w5[3], selector);
      w6[2] = hc_byte_perm (w5[3], w5[2], selector);
      w6[1] = hc_byte_perm (w5[2], w5[1], selector);
      w6[0] = hc_byte_perm (w5[1], w5[0], selector);
      w5[3] = hc_byte_perm (w5[0], w4[3], selector);
      w5[2] = hc_byte_perm (w4[3], w4[2], selector);
      w5[1] = hc_byte_perm (w4[2], w4[1], selector);
      w5[0] = hc_byte_perm (w4[1], w4[0], selector);
      w4[3] = hc_byte_perm (w4[0], w3[3], selector);
      w4[2] = hc_byte_perm (w3[3], w3[2], selector);
      w4[1] = hc_byte_perm (w3[2], w3[1], selector);
      w4[0] = hc_byte_perm (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_byte_perm (    0, w7[3], selector);
      c0[3] = hc_byte_perm (w7[3], w7[2], selector);
      c0[2] = hc_byte_perm (w7[2], w7[1], selector);
      c0[1] = hc_byte_perm (w7[1], w7[0], selector);
      c0[0] = hc_byte_perm (w7[0], w6[3], selector);
      w7[3] = hc_byte_perm (w6[3], w6[2], selector);
      w7[2] = hc_byte_perm (w6[2], w6[1], selector);
      w7[1] = hc_byte_perm (w6[1], w6[0], selector);
      w7[0] = hc_byte_perm (w6[0], w5[3], selector);
      w6[3] = hc_byte_perm (w5[3], w5[2], selector);
      w6[2] = hc_byte_perm (w5[2], w5[1], selector);
      w6[1] = hc_byte_perm (w5[1], w5[0], selector);
      w6[0] = hc_byte_perm (w5[0], w4[3], selector);
      w5[3] = hc_byte_perm (w4[3], w4[2], selector);
      w5[2] = hc_byte_perm (w4[2], w4[1], selector);
      w5[1] = hc_byte_perm (w4[1], w4[0], selector);
      w5[0] = hc_byte_perm (w4[0], w3[3], selector);
      w4[3] = hc_byte_perm (w3[3], w3[2], selector);
      w4[2] = hc_byte_perm (w3[2], w3[1], selector);
      w4[1] = hc_byte_perm (w3[1], w3[0], selector);
      w4[0] = hc_byte_perm (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_byte_perm (    0, w7[3], selector);
      c1[0] = hc_byte_perm (w7[3], w7[2], selector);
      c0[3] = hc_byte_perm (w7[2], w7[1], selector);
      c0[2] = hc_byte_perm (w7[1], w7[0], selector);
      c0[1] = hc_byte_perm (w7[0], w6[3], selector);
      c0[0] = hc_byte_perm (w6[3], w6[2], selector);
      w7[3] = hc_byte_perm (w6[2], w6[1], selector);
      w7[2] = hc_byte_perm (w6[1], w6[0], selector);
      w7[1] = hc_byte_perm (w6[0], w5[3], selector);
      w7[0] = hc_byte_perm (w5[3], w5[2], selector);
      w6[3] = hc_byte_perm (w5[2], w5[1], selector);
      w6[2] = hc_byte_perm (w5[1], w5[0], selector);
      w6[1] = hc_byte_perm (w5[0], w4[3], selector);
      w6[0] = hc_byte_perm (w4[3], w4[2], selector);
      w5[3] = hc_byte_perm (w4[2], w4[1], selector);
      w5[2] = hc_byte_perm (w4[1], w4[0], selector);
      w5[1] = hc_byte_perm (w4[0], w3[3], selector);
      w5[0] = hc_byte_perm (w3[3], w3[2], selector);
      w4[3] = hc_byte_perm (w3[2], w3[1], selector);
      w4[2] = hc_byte_perm (w3[1], w3[0], selector);
      w4[1] = hc_byte_perm (w3[0], w2[3], selector);
      w4[0] = hc_byte_perm (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_byte_perm (    0, w7[3], selector);
      c1[1] = hc_byte_perm (w7[3], w7[2], selector);
      c1[0] = hc_byte_perm (w7[2], w7[1], selector);
      c0[3] = hc_byte_perm (w7[1], w7[0], selector);
      c0[2] = hc_byte_perm (w7[0], w6[3], selector);
      c0[1] = hc_byte_perm (w6[3], w6[2], selector);
      c0[0] = hc_byte_perm (w6[2], w6[1], selector);
      w7[3] = hc_byte_perm (w6[1], w6[0], selector);
      w7[2] = hc_byte_perm (w6[0], w5[3], selector);
      w7[1] = hc_byte_perm (w5[3], w5[2], selector);
      w7[0] = hc_byte_perm (w5[2], w5[1], selector);
      w6[3] = hc_byte_perm (w5[1], w5[0], selector);
      w6[2] = hc_byte_perm (w5[0], w4[3], selector);
      w6[1] = hc_byte_perm (w4[3], w4[2], selector);
      w6[0] = hc_byte_perm (w4[2], w4[1], selector);
      w5[3] = hc_byte_perm (w4[1], w4[0], selector);
      w5[2] = hc_byte_perm (w4[0], w3[3], selector);
      w5[1] = hc_byte_perm (w3[3], w3[2], selector);
      w5[0] = hc_byte_perm (w3[2], w3[1], selector);
      w4[3] = hc_byte_perm (w3[1], w3[0], selector);
      w4[2] = hc_byte_perm (w3[0], w2[3], selector);
      w4[1] = hc_byte_perm (w2[3], w2[2], selector);
      w4[0] = hc_byte_perm (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_byte_perm (    0, w7[3], selector);
      c1[2] = hc_byte_perm (w7[3], w7[2], selector);
      c1[1] = hc_byte_perm (w7[2], w7[1], selector);
      c1[0] = hc_byte_perm (w7[1], w7[0], selector);
      c0[3] = hc_byte_perm (w7[0], w6[3], selector);
      c0[2] = hc_byte_perm (w6[3], w6[2], selector);
      c0[1] = hc_byte_perm (w6[2], w6[1], selector);
      c0[0] = hc_byte_perm (w6[1], w6[0], selector);
      w7[3] = hc_byte_perm (w6[0], w5[3], selector);
      w7[2] = hc_byte_perm (w5[3], w5[2], selector);
      w7[1] = hc_byte_perm (w5[2], w5[1], selector);
      w7[0] = hc_byte_perm (w5[1], w5[0], selector);
      w6[3] = hc_byte_perm (w5[0], w4[3], selector);
      w6[2] = hc_byte_perm (w4[3], w4[2], selector);
      w6[1] = hc_byte_perm (w4[2], w4[1], selector);
      w6[0] = hc_byte_perm (w4[1], w4[0], selector);
      w5[3] = hc_byte_perm (w4[0], w3[3], selector);
      w5[2] = hc_byte_perm (w3[3], w3[2], selector);
      w5[1] = hc_byte_perm (w3[2], w3[1], selector);
      w5[0] = hc_byte_perm (w3[1], w3[0], selector);
      w4[3] = hc_byte_perm (w3[0], w2[3], selector);
      w4[2] = hc_byte_perm (w2[3], w2[2], selector);
      w4[1] = hc_byte_perm (w2[2], w2[1], selector);
      w4[0] = hc_byte_perm (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_byte_perm (    0, w7[3], selector);
      c1[3] = hc_byte_perm (w7[3], w7[2], selector);
      c1[2] = hc_byte_perm (w7[2], w7[1], selector);
      c1[1] = hc_byte_perm (w7[1], w7[0], selector);
      c1[0] = hc_byte_perm (w7[0], w6[3], selector);
      c0[3] = hc_byte_perm (w6[3], w6[2], selector);
      c0[2] = hc_byte_perm (w6[2], w6[1], selector);
      c0[1] = hc_byte_perm (w6[1], w6[0], selector);
      c0[0] = hc_byte_perm (w6[0], w5[3], selector);
      w7[3] = hc_byte_perm (w5[3], w5[2], selector);
      w7[2] = hc_byte_perm (w5[2], w5[1], selector);
      w7[1] = hc_byte_perm (w5[1], w5[0], selector);
      w7[0] = hc_byte_perm (w5[0], w4[3], selector);
      w6[3] = hc_byte_perm (w4[3], w4[2], selector);
      w6[2] = hc_byte_perm (w4[2], w4[1], selector);
      w6[1] = hc_byte_perm (w4[1], w4[0], selector);
      w6[0] = hc_byte_perm (w4[0], w3[3], selector);
      w5[3] = hc_byte_perm (w3[3], w3[2], selector);
      w5[2] = hc_byte_perm (w3[2], w3[1], selector);
      w5[1] = hc_byte_perm (w3[1], w3[0], selector);
      w5[0] = hc_byte_perm (w3[0], w2[3], selector);
      w4[3] = hc_byte_perm (w2[3], w2[2], selector);
      w4[2] = hc_byte_perm (w2[2], w2[1], selector);
      w4[1] = hc_byte_perm (w2[1], w2[0], selector);
      w4[0] = hc_byte_perm (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_byte_perm (    0, w7[3], selector);
      c2[0] = hc_byte_perm (w7[3], w7[2], selector);
      c1[3] = hc_byte_perm (w7[2], w7[1], selector);
      c1[2] = hc_byte_perm (w7[1], w7[0], selector);
      c1[1] = hc_byte_perm (w7[0], w6[3], selector);
      c1[0] = hc_byte_perm (w6[3], w6[2], selector);
      c0[3] = hc_byte_perm (w6[2], w6[1], selector);
      c0[2] = hc_byte_perm (w6[1], w6[0], selector);
      c0[1] = hc_byte_perm (w6[0], w5[3], selector);
      c0[0] = hc_byte_perm (w5[3], w5[2], selector);
      w7[3] = hc_byte_perm (w5[2], w5[1], selector);
      w7[2] = hc_byte_perm (w5[1], w5[0], selector);
      w7[1] = hc_byte_perm (w5[0], w4[3], selector);
      w7[0] = hc_byte_perm (w4[3], w4[2], selector);
      w6[3] = hc_byte_perm (w4[2], w4[1], selector);
      w6[2] = hc_byte_perm (w4[1], w4[0], selector);
      w6[1] = hc_byte_perm (w4[0], w3[3], selector);
      w6[0] = hc_byte_perm (w3[3], w3[2], selector);
      w5[3] = hc_byte_perm (w3[2], w3[1], selector);
      w5[2] = hc_byte_perm (w3[1], w3[0], selector);
      w5[1] = hc_byte_perm (w3[0], w2[3], selector);
      w5[0] = hc_byte_perm (w2[3], w2[2], selector);
      w4[3] = hc_byte_perm (w2[2], w2[1], selector);
      w4[2] = hc_byte_perm (w2[1], w2[0], selector);
      w4[1] = hc_byte_perm (w2[0], w1[3], selector);
      w4[0] = hc_byte_perm (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_byte_perm (    0, w7[3], selector);
      c2[1] = hc_byte_perm (w7[3], w7[2], selector);
      c2[0] = hc_byte_perm (w7[2], w7[1], selector);
      c1[3] = hc_byte_perm (w7[1], w7[0], selector);
      c1[2] = hc_byte_perm (w7[0], w6[3], selector);
      c1[1] = hc_byte_perm (w6[3], w6[2], selector);
      c1[0] = hc_byte_perm (w6[2], w6[1], selector);
      c0[3] = hc_byte_perm (w6[1], w6[0], selector);
      c0[2] = hc_byte_perm (w6[0], w5[3], selector);
      c0[1] = hc_byte_perm (w5[3], w5[2], selector);
      c0[0] = hc_byte_perm (w5[2], w5[1], selector);
      w7[3] = hc_byte_perm (w5[1], w5[0], selector);
      w7[2] = hc_byte_perm (w5[0], w4[3], selector);
      w7[1] = hc_byte_perm (w4[3], w4[2], selector);
      w7[0] = hc_byte_perm (w4[2], w4[1], selector);
      w6[3] = hc_byte_perm (w4[1], w4[0], selector);
      w6[2] = hc_byte_perm (w4[0], w3[3], selector);
      w6[1] = hc_byte_perm (w3[3], w3[2], selector);
      w6[0] = hc_byte_perm (w3[2], w3[1], selector);
      w5[3] = hc_byte_perm (w3[1], w3[0], selector);
      w5[2] = hc_byte_perm (w3[0], w2[3], selector);
      w5[1] = hc_byte_perm (w2[3], w2[2], selector);
      w5[0] = hc_byte_perm (w2[2], w2[1], selector);
      w4[3] = hc_byte_perm (w2[1], w2[0], selector);
      w4[2] = hc_byte_perm (w2[0], w1[3], selector);
      w4[1] = hc_byte_perm (w1[3], w1[2], selector);
      w4[0] = hc_byte_perm (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_byte_perm (    0, w7[3], selector);
      c2[2] = hc_byte_perm (w7[3], w7[2], selector);
      c2[1] = hc_byte_perm (w7[2], w7[1], selector);
      c2[0] = hc_byte_perm (w7[1], w7[0], selector);
      c1[3] = hc_byte_perm (w7[0], w6[3], selector);
      c1[2] = hc_byte_perm (w6[3], w6[2], selector);
      c1[1] = hc_byte_perm (w6[2], w6[1], selector);
      c1[0] = hc_byte_perm (w6[1], w6[0], selector);
      c0[3] = hc_byte_perm (w6[0], w5[3], selector);
      c0[2] = hc_byte_perm (w5[3], w5[2], selector);
      c0[1] = hc_byte_perm (w5[2], w5[1], selector);
      c0[0] = hc_byte_perm (w5[1], w5[0], selector);
      w7[3] = hc_byte_perm (w5[0], w4[3], selector);
      w7[2] = hc_byte_perm (w4[3], w4[2], selector);
      w7[1] = hc_byte_perm (w4[2], w4[1], selector);
      w7[0] = hc_byte_perm (w4[1], w4[0], selector);
      w6[3] = hc_byte_perm (w4[0], w3[3], selector);
      w6[2] = hc_byte_perm (w3[3], w3[2], selector);
      w6[1] = hc_byte_perm (w3[2], w3[1], selector);
      w6[0] = hc_byte_perm (w3[1], w3[0], selector);
      w5[3] = hc_byte_perm (w3[0], w2[3], selector);
      w5[2] = hc_byte_perm (w2[3], w2[2], selector);
      w5[1] = hc_byte_perm (w2[2], w2[1], selector);
      w5[0] = hc_byte_perm (w2[1], w2[0], selector);
      w4[3] = hc_byte_perm (w2[0], w1[3], selector);
      w4[2] = hc_byte_perm (w1[3], w1[2], selector);
      w4[1] = hc_byte_perm (w1[2], w1[1], selector);
      w4[0] = hc_byte_perm (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_byte_perm (    0, w7[3], selector);
      c2[3] = hc_byte_perm (w7[3], w7[2], selector);
      c2[2] = hc_byte_perm (w7[2], w7[1], selector);
      c2[1] = hc_byte_perm (w7[1], w7[0], selector);
      c2[0] = hc_byte_perm (w7[0], w6[3], selector);
      c1[3] = hc_byte_perm (w6[3], w6[2], selector);
      c1[2] = hc_byte_perm (w6[2], w6[1], selector);
      c1[1] = hc_byte_perm (w6[1], w6[0], selector);
      c1[0] = hc_byte_perm (w6[0], w5[3], selector);
      c0[3] = hc_byte_perm (w5[3], w5[2], selector);
      c0[2] = hc_byte_perm (w5[2], w5[1], selector);
      c0[1] = hc_byte_perm (w5[1], w5[0], selector);
      c0[0] = hc_byte_perm (w5[0], w4[3], selector);
      w7[3] = hc_byte_perm (w4[3], w4[2], selector);
      w7[2] = hc_byte_perm (w4[2], w4[1], selector);
      w7[1] = hc_byte_perm (w4[1], w4[0], selector);
      w7[0] = hc_byte_perm (w4[0], w3[3], selector);
      w6[3] = hc_byte_perm (w3[3], w3[2], selector);
      w6[2] = hc_byte_perm (w3[2], w3[1], selector);
      w6[1] = hc_byte_perm (w3[1], w3[0], selector);
      w6[0] = hc_byte_perm (w3[0], w2[3], selector);
      w5[3] = hc_byte_perm (w2[3], w2[2], selector);
      w5[2] = hc_byte_perm (w2[2], w2[1], selector);
      w5[1] = hc_byte_perm (w2[1], w2[0], selector);
      w5[0] = hc_byte_perm (w2[0], w1[3], selector);
      w4[3] = hc_byte_perm (w1[3], w1[2], selector);
      w4[2] = hc_byte_perm (w1[2], w1[1], selector);
      w4[1] = hc_byte_perm (w1[1], w1[0], selector);
      w4[0] = hc_byte_perm (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_byte_perm (    0, w7[3], selector);
      c3[0] = hc_byte_perm (w7[3], w7[2], selector);
      c2[3] = hc_byte_perm (w7[2], w7[1], selector);
      c2[2] = hc_byte_perm (w7[1], w7[0], selector);
      c2[1] = hc_byte_perm (w7[0], w6[3], selector);
      c2[0] = hc_byte_perm (w6[3], w6[2], selector);
      c1[3] = hc_byte_perm (w6[2], w6[1], selector);
      c1[2] = hc_byte_perm (w6[1], w6[0], selector);
      c1[1] = hc_byte_perm (w6[0], w5[3], selector);
      c1[0] = hc_byte_perm (w5[3], w5[2], selector);
      c0[3] = hc_byte_perm (w5[2], w5[1], selector);
      c0[2] = hc_byte_perm (w5[1], w5[0], selector);
      c0[1] = hc_byte_perm (w5[0], w4[3], selector);
      c0[0] = hc_byte_perm (w4[3], w4[2], selector);
      w7[3] = hc_byte_perm (w4[2], w4[1], selector);
      w7[2] = hc_byte_perm (w4[1], w4[0], selector);
      w7[1] = hc_byte_perm (w4[0], w3[3], selector);
      w7[0] = hc_byte_perm (w3[3], w3[2], selector);
      w6[3] = hc_byte_perm (w3[2], w3[1], selector);
      w6[2] = hc_byte_perm (w3[1], w3[0], selector);
      w6[1] = hc_byte_perm (w3[0], w2[3], selector);
      w6[0] = hc_byte_perm (w2[3], w2[2], selector);
      w5[3] = hc_byte_perm (w2[2], w2[1], selector);
      w5[2] = hc_byte_perm (w2[1], w2[0], selector);
      w5[1] = hc_byte_perm (w2[0], w1[3], selector);
      w5[0] = hc_byte_perm (w1[3], w1[2], selector);
      w4[3] = hc_byte_perm (w1[2], w1[1], selector);
      w4[2] = hc_byte_perm (w1[1], w1[0], selector);
      w4[1] = hc_byte_perm (w1[0], w0[3], selector);
      w4[0] = hc_byte_perm (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_byte_perm (    0, w7[3], selector);
      c3[1] = hc_byte_perm (w7[3], w7[2], selector);
      c3[0] = hc_byte_perm (w7[2], w7[1], selector);
      c2[3] = hc_byte_perm (w7[1], w7[0], selector);
      c2[2] = hc_byte_perm (w7[0], w6[3], selector);
      c2[1] = hc_byte_perm (w6[3], w6[2], selector);
      c2[0] = hc_byte_perm (w6[2], w6[1], selector);
      c1[3] = hc_byte_perm (w6[1], w6[0], selector);
      c1[2] = hc_byte_perm (w6[0], w5[3], selector);
      c1[1] = hc_byte_perm (w5[3], w5[2], selector);
      c1[0] = hc_byte_perm (w5[2], w5[1], selector);
      c0[3] = hc_byte_perm (w5[1], w5[0], selector);
      c0[2] = hc_byte_perm (w5[0], w4[3], selector);
      c0[1] = hc_byte_perm (w4[3], w4[2], selector);
      c0[0] = hc_byte_perm (w4[2], w4[1], selector);
      w7[3] = hc_byte_perm (w4[1], w4[0], selector);
      w7[2] = hc_byte_perm (w4[0], w3[3], selector);
      w7[1] = hc_byte_perm (w3[3], w3[2], selector);
      w7[0] = hc_byte_perm (w3[2], w3[1], selector);
      w6[3] = hc_byte_perm (w3[1], w3[0], selector);
      w6[2] = hc_byte_perm (w3[0], w2[3], selector);
      w6[1] = hc_byte_perm (w2[3], w2[2], selector);
      w6[0] = hc_byte_perm (w2[2], w2[1], selector);
      w5[3] = hc_byte_perm (w2[1], w2[0], selector);
      w5[2] = hc_byte_perm (w2[0], w1[3], selector);
      w5[1] = hc_byte_perm (w1[3], w1[2], selector);
      w5[0] = hc_byte_perm (w1[2], w1[1], selector);
      w4[3] = hc_byte_perm (w1[1], w1[0], selector);
      w4[2] = hc_byte_perm (w1[0], w0[3], selector);
      w4[1] = hc_byte_perm (w0[3], w0[2], selector);
      w4[0] = hc_byte_perm (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_byte_perm (    0, w7[3], selector);
      c3[2] = hc_byte_perm (w7[3], w7[2], selector);
      c3[1] = hc_byte_perm (w7[2], w7[1], selector);
      c3[0] = hc_byte_perm (w7[1], w7[0], selector);
      c2[3] = hc_byte_perm (w7[0], w6[3], selector);
      c2[2] = hc_byte_perm (w6[3], w6[2], selector);
      c2[1] = hc_byte_perm (w6[2], w6[1], selector);
      c2[0] = hc_byte_perm (w6[1], w6[0], selector);
      c1[3] = hc_byte_perm (w6[0], w5[3], selector);
      c1[2] = hc_byte_perm (w5[3], w5[2], selector);
      c1[1] = hc_byte_perm (w5[2], w5[1], selector);
      c1[0] = hc_byte_perm (w5[1], w5[0], selector);
      c0[3] = hc_byte_perm (w5[0], w4[3], selector);
      c0[2] = hc_byte_perm (w4[3], w4[2], selector);
      c0[1] = hc_byte_perm (w4[2], w4[1], selector);
      c0[0] = hc_byte_perm (w4[1], w4[0], selector);
      w7[3] = hc_byte_perm (w4[0], w3[3], selector);
      w7[2] = hc_byte_perm (w3[3], w3[2], selector);
      w7[1] = hc_byte_perm (w3[2], w3[1], selector);
      w7[0] = hc_byte_perm (w3[1], w3[0], selector);
      w6[3] = hc_byte_perm (w3[0], w2[3], selector);
      w6[2] = hc_byte_perm (w2[3], w2[2], selector);
      w6[1] = hc_byte_perm (w2[2], w2[1], selector);
      w6[0] = hc_byte_perm (w2[1], w2[0], selector);
      w5[3] = hc_byte_perm (w2[0], w1[3], selector);
      w5[2] = hc_byte_perm (w1[3], w1[2], selector);
      w5[1] = hc_byte_perm (w1[2], w1[1], selector);
      w5[0] = hc_byte_perm (w1[1], w1[0], selector);
      w4[3] = hc_byte_perm (w1[0], w0[3], selector);
      w4[2] = hc_byte_perm (w0[3], w0[2], selector);
      w4[1] = hc_byte_perm (w0[2], w0[1], selector);
      w4[0] = hc_byte_perm (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_byte_perm (    0, w7[3], selector);
      c3[3] = hc_byte_perm (w7[3], w7[2], selector);
      c3[2] = hc_byte_perm (w7[2], w7[1], selector);
      c3[1] = hc_byte_perm (w7[1], w7[0], selector);
      c3[0] = hc_byte_perm (w7[0], w6[3], selector);
      c2[3] = hc_byte_perm (w6[3], w6[2], selector);
      c2[2] = hc_byte_perm (w6[2], w6[1], selector);
      c2[1] = hc_byte_perm (w6[1], w6[0], selector);
      c2[0] = hc_byte_perm (w6[0], w5[3], selector);
      c1[3] = hc_byte_perm (w5[3], w5[2], selector);
      c1[2] = hc_byte_perm (w5[2], w5[1], selector);
      c1[1] = hc_byte_perm (w5[1], w5[0], selector);
      c1[0] = hc_byte_perm (w5[0], w4[3], selector);
      c0[3] = hc_byte_perm (w4[3], w4[2], selector);
      c0[2] = hc_byte_perm (w4[2], w4[1], selector);
      c0[1] = hc_byte_perm (w4[1], w4[0], selector);
      c0[0] = hc_byte_perm (w4[0], w3[3], selector);
      w7[3] = hc_byte_perm (w3[3], w3[2], selector);
      w7[2] = hc_byte_perm (w3[2], w3[1], selector);
      w7[1] = hc_byte_perm (w3[1], w3[0], selector);
      w7[0] = hc_byte_perm (w3[0], w2[3], selector);
      w6[3] = hc_byte_perm (w2[3], w2[2], selector);
      w6[2] = hc_byte_perm (w2[2], w2[1], selector);
      w6[1] = hc_byte_perm (w2[1], w2[0], selector);
      w6[0] = hc_byte_perm (w2[0], w1[3], selector);
      w5[3] = hc_byte_perm (w1[3], w1[2], selector);
      w5[2] = hc_byte_perm (w1[2], w1[1], selector);
      w5[1] = hc_byte_perm (w1[1], w1[0], selector);
      w5[0] = hc_byte_perm (w1[0], w0[3], selector);
      w4[3] = hc_byte_perm (w0[3], w0[2], selector);
      w4[2] = hc_byte_perm (w0[2], w0[1], selector);
      w4[1] = hc_byte_perm (w0[1], w0[0], selector);
      w4[0] = hc_byte_perm (w0[0],     0, selector);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_byte_perm (    0, w7[3], selector);
      c4[0] = hc_byte_perm (w7[3], w7[2], selector);
      c3[3] = hc_byte_perm (w7[2], w7[1], selector);
      c3[2] = hc_byte_perm (w7[1], w7[0], selector);
      c3[1] = hc_byte_perm (w7[0], w6[3], selector);
      c3[0] = hc_byte_perm (w6[3], w6[2], selector);
      c2[3] = hc_byte_perm (w6[2], w6[1], selector);
      c2[2] = hc_byte_perm (w6[1], w6[0], selector);
      c2[1] = hc_byte_perm (w6[0], w5[3], selector);
      c2[0] = hc_byte_perm (w5[3], w5[2], selector);
      c1[3] = hc_byte_perm (w5[2], w5[1], selector);
      c1[2] = hc_byte_perm (w5[1], w5[0], selector);
      c1[1] = hc_byte_perm (w5[0], w4[3], selector);
      c1[0] = hc_byte_perm (w4[3], w4[2], selector);
      c0[3] = hc_byte_perm (w4[2], w4[1], selector);
      c0[2] = hc_byte_perm (w4[1], w4[0], selector);
      c0[1] = hc_byte_perm (w4[0], w3[3], selector);
      c0[0] = hc_byte_perm (w3[3], w3[2], selector);
      w7[3] = hc_byte_perm (w3[2], w3[1], selector);
      w7[2] = hc_byte_perm (w3[1], w3[0], selector);
      w7[1] = hc_byte_perm (w3[0], w2[3], selector);
      w7[0] = hc_byte_perm (w2[3], w2[2], selector);
      w6[3] = hc_byte_perm (w2[2], w2[1], selector);
      w6[2] = hc_byte_perm (w2[1], w2[0], selector);
      w6[1] = hc_byte_perm (w2[0], w1[3], selector);
      w6[0] = hc_byte_perm (w1[3], w1[2], selector);
      w5[3] = hc_byte_perm (w1[2], w1[1], selector);
      w5[2] = hc_byte_perm (w1[1], w1[0], selector);
      w5[1] = hc_byte_perm (w1[0], w0[3], selector);
      w5[0] = hc_byte_perm (w0[3], w0[2], selector);
      w4[3] = hc_byte_perm (w0[2], w0[1], selector);
      w4[2] = hc_byte_perm (w0[1], w0[0], selector);
      w4[1] = hc_byte_perm (w0[0],     0, selector);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_byte_perm (    0, w7[3], selector);
      c4[1] = hc_byte_perm (w7[3], w7[2], selector);
      c4[0] = hc_byte_perm (w7[2], w7[1], selector);
      c3[3] = hc_byte_perm (w7[1], w7[0], selector);
      c3[2] = hc_byte_perm (w7[0], w6[3], selector);
      c3[1] = hc_byte_perm (w6[3], w6[2], selector);
      c3[0] = hc_byte_perm (w6[2], w6[1], selector);
      c2[3] = hc_byte_perm (w6[1], w6[0], selector);
      c2[2] = hc_byte_perm (w6[0], w5[3], selector);
      c2[1] = hc_byte_perm (w5[3], w5[2], selector);
      c2[0] = hc_byte_perm (w5[2], w5[1], selector);
      c1[3] = hc_byte_perm (w5[1], w5[0], selector);
      c1[2] = hc_byte_perm (w5[0], w4[3], selector);
      c1[1] = hc_byte_perm (w4[3], w4[2], selector);
      c1[0] = hc_byte_perm (w4[2], w4[1], selector);
      c0[3] = hc_byte_perm (w4[1], w4[0], selector);
      c0[2] = hc_byte_perm (w4[0], w3[3], selector);
      c0[1] = hc_byte_perm (w3[3], w3[2], selector);
      c0[0] = hc_byte_perm (w3[2], w3[1], selector);
      w7[3] = hc_byte_perm (w3[1], w3[0], selector);
      w7[2] = hc_byte_perm (w3[0], w2[3], selector);
      w7[1] = hc_byte_perm (w2[3], w2[2], selector);
      w7[0] = hc_byte_perm (w2[2], w2[1], selector);
      w6[3] = hc_byte_perm (w2[1], w2[0], selector);
      w6[2] = hc_byte_perm (w2[0], w1[3], selector);
      w6[1] = hc_byte_perm (w1[3], w1[2], selector);
      w6[0] = hc_byte_perm (w1[2], w1[1], selector);
      w5[3] = hc_byte_perm (w1[1], w1[0], selector);
      w5[2] = hc_byte_perm (w1[0], w0[3], selector);
      w5[1] = hc_byte_perm (w0[3], w0[2], selector);
      w5[0] = hc_byte_perm (w0[2], w0[1], selector);
      w4[3] = hc_byte_perm (w0[1], w0[0], selector);
      w4[2] = hc_byte_perm (w0[0],     0, selector);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_byte_perm (    0, w7[3], selector);
      c4[2] = hc_byte_perm (w7[3], w7[2], selector);
      c4[1] = hc_byte_perm (w7[2], w7[1], selector);
      c4[0] = hc_byte_perm (w7[1], w7[0], selector);
      c3[3] = hc_byte_perm (w7[0], w6[3], selector);
      c3[2] = hc_byte_perm (w6[3], w6[2], selector);
      c3[1] = hc_byte_perm (w6[2], w6[1], selector);
      c3[0] = hc_byte_perm (w6[1], w6[0], selector);
      c2[3] = hc_byte_perm (w6[0], w5[3], selector);
      c2[2] = hc_byte_perm (w5[3], w5[2], selector);
      c2[1] = hc_byte_perm (w5[2], w5[1], selector);
      c2[0] = hc_byte_perm (w5[1], w5[0], selector);
      c1[3] = hc_byte_perm (w5[0], w4[3], selector);
      c1[2] = hc_byte_perm (w4[3], w4[2], selector);
      c1[1] = hc_byte_perm (w4[2], w4[1], selector);
      c1[0] = hc_byte_perm (w4[1], w4[0], selector);
      c0[3] = hc_byte_perm (w4[0], w3[3], selector);
      c0[2] = hc_byte_perm (w3[3], w3[2], selector);
      c0[1] = hc_byte_perm (w3[2], w3[1], selector);
      c0[0] = hc_byte_perm (w3[1], w3[0], selector);
      w7[3] = hc_byte_perm (w3[0], w2[3], selector);
      w7[2] = hc_byte_perm (w2[3], w2[2], selector);
      w7[1] = hc_byte_perm (w2[2], w2[1], selector);
      w7[0] = hc_byte_perm (w2[1], w2[0], selector);
      w6[3] = hc_byte_perm (w2[0], w1[3], selector);
      w6[2] = hc_byte_perm (w1[3], w1[2], selector);
      w6[1] = hc_byte_perm (w1[2], w1[1], selector);
      w6[0] = hc_byte_perm (w1[1], w1[0], selector);
      w5[3] = hc_byte_perm (w1[0], w0[3], selector);
      w5[2] = hc_byte_perm (w0[3], w0[2], selector);
      w5[1] = hc_byte_perm (w0[2], w0[1], selector);
      w5[0] = hc_byte_perm (w0[1], w0[0], selector);
      w4[3] = hc_byte_perm (w0[0],     0, selector);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_byte_perm (    0, w7[3], selector);
      c4[3] = hc_byte_perm (w7[3], w7[2], selector);
      c4[2] = hc_byte_perm (w7[2], w7[1], selector);
      c4[1] = hc_byte_perm (w7[1], w7[0], selector);
      c4[0] = hc_byte_perm (w7[0], w6[3], selector);
      c3[3] = hc_byte_perm (w6[3], w6[2], selector);
      c3[2] = hc_byte_perm (w6[2], w6[1], selector);
      c3[1] = hc_byte_perm (w6[1], w6[0], selector);
      c3[0] = hc_byte_perm (w6[0], w5[3], selector);
      c2[3] = hc_byte_perm (w5[3], w5[2], selector);
      c2[2] = hc_byte_perm (w5[2], w5[1], selector);
      c2[1] = hc_byte_perm (w5[1], w5[0], selector);
      c2[0] = hc_byte_perm (w5[0], w4[3], selector);
      c1[3] = hc_byte_perm (w4[3], w4[2], selector);
      c1[2] = hc_byte_perm (w4[2], w4[1], selector);
      c1[1] = hc_byte_perm (w4[1], w4[0], selector);
      c1[0] = hc_byte_perm (w4[0], w3[3], selector);
      c0[3] = hc_byte_perm (w3[3], w3[2], selector);
      c0[2] = hc_byte_perm (w3[2], w3[1], selector);
      c0[1] = hc_byte_perm (w3[1], w3[0], selector);
      c0[0] = hc_byte_perm (w3[0], w2[3], selector);
      w7[3] = hc_byte_perm (w2[3], w2[2], selector);
      w7[2] = hc_byte_perm (w2[2], w2[1], selector);
      w7[1] = hc_byte_perm (w2[1], w2[0], selector);
      w7[0] = hc_byte_perm (w2[0], w1[3], selector);
      w6[3] = hc_byte_perm (w1[3], w1[2], selector);
      w6[2] = hc_byte_perm (w1[2], w1[1], selector);
      w6[1] = hc_byte_perm (w1[1], w1[0], selector);
      w6[0] = hc_byte_perm (w1[0], w0[3], selector);
      w5[3] = hc_byte_perm (w0[3], w0[2], selector);
      w5[2] = hc_byte_perm (w0[2], w0[1], selector);
      w5[1] = hc_byte_perm (w0[1], w0[0], selector);
      w5[0] = hc_byte_perm (w0[0],     0, selector);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_byte_perm (    0, w7[3], selector);
      c5[0] = hc_byte_perm (w7[3], w7[2], selector);
      c4[3] = hc_byte_perm (w7[2], w7[1], selector);
      c4[2] = hc_byte_perm (w7[1], w7[0], selector);
      c4[1] = hc_byte_perm (w7[0], w6[3], selector);
      c4[0] = hc_byte_perm (w6[3], w6[2], selector);
      c3[3] = hc_byte_perm (w6[2], w6[1], selector);
      c3[2] = hc_byte_perm (w6[1], w6[0], selector);
      c3[1] = hc_byte_perm (w6[0], w5[3], selector);
      c3[0] = hc_byte_perm (w5[3], w5[2], selector);
      c2[3] = hc_byte_perm (w5[2], w5[1], selector);
      c2[2] = hc_byte_perm (w5[1], w5[0], selector);
      c2[1] = hc_byte_perm (w5[0], w4[3], selector);
      c2[0] = hc_byte_perm (w4[3], w4[2], selector);
      c1[3] = hc_byte_perm (w4[2], w4[1], selector);
      c1[2] = hc_byte_perm (w4[1], w4[0], selector);
      c1[1] = hc_byte_perm (w4[0], w3[3], selector);
      c1[0] = hc_byte_perm (w3[3], w3[2], selector);
      c0[3] = hc_byte_perm (w3[2], w3[1], selector);
      c0[2] = hc_byte_perm (w3[1], w3[0], selector);
      c0[1] = hc_byte_perm (w3[0], w2[3], selector);
      c0[0] = hc_byte_perm (w2[3], w2[2], selector);
      w7[3] = hc_byte_perm (w2[2], w2[1], selector);
      w7[2] = hc_byte_perm (w2[1], w2[0], selector);
      w7[1] = hc_byte_perm (w2[0], w1[3], selector);
      w7[0] = hc_byte_perm (w1[3], w1[2], selector);
      w6[3] = hc_byte_perm (w1[2], w1[1], selector);
      w6[2] = hc_byte_perm (w1[1], w1[0], selector);
      w6[1] = hc_byte_perm (w1[0], w0[3], selector);
      w6[0] = hc_byte_perm (w0[3], w0[2], selector);
      w5[3] = hc_byte_perm (w0[2], w0[1], selector);
      w5[2] = hc_byte_perm (w0[1], w0[0], selector);
      w5[1] = hc_byte_perm (w0[0],     0, selector);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_byte_perm (    0, w7[3], selector);
      c5[1] = hc_byte_perm (w7[3], w7[2], selector);
      c5[0] = hc_byte_perm (w7[2], w7[1], selector);
      c4[3] = hc_byte_perm (w7[1], w7[0], selector);
      c4[2] = hc_byte_perm (w7[0], w6[3], selector);
      c4[1] = hc_byte_perm (w6[3], w6[2], selector);
      c4[0] = hc_byte_perm (w6[2], w6[1], selector);
      c3[3] = hc_byte_perm (w6[1], w6[0], selector);
      c3[2] = hc_byte_perm (w6[0], w5[3], selector);
      c3[1] = hc_byte_perm (w5[3], w5[2], selector);
      c3[0] = hc_byte_perm (w5[2], w5[1], selector);
      c2[3] = hc_byte_perm (w5[1], w5[0], selector);
      c2[2] = hc_byte_perm (w5[0], w4[3], selector);
      c2[1] = hc_byte_perm (w4[3], w4[2], selector);
      c2[0] = hc_byte_perm (w4[2], w4[1], selector);
      c1[3] = hc_byte_perm (w4[1], w4[0], selector);
      c1[2] = hc_byte_perm (w4[0], w3[3], selector);
      c1[1] = hc_byte_perm (w3[3], w3[2], selector);
      c1[0] = hc_byte_perm (w3[2], w3[1], selector);
      c0[3] = hc_byte_perm (w3[1], w3[0], selector);
      c0[2] = hc_byte_perm (w3[0], w2[3], selector);
      c0[1] = hc_byte_perm (w2[3], w2[2], selector);
      c0[0] = hc_byte_perm (w2[2], w2[1], selector);
      w7[3] = hc_byte_perm (w2[1], w2[0], selector);
      w7[2] = hc_byte_perm (w2[0], w1[3], selector);
      w7[1] = hc_byte_perm (w1[3], w1[2], selector);
      w7[0] = hc_byte_perm (w1[2], w1[1], selector);
      w6[3] = hc_byte_perm (w1[1], w1[0], selector);
      w6[2] = hc_byte_perm (w1[0], w0[3], selector);
      w6[1] = hc_byte_perm (w0[3], w0[2], selector);
      w6[0] = hc_byte_perm (w0[2], w0[1], selector);
      w5[3] = hc_byte_perm (w0[1], w0[0], selector);
      w5[2] = hc_byte_perm (w0[0],     0, selector);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_byte_perm (    0, w7[3], selector);
      c5[2] = hc_byte_perm (w7[3], w7[2], selector);
      c5[1] = hc_byte_perm (w7[2], w7[1], selector);
      c5[0] = hc_byte_perm (w7[1], w7[0], selector);
      c4[3] = hc_byte_perm (w7[0], w6[3], selector);
      c4[2] = hc_byte_perm (w6[3], w6[2], selector);
      c4[1] = hc_byte_perm (w6[2], w6[1], selector);
      c4[0] = hc_byte_perm (w6[1], w6[0], selector);
      c3[3] = hc_byte_perm (w6[0], w5[3], selector);
      c3[2] = hc_byte_perm (w5[3], w5[2], selector);
      c3[1] = hc_byte_perm (w5[2], w5[1], selector);
      c3[0] = hc_byte_perm (w5[1], w5[0], selector);
      c2[3] = hc_byte_perm (w5[0], w4[3], selector);
      c2[2] = hc_byte_perm (w4[3], w4[2], selector);
      c2[1] = hc_byte_perm (w4[2], w4[1], selector);
      c2[0] = hc_byte_perm (w4[1], w4[0], selector);
      c1[3] = hc_byte_perm (w4[0], w3[3], selector);
      c1[2] = hc_byte_perm (w3[3], w3[2], selector);
      c1[1] = hc_byte_perm (w3[2], w3[1], selector);
      c1[0] = hc_byte_perm (w3[1], w3[0], selector);
      c0[3] = hc_byte_perm (w3[0], w2[3], selector);
      c0[2] = hc_byte_perm (w2[3], w2[2], selector);
      c0[1] = hc_byte_perm (w2[2], w2[1], selector);
      c0[0] = hc_byte_perm (w2[1], w2[0], selector);
      w7[3] = hc_byte_perm (w2[0], w1[3], selector);
      w7[2] = hc_byte_perm (w1[3], w1[2], selector);
      w7[1] = hc_byte_perm (w1[2], w1[1], selector);
      w7[0] = hc_byte_perm (w1[1], w1[0], selector);
      w6[3] = hc_byte_perm (w1[0], w0[3], selector);
      w6[2] = hc_byte_perm (w0[3], w0[2], selector);
      w6[1] = hc_byte_perm (w0[2], w0[1], selector);
      w6[0] = hc_byte_perm (w0[1], w0[0], selector);
      w5[3] = hc_byte_perm (w0[0],     0, selector);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_byte_perm (    0, w7[3], selector);
      c5[3] = hc_byte_perm (w7[3], w7[2], selector);
      c5[2] = hc_byte_perm (w7[2], w7[1], selector);
      c5[1] = hc_byte_perm (w7[1], w7[0], selector);
      c5[0] = hc_byte_perm (w7[0], w6[3], selector);
      c4[3] = hc_byte_perm (w6[3], w6[2], selector);
      c4[2] = hc_byte_perm (w6[2], w6[1], selector);
      c4[1] = hc_byte_perm (w6[1], w6[0], selector);
      c4[0] = hc_byte_perm (w6[0], w5[3], selector);
      c3[3] = hc_byte_perm (w5[3], w5[2], selector);
      c3[2] = hc_byte_perm (w5[2], w5[1], selector);
      c3[1] = hc_byte_perm (w5[1], w5[0], selector);
      c3[0] = hc_byte_perm (w5[0], w4[3], selector);
      c2[3] = hc_byte_perm (w4[3], w4[2], selector);
      c2[2] = hc_byte_perm (w4[2], w4[1], selector);
      c2[1] = hc_byte_perm (w4[1], w4[0], selector);
      c2[0] = hc_byte_perm (w4[0], w3[3], selector);
      c1[3] = hc_byte_perm (w3[3], w3[2], selector);
      c1[2] = hc_byte_perm (w3[2], w3[1], selector);
      c1[1] = hc_byte_perm (w3[1], w3[0], selector);
      c1[0] = hc_byte_perm (w3[0], w2[3], selector);
      c0[3] = hc_byte_perm (w2[3], w2[2], selector);
      c0[2] = hc_byte_perm (w2[2], w2[1], selector);
      c0[1] = hc_byte_perm (w2[1], w2[0], selector);
      c0[0] = hc_byte_perm (w2[0], w1[3], selector);
      w7[3] = hc_byte_perm (w1[3], w1[2], selector);
      w7[2] = hc_byte_perm (w1[2], w1[1], selector);
      w7[1] = hc_byte_perm (w1[1], w1[0], selector);
      w7[0] = hc_byte_perm (w1[0], w0[3], selector);
      w6[3] = hc_byte_perm (w0[3], w0[2], selector);
      w6[2] = hc_byte_perm (w0[2], w0[1], selector);
      w6[1] = hc_byte_perm (w0[1], w0[0], selector);
      w6[0] = hc_byte_perm (w0[0],     0, selector);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_byte_perm (    0, w7[3], selector);
      c6[0] = hc_byte_perm (w7[3], w7[2], selector);
      c5[3] = hc_byte_perm (w7[2], w7[1], selector);
      c5[2] = hc_byte_perm (w7[1], w7[0], selector);
      c5[1] = hc_byte_perm (w7[0], w6[3], selector);
      c5[0] = hc_byte_perm (w6[3], w6[2], selector);
      c4[3] = hc_byte_perm (w6[2], w6[1], selector);
      c4[2] = hc_byte_perm (w6[1], w6[0], selector);
      c4[1] = hc_byte_perm (w6[0], w5[3], selector);
      c4[0] = hc_byte_perm (w5[3], w5[2], selector);
      c3[3] = hc_byte_perm (w5[2], w5[1], selector);
      c3[2] = hc_byte_perm (w5[1], w5[0], selector);
      c3[1] = hc_byte_perm (w5[0], w4[3], selector);
      c3[0] = hc_byte_perm (w4[3], w4[2], selector);
      c2[3] = hc_byte_perm (w4[2], w4[1], selector);
      c2[2] = hc_byte_perm (w4[1], w4[0], selector);
      c2[1] = hc_byte_perm (w4[0], w3[3], selector);
      c2[0] = hc_byte_perm (w3[3], w3[2], selector);
      c1[3] = hc_byte_perm (w3[2], w3[1], selector);
      c1[2] = hc_byte_perm (w3[1], w3[0], selector);
      c1[1] = hc_byte_perm (w3[0], w2[3], selector);
      c1[0] = hc_byte_perm (w2[3], w2[2], selector);
      c0[3] = hc_byte_perm (w2[2], w2[1], selector);
      c0[2] = hc_byte_perm (w2[1], w2[0], selector);
      c0[1] = hc_byte_perm (w2[0], w1[3], selector);
      c0[0] = hc_byte_perm (w1[3], w1[2], selector);
      w7[3] = hc_byte_perm (w1[2], w1[1], selector);
      w7[2] = hc_byte_perm (w1[1], w1[0], selector);
      w7[1] = hc_byte_perm (w1[0], w0[3], selector);
      w7[0] = hc_byte_perm (w0[3], w0[2], selector);
      w6[3] = hc_byte_perm (w0[2], w0[1], selector);
      w6[2] = hc_byte_perm (w0[1], w0[0], selector);
      w6[1] = hc_byte_perm (w0[0],     0, selector);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_byte_perm (    0, w7[3], selector);
      c6[1] = hc_byte_perm (w7[3], w7[2], selector);
      c6[0] = hc_byte_perm (w7[2], w7[1], selector);
      c5[3] = hc_byte_perm (w7[1], w7[0], selector);
      c5[2] = hc_byte_perm (w7[0], w6[3], selector);
      c5[1] = hc_byte_perm (w6[3], w6[2], selector);
      c5[0] = hc_byte_perm (w6[2], w6[1], selector);
      c4[3] = hc_byte_perm (w6[1], w6[0], selector);
      c4[2] = hc_byte_perm (w6[0], w5[3], selector);
      c4[1] = hc_byte_perm (w5[3], w5[2], selector);
      c4[0] = hc_byte_perm (w5[2], w5[1], selector);
      c3[3] = hc_byte_perm (w5[1], w5[0], selector);
      c3[2] = hc_byte_perm (w5[0], w4[3], selector);
      c3[1] = hc_byte_perm (w4[3], w4[2], selector);
      c3[0] = hc_byte_perm (w4[2], w4[1], selector);
      c2[3] = hc_byte_perm (w4[1], w4[0], selector);
      c2[2] = hc_byte_perm (w4[0], w3[3], selector);
      c2[1] = hc_byte_perm (w3[3], w3[2], selector);
      c2[0] = hc_byte_perm (w3[2], w3[1], selector);
      c1[3] = hc_byte_perm (w3[1], w3[0], selector);
      c1[2] = hc_byte_perm (w3[0], w2[3], selector);
      c1[1] = hc_byte_perm (w2[3], w2[2], selector);
      c1[0] = hc_byte_perm (w2[2], w2[1], selector);
      c0[3] = hc_byte_perm (w2[1], w2[0], selector);
      c0[2] = hc_byte_perm (w2[0], w1[3], selector);
      c0[1] = hc_byte_perm (w1[3], w1[2], selector);
      c0[0] = hc_byte_perm (w1[2], w1[1], selector);
      w7[3] = hc_byte_perm (w1[1], w1[0], selector);
      w7[2] = hc_byte_perm (w1[0], w0[3], selector);
      w7[1] = hc_byte_perm (w0[3], w0[2], selector);
      w7[0] = hc_byte_perm (w0[2], w0[1], selector);
      w6[3] = hc_byte_perm (w0[1], w0[0], selector);
      w6[2] = hc_byte_perm (w0[0],     0, selector);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_byte_perm (    0, w7[3], selector);
      c6[2] = hc_byte_perm (w7[3], w7[2], selector);
      c6[1] = hc_byte_perm (w7[2], w7[1], selector);
      c6[0] = hc_byte_perm (w7[1], w7[0], selector);
      c5[3] = hc_byte_perm (w7[0], w6[3], selector);
      c5[2] = hc_byte_perm (w6[3], w6[2], selector);
      c5[1] = hc_byte_perm (w6[2], w6[1], selector);
      c5[0] = hc_byte_perm (w6[1], w6[0], selector);
      c4[3] = hc_byte_perm (w6[0], w5[3], selector);
      c4[2] = hc_byte_perm (w5[3], w5[2], selector);
      c4[1] = hc_byte_perm (w5[2], w5[1], selector);
      c4[0] = hc_byte_perm (w5[1], w5[0], selector);
      c3[3] = hc_byte_perm (w5[0], w4[3], selector);
      c3[2] = hc_byte_perm (w4[3], w4[2], selector);
      c3[1] = hc_byte_perm (w4[2], w4[1], selector);
      c3[0] = hc_byte_perm (w4[1], w4[0], selector);
      c2[3] = hc_byte_perm (w4[0], w3[3], selector);
      c2[2] = hc_byte_perm (w3[3], w3[2], selector);
      c2[1] = hc_byte_perm (w3[2], w3[1], selector);
      c2[0] = hc_byte_perm (w3[1], w3[0], selector);
      c1[3] = hc_byte_perm (w3[0], w2[3], selector);
      c1[2] = hc_byte_perm (w2[3], w2[2], selector);
      c1[1] = hc_byte_perm (w2[2], w2[1], selector);
      c1[0] = hc_byte_perm (w2[1], w2[0], selector);
      c0[3] = hc_byte_perm (w2[0], w1[3], selector);
      c0[2] = hc_byte_perm (w1[3], w1[2], selector);
      c0[1] = hc_byte_perm (w1[2], w1[1], selector);
      c0[0] = hc_byte_perm (w1[1], w1[0], selector);
      w7[3] = hc_byte_perm (w1[0], w0[3], selector);
      w7[2] = hc_byte_perm (w0[3], w0[2], selector);
      w7[1] = hc_byte_perm (w0[2], w0[1], selector);
      w7[0] = hc_byte_perm (w0[1], w0[0], selector);
      w6[3] = hc_byte_perm (w0[0],     0, selector);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_byte_perm (    0, w7[3], selector);
      c6[3] = hc_byte_perm (w7[3], w7[2], selector);
      c6[2] = hc_byte_perm (w7[2], w7[1], selector);
      c6[1] = hc_byte_perm (w7[1], w7[0], selector);
      c6[0] = hc_byte_perm (w7[0], w6[3], selector);
      c5[3] = hc_byte_perm (w6[3], w6[2], selector);
      c5[2] = hc_byte_perm (w6[2], w6[1], selector);
      c5[1] = hc_byte_perm (w6[1], w6[0], selector);
      c5[0] = hc_byte_perm (w6[0], w5[3], selector);
      c4[3] = hc_byte_perm (w5[3], w5[2], selector);
      c4[2] = hc_byte_perm (w5[2], w5[1], selector);
      c4[1] = hc_byte_perm (w5[1], w5[0], selector);
      c4[0] = hc_byte_perm (w5[0], w4[3], selector);
      c3[3] = hc_byte_perm (w4[3], w4[2], selector);
      c3[2] = hc_byte_perm (w4[2], w4[1], selector);
      c3[1] = hc_byte_perm (w4[1], w4[0], selector);
      c3[0] = hc_byte_perm (w4[0], w3[3], selector);
      c2[3] = hc_byte_perm (w3[3], w3[2], selector);
      c2[2] = hc_byte_perm (w3[2], w3[1], selector);
      c2[1] = hc_byte_perm (w3[1], w3[0], selector);
      c2[0] = hc_byte_perm (w3[0], w2[3], selector);
      c1[3] = hc_byte_perm (w2[3], w2[2], selector);
      c1[2] = hc_byte_perm (w2[2], w2[1], selector);
      c1[1] = hc_byte_perm (w2[1], w2[0], selector);
      c1[0] = hc_byte_perm (w2[0], w1[3], selector);
      c0[3] = hc_byte_perm (w1[3], w1[2], selector);
      c0[2] = hc_byte_perm (w1[2], w1[1], selector);
      c0[1] = hc_byte_perm (w1[1], w1[0], selector);
      c0[0] = hc_byte_perm (w1[0], w0[3], selector);
      w7[3] = hc_byte_perm (w0[3], w0[2], selector);
      w7[2] = hc_byte_perm (w0[2], w0[1], selector);
      w7[1] = hc_byte_perm (w0[1], w0[0], selector);
      w7[0] = hc_byte_perm (w0[0],     0, selector);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_byte_perm (    0, w7[3], selector);
      c7[0] = hc_byte_perm (w7[3], w7[2], selector);
      c6[3] = hc_byte_perm (w7[2], w7[1], selector);
      c6[2] = hc_byte_perm (w7[1], w7[0], selector);
      c6[1] = hc_byte_perm (w7[0], w6[3], selector);
      c6[0] = hc_byte_perm (w6[3], w6[2], selector);
      c5[3] = hc_byte_perm (w6[2], w6[1], selector);
      c5[2] = hc_byte_perm (w6[1], w6[0], selector);
      c5[1] = hc_byte_perm (w6[0], w5[3], selector);
      c5[0] = hc_byte_perm (w5[3], w5[2], selector);
      c4[3] = hc_byte_perm (w5[2], w5[1], selector);
      c4[2] = hc_byte_perm (w5[1], w5[0], selector);
      c4[1] = hc_byte_perm (w5[0], w4[3], selector);
      c4[0] = hc_byte_perm (w4[3], w4[2], selector);
      c3[3] = hc_byte_perm (w4[2], w4[1], selector);
      c3[2] = hc_byte_perm (w4[1], w4[0], selector);
      c3[1] = hc_byte_perm (w4[0], w3[3], selector);
      c3[0] = hc_byte_perm (w3[3], w3[2], selector);
      c2[3] = hc_byte_perm (w3[2], w3[1], selector);
      c2[2] = hc_byte_perm (w3[1], w3[0], selector);
      c2[1] = hc_byte_perm (w3[0], w2[3], selector);
      c2[0] = hc_byte_perm (w2[3], w2[2], selector);
      c1[3] = hc_byte_perm (w2[2], w2[1], selector);
      c1[2] = hc_byte_perm (w2[1], w2[0], selector);
      c1[1] = hc_byte_perm (w2[0], w1[3], selector);
      c1[0] = hc_byte_perm (w1[3], w1[2], selector);
      c0[3] = hc_byte_perm (w1[2], w1[1], selector);
      c0[2] = hc_byte_perm (w1[1], w1[0], selector);
      c0[1] = hc_byte_perm (w1[0], w0[3], selector);
      c0[0] = hc_byte_perm (w0[3], w0[2], selector);
      w7[3] = hc_byte_perm (w0[2], w0[1], selector);
      w7[2] = hc_byte_perm (w0[1], w0[0], selector);
      w7[1] = hc_byte_perm (w0[0],     0, selector);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_byte_perm (    0, w7[3], selector);
      c7[1] = hc_byte_perm (w7[3], w7[2], selector);
      c7[0] = hc_byte_perm (w7[2], w7[1], selector);
      c6[3] = hc_byte_perm (w7[1], w7[0], selector);
      c6[2] = hc_byte_perm (w7[0], w6[3], selector);
      c6[1] = hc_byte_perm (w6[3], w6[2], selector);
      c6[0] = hc_byte_perm (w6[2], w6[1], selector);
      c5[3] = hc_byte_perm (w6[1], w6[0], selector);
      c5[2] = hc_byte_perm (w6[0], w5[3], selector);
      c5[1] = hc_byte_perm (w5[3], w5[2], selector);
      c5[0] = hc_byte_perm (w5[2], w5[1], selector);
      c4[3] = hc_byte_perm (w5[1], w5[0], selector);
      c4[2] = hc_byte_perm (w5[0], w4[3], selector);
      c4[1] = hc_byte_perm (w4[3], w4[2], selector);
      c4[0] = hc_byte_perm (w4[2], w4[1], selector);
      c3[3] = hc_byte_perm (w4[1], w4[0], selector);
      c3[2] = hc_byte_perm (w4[0], w3[3], selector);
      c3[1] = hc_byte_perm (w3[3], w3[2], selector);
      c3[0] = hc_byte_perm (w3[2], w3[1], selector);
      c2[3] = hc_byte_perm (w3[1], w3[0], selector);
      c2[2] = hc_byte_perm (w3[0], w2[3], selector);
      c2[1] = hc_byte_perm (w2[3], w2[2], selector);
      c2[0] = hc_byte_perm (w2[2], w2[1], selector);
      c1[3] = hc_byte_perm (w2[1], w2[0], selector);
      c1[2] = hc_byte_perm (w2[0], w1[3], selector);
      c1[1] = hc_byte_perm (w1[3], w1[2], selector);
      c1[0] = hc_byte_perm (w1[2], w1[1], selector);
      c0[3] = hc_byte_perm (w1[1], w1[0], selector);
      c0[2] = hc_byte_perm (w1[0], w0[3], selector);
      c0[1] = hc_byte_perm (w0[3], w0[2], selector);
      c0[0] = hc_byte_perm (w0[2], w0[1], selector);
      w7[3] = hc_byte_perm (w0[1], w0[0], selector);
      w7[2] = hc_byte_perm (w0[0],     0, selector);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_byte_perm (    0, w7[3], selector);
      c7[2] = hc_byte_perm (w7[3], w7[2], selector);
      c7[1] = hc_byte_perm (w7[2], w7[1], selector);
      c7[0] = hc_byte_perm (w7[1], w7[0], selector);
      c6[3] = hc_byte_perm (w7[0], w6[3], selector);
      c6[2] = hc_byte_perm (w6[3], w6[2], selector);
      c6[1] = hc_byte_perm (w6[2], w6[1], selector);
      c6[0] = hc_byte_perm (w6[1], w6[0], selector);
      c5[3] = hc_byte_perm (w6[0], w5[3], selector);
      c5[2] = hc_byte_perm (w5[3], w5[2], selector);
      c5[1] = hc_byte_perm (w5[2], w5[1], selector);
      c5[0] = hc_byte_perm (w5[1], w5[0], selector);
      c4[3] = hc_byte_perm (w5[0], w4[3], selector);
      c4[2] = hc_byte_perm (w4[3], w4[2], selector);
      c4[1] = hc_byte_perm (w4[2], w4[1], selector);
      c4[0] = hc_byte_perm (w4[1], w4[0], selector);
      c3[3] = hc_byte_perm (w4[0], w3[3], selector);
      c3[2] = hc_byte_perm (w3[3], w3[2], selector);
      c3[1] = hc_byte_perm (w3[2], w3[1], selector);
      c3[0] = hc_byte_perm (w3[1], w3[0], selector);
      c2[3] = hc_byte_perm (w3[0], w2[3], selector);
      c2[2] = hc_byte_perm (w2[3], w2[2], selector);
      c2[1] = hc_byte_perm (w2[2], w2[1], selector);
      c2[0] = hc_byte_perm (w2[1], w2[0], selector);
      c1[3] = hc_byte_perm (w2[0], w1[3], selector);
      c1[2] = hc_byte_perm (w1[3], w1[2], selector);
      c1[1] = hc_byte_perm (w1[2], w1[1], selector);
      c1[0] = hc_byte_perm (w1[1], w1[0], selector);
      c0[3] = hc_byte_perm (w1[0], w0[3], selector);
      c0[2] = hc_byte_perm (w0[3], w0[2], selector);
      c0[1] = hc_byte_perm (w0[2], w0[1], selector);
      c0[0] = hc_byte_perm (w0[1], w0[0], selector);
      w7[3] = hc_byte_perm (w0[0],     0, selector);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC

  #pragma unroll
  for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]);

  switch (offset_switch)
  {
    case  0:
      w[63] = hc_bytealign (w[62], w[63], offset);
      w[62] = hc_bytealign (w[61], w[62], offset);
      w[61] = hc_bytealign (w[60], w[61], offset);
      w[60] = hc_bytealign (w[59], w[60], offset);
      w[59] = hc_bytealign (w[58], w[59], offset);
      w[58] = hc_bytealign (w[57], w[58], offset);
      w[57] = hc_bytealign (w[56], w[57], offset);
      w[56] = hc_bytealign (w[55], w[56], offset);
      w[55] = hc_bytealign (w[54], w[55], offset);
      w[54] = hc_bytealign (w[53], w[54], offset);
      w[53] = hc_bytealign (w[52], w[53], offset);
      w[52] = hc_bytealign (w[51], w[52], offset);
      w[51] = hc_bytealign (w[50], w[51], offset);
      w[50] = hc_bytealign (w[49], w[50], offset);
      w[49] = hc_bytealign (w[48], w[49], offset);
      w[48] = hc_bytealign (w[47], w[48], offset);
      w[47] = hc_bytealign (w[46], w[47], offset);
      w[46] = hc_bytealign (w[45], w[46], offset);
      w[45] = hc_bytealign (w[44], w[45], offset);
      w[44] = hc_bytealign (w[43], w[44], offset);
      w[43] = hc_bytealign (w[42], w[43], offset);
      w[42] = hc_bytealign (w[41], w[42], offset);
      w[41] = hc_bytealign (w[40], w[41], offset);
      w[40] = hc_bytealign (w[39], w[40], offset);
      w[39] = hc_bytealign (w[38], w[39], offset);
      w[38] = hc_bytealign (w[37], w[38], offset);
      w[37] = hc_bytealign (w[36], w[37], offset);
      w[36] = hc_bytealign (w[35], w[36], offset);
      w[35] = hc_bytealign (w[34], w[35], offset);
      w[34] = hc_bytealign (w[33], w[34], offset);
      w[33] = hc_bytealign (w[32], w[33], offset);
      w[32] = hc_bytealign (w[31], w[32], offset);
      w[31] = hc_bytealign (w[30], w[31], offset);
      w[30] = hc_bytealign (w[29], w[30], offset);
      w[29] = hc_bytealign (w[28], w[29], offset);
      w[28] = hc_bytealign (w[27], w[28], offset);
      w[27] = hc_bytealign (w[26], w[27], offset);
      w[26] = hc_bytealign (w[25], w[26], offset);
      w[25] = hc_bytealign (w[24], w[25], offset);
      w[24] = hc_bytealign (w[23], w[24], offset);
      w[23] = hc_bytealign (w[22], w[23], offset);
      w[22] = hc_bytealign (w[21], w[22], offset);
      w[21] = hc_bytealign (w[20], w[21], offset);
      w[20] = hc_bytealign (w[19], w[20], offset);
      w[19] = hc_bytealign (w[18], w[19], offset);
      w[18] = hc_bytealign (w[17], w[18], offset);
      w[17] = hc_bytealign (w[16], w[17], offset);
      w[16] = hc_bytealign (w[15], w[16], offset);
      w[15] = hc_bytealign (w[14], w[15], offset);
      w[14] = hc_bytealign (w[13], w[14], offset);
      w[13] = hc_bytealign (w[12], w[13], offset);
      w[12] = hc_bytealign (w[11], w[12], offset);
      w[11] = hc_bytealign (w[10], w[11], offset);
      w[10] = hc_bytealign (w[ 9], w[10], offset);
      w[ 9] = hc_bytealign (w[ 8], w[ 9], offset);
      w[ 8] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 7] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 6] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 5] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 4] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 3] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 2] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 1] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 0] = hc_bytealign (    0, w[ 0], offset);

      break;

    case  1:
      w[63] = hc_bytealign (w[61], w[62], offset);
      w[62] = hc_bytealign (w[60], w[61], offset);
      w[61] = hc_bytealign (w[59], w[60], offset);
      w[60] = hc_bytealign (w[58], w[59], offset);
      w[59] = hc_bytealign (w[57], w[58], offset);
      w[58] = hc_bytealign (w[56], w[57], offset);
      w[57] = hc_bytealign (w[55], w[56], offset);
      w[56] = hc_bytealign (w[54], w[55], offset);
      w[55] = hc_bytealign (w[53], w[54], offset);
      w[54] = hc_bytealign (w[52], w[53], offset);
      w[53] = hc_bytealign (w[51], w[52], offset);
      w[52] = hc_bytealign (w[50], w[51], offset);
      w[51] = hc_bytealign (w[49], w[50], offset);
      w[50] = hc_bytealign (w[48], w[49], offset);
      w[49] = hc_bytealign (w[47], w[48], offset);
      w[48] = hc_bytealign (w[46], w[47], offset);
      w[47] = hc_bytealign (w[45], w[46], offset);
      w[46] = hc_bytealign (w[44], w[45], offset);
      w[45] = hc_bytealign (w[43], w[44], offset);
      w[44] = hc_bytealign (w[42], w[43], offset);
      w[43] = hc_bytealign (w[41], w[42], offset);
      w[42] = hc_bytealign (w[40], w[41], offset);
      w[41] = hc_bytealign (w[39], w[40], offset);
      w[40] = hc_bytealign (w[38], w[39], offset);
      w[39] = hc_bytealign (w[37], w[38], offset);
      w[38] = hc_bytealign (w[36], w[37], offset);
      w[37] = hc_bytealign (w[35], w[36], offset);
      w[36] = hc_bytealign (w[34], w[35], offset);
      w[35] = hc_bytealign (w[33], w[34], offset);
      w[34] = hc_bytealign (w[32], w[33], offset);
      w[33] = hc_bytealign (w[31], w[32], offset);
      w[32] = hc_bytealign (w[30], w[31], offset);
      w[31] = hc_bytealign (w[29], w[30], offset);
      w[30] = hc_bytealign (w[28], w[29], offset);
      w[29] = hc_bytealign (w[27], w[28], offset);
      w[28] = hc_bytealign (w[26], w[27], offset);
      w[27] = hc_bytealign (w[25], w[26], offset);
      w[26] = hc_bytealign (w[24], w[25], offset);
      w[25] = hc_bytealign (w[23], w[24], offset);
      w[24] = hc_bytealign (w[22], w[23], offset);
      w[23] = hc_bytealign (w[21], w[22], offset);
      w[22] = hc_bytealign (w[20], w[21], offset);
      w[21] = hc_bytealign (w[19], w[20], offset);
      w[20] = hc_bytealign (w[18], w[19], offset);
      w[19] = hc_bytealign (w[17], w[18], offset);
      w[18] = hc_bytealign (w[16], w[17], offset);
      w[17] = hc_bytealign (w[15], w[16], offset);
      w[16] = hc_bytealign (w[14], w[15], offset);
      w[15] = hc_bytealign (w[13], w[14], offset);
      w[14] = hc_bytealign (w[12], w[13], offset);
      w[13] = hc_bytealign (w[11], w[12], offset);
      w[12] = hc_bytealign (w[10], w[11], offset);
      w[11] = hc_bytealign (w[ 9], w[10], offset);
      w[10] = hc_bytealign (w[ 8], w[ 9], offset);
      w[ 9] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 8] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 7] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 6] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 5] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 4] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 3] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 2] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 1] = hc_bytealign (    0, w[ 0], offset);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_bytealign (w[60], w[61], offset);
      w[62] = hc_bytealign (w[59], w[60], offset);
      w[61] = hc_bytealign (w[58], w[59], offset);
      w[60] = hc_bytealign (w[57], w[58], offset);
      w[59] = hc_bytealign (w[56], w[57], offset);
      w[58] = hc_bytealign (w[55], w[56], offset);
      w[57] = hc_bytealign (w[54], w[55], offset);
      w[56] = hc_bytealign (w[53], w[54], offset);
      w[55] = hc_bytealign (w[52], w[53], offset);
      w[54] = hc_bytealign (w[51], w[52], offset);
      w[53] = hc_bytealign (w[50], w[51], offset);
      w[52] = hc_bytealign (w[49], w[50], offset);
      w[51] = hc_bytealign (w[48], w[49], offset);
      w[50] = hc_bytealign (w[47], w[48], offset);
      w[49] = hc_bytealign (w[46], w[47], offset);
      w[48] = hc_bytealign (w[45], w[46], offset);
      w[47] = hc_bytealign (w[44], w[45], offset);
      w[46] = hc_bytealign (w[43], w[44], offset);
      w[45] = hc_bytealign (w[42], w[43], offset);
      w[44] = hc_bytealign (w[41], w[42], offset);
      w[43] = hc_bytealign (w[40], w[41], offset);
      w[42] = hc_bytealign (w[39], w[40], offset);
      w[41] = hc_bytealign (w[38], w[39], offset);
      w[40] = hc_bytealign (w[37], w[38], offset);
      w[39] = hc_bytealign (w[36], w[37], offset);
      w[38] = hc_bytealign (w[35], w[36], offset);
      w[37] = hc_bytealign (w[34], w[35], offset);
      w[36] = hc_bytealign (w[33], w[34], offset);
      w[35] = hc_bytealign (w[32], w[33], offset);
      w[34] = hc_bytealign (w[31], w[32], offset);
      w[33] = hc_bytealign (w[30], w[31], offset);
      w[32] = hc_bytealign (w[29], w[30], offset);
      w[31] = hc_bytealign (w[28], w[29], offset);
      w[30] = hc_bytealign (w[27], w[28], offset);
      w[29] = hc_bytealign (w[26], w[27], offset);
      w[28] = hc_bytealign (w[25], w[26], offset);
      w[27] = hc_bytealign (w[24], w[25], offset);
      w[26] = hc_bytealign (w[23], w[24], offset);
      w[25] = hc_bytealign (w[22], w[23], offset);
      w[24] = hc_bytealign (w[21], w[22], offset);
      w[23] = hc_bytealign (w[20], w[21], offset);
      w[22] = hc_bytealign (w[19], w[20], offset);
      w[21] = hc_bytealign (w[18], w[19], offset);
      w[20] = hc_bytealign (w[17], w[18], offset);
      w[19] = hc_bytealign (w[16], w[17], offset);
      w[18] = hc_bytealign (w[15], w[16], offset);
      w[17] = hc_bytealign (w[14], w[15], offset);
      w[16] = hc_bytealign (w[13], w[14], offset);
      w[15] = hc_bytealign (w[12], w[13], offset);
      w[14] = hc_bytealign (w[11], w[12], offset);
      w[13] = hc_bytealign (w[10], w[11], offset);
      w[12] = hc_bytealign (w[ 9], w[10], offset);
      w[11] = hc_bytealign (w[ 8], w[ 9], offset);
      w[10] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 9] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 8] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 7] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 6] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 5] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 4] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 3] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 2] = hc_bytealign (    0, w[ 0], offset);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_bytealign (w[59], w[60], offset);
      w[62] = hc_bytealign (w[58], w[59], offset);
      w[61] = hc_bytealign (w[57], w[58], offset);
      w[60] = hc_bytealign (w[56], w[57], offset);
      w[59] = hc_bytealign (w[55], w[56], offset);
      w[58] = hc_bytealign (w[54], w[55], offset);
      w[57] = hc_bytealign (w[53], w[54], offset);
      w[56] = hc_bytealign (w[52], w[53], offset);
      w[55] = hc_bytealign (w[51], w[52], offset);
      w[54] = hc_bytealign (w[50], w[51], offset);
      w[53] = hc_bytealign (w[49], w[50], offset);
      w[52] = hc_bytealign (w[48], w[49], offset);
      w[51] = hc_bytealign (w[47], w[48], offset);
      w[50] = hc_bytealign (w[46], w[47], offset);
      w[49] = hc_bytealign (w[45], w[46], offset);
      w[48] = hc_bytealign (w[44], w[45], offset);
      w[47] = hc_bytealign (w[43], w[44], offset);
      w[46] = hc_bytealign (w[42], w[43], offset);
      w[45] = hc_bytealign (w[41], w[42], offset);
      w[44] = hc_bytealign (w[40], w[41], offset);
      w[43] = hc_bytealign (w[39], w[40], offset);
      w[42] = hc_bytealign (w[38], w[39], offset);
      w[41] = hc_bytealign (w[37], w[38], offset);
      w[40] = hc_bytealign (w[36], w[37], offset);
      w[39] = hc_bytealign (w[35], w[36], offset);
      w[38] = hc_bytealign (w[34], w[35], offset);
      w[37] = hc_bytealign (w[33], w[34], offset);
      w[36] = hc_bytealign (w[32], w[33], offset);
      w[35] = hc_bytealign (w[31], w[32], offset);
      w[34] = hc_bytealign (w[30], w[31], offset);
      w[33] = hc_bytealign (w[29], w[30], offset);
      w[32] = hc_bytealign (w[28], w[29], offset);
      w[31] = hc_bytealign (w[27], w[28], offset);
      w[30] = hc_bytealign (w[26], w[27], offset);
      w[29] = hc_bytealign (w[25], w[26], offset);
      w[28] = hc_bytealign (w[24], w[25], offset);
      w[27] = hc_bytealign (w[23], w[24], offset);
      w[26] = hc_bytealign (w[22], w[23], offset);
      w[25] = hc_bytealign (w[21], w[22], offset);
      w[24] = hc_bytealign (w[20], w[21], offset);
      w[23] = hc_bytealign (w[19], w[20], offset);
      w[22] = hc_bytealign (w[18], w[19], offset);
      w[21] = hc_bytealign (w[17], w[18], offset);
      w[20] = hc_bytealign (w[16], w[17], offset);
      w[19] = hc_bytealign (w[15], w[16], offset);
      w[18] = hc_bytealign (w[14], w[15], offset);
      w[17] = hc_bytealign (w[13], w[14], offset);
      w[16] = hc_bytealign (w[12], w[13], offset);
      w[15] = hc_bytealign (w[11], w[12], offset);
      w[14] = hc_bytealign (w[10], w[11], offset);
      w[13] = hc_bytealign (w[ 9], w[10], offset);
      w[12] = hc_bytealign (w[ 8], w[ 9], offset);
      w[11] = hc_bytealign (w[ 7], w[ 8], offset);
      w[10] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 9] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 8] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 7] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 6] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 5] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 4] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 3] = hc_bytealign (    0, w[ 0], offset);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_bytealign (w[58], w[59], offset);
      w[62] = hc_bytealign (w[57], w[58], offset);
      w[61] = hc_bytealign (w[56], w[57], offset);
      w[60] = hc_bytealign (w[55], w[56], offset);
      w[59] = hc_bytealign (w[54], w[55], offset);
      w[58] = hc_bytealign (w[53], w[54], offset);
      w[57] = hc_bytealign (w[52], w[53], offset);
      w[56] = hc_bytealign (w[51], w[52], offset);
      w[55] = hc_bytealign (w[50], w[51], offset);
      w[54] = hc_bytealign (w[49], w[50], offset);
      w[53] = hc_bytealign (w[48], w[49], offset);
      w[52] = hc_bytealign (w[47], w[48], offset);
      w[51] = hc_bytealign (w[46], w[47], offset);
      w[50] = hc_bytealign (w[45], w[46], offset);
      w[49] = hc_bytealign (w[44], w[45], offset);
      w[48] = hc_bytealign (w[43], w[44], offset);
      w[47] = hc_bytealign (w[42], w[43], offset);
      w[46] = hc_bytealign (w[41], w[42], offset);
      w[45] = hc_bytealign (w[40], w[41], offset);
      w[44] = hc_bytealign (w[39], w[40], offset);
      w[43] = hc_bytealign (w[38], w[39], offset);
      w[42] = hc_bytealign (w[37], w[38], offset);
      w[41] = hc_bytealign (w[36], w[37], offset);
      w[40] = hc_bytealign (w[35], w[36], offset);
      w[39] = hc_bytealign (w[34], w[35], offset);
      w[38] = hc_bytealign (w[33], w[34], offset);
      w[37] = hc_bytealign (w[32], w[33], offset);
      w[36] = hc_bytealign (w[31], w[32], offset);
      w[35] = hc_bytealign (w[30], w[31], offset);
      w[34] = hc_bytealign (w[29], w[30], offset);
      w[33] = hc_bytealign (w[28], w[29], offset);
      w[32] = hc_bytealign (w[27], w[28], offset);
      w[31] = hc_bytealign (w[26], w[27], offset);
      w[30] = hc_bytealign (w[25], w[26], offset);
      w[29] = hc_bytealign (w[24], w[25], offset);
      w[28] = hc_bytealign (w[23], w[24], offset);
      w[27] = hc_bytealign (w[22], w[23], offset);
      w[26] = hc_bytealign (w[21], w[22], offset);
      w[25] = hc_bytealign (w[20], w[21], offset);
      w[24] = hc_bytealign (w[19], w[20], offset);
      w[23] = hc_bytealign (w[18], w[19], offset);
      w[22] = hc_bytealign (w[17], w[18], offset);
      w[21] = hc_bytealign (w[16], w[17], offset);
      w[20] = hc_bytealign (w[15], w[16], offset);
      w[19] = hc_bytealign (w[14], w[15], offset);
      w[18] = hc_bytealign (w[13], w[14], offset);
      w[17] = hc_bytealign (w[12], w[13], offset);
      w[16] = hc_bytealign (w[11], w[12], offset);
      w[15] = hc_bytealign (w[10], w[11], offset);
      w[14] = hc_bytealign (w[ 9], w[10], offset);
      w[13] = hc_bytealign (w[ 8], w[ 9], offset);
      w[12] = hc_bytealign (w[ 7], w[ 8], offset);
      w[11] = hc_bytealign (w[ 6], w[ 7], offset);
      w[10] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 9] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 8] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 7] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 6] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 5] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 4] = hc_bytealign (    0, w[ 0], offset);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_bytealign (w[57], w[58], offset);
      w[62] = hc_bytealign (w[56], w[57], offset);
      w[61] = hc_bytealign (w[55], w[56], offset);
      w[60] = hc_bytealign (w[54], w[55], offset);
      w[59] = hc_bytealign (w[53], w[54], offset);
      w[58] = hc_bytealign (w[52], w[53], offset);
      w[57] = hc_bytealign (w[51], w[52], offset);
      w[56] = hc_bytealign (w[50], w[51], offset);
      w[55] = hc_bytealign (w[49], w[50], offset);
      w[54] = hc_bytealign (w[48], w[49], offset);
      w[53] = hc_bytealign (w[47], w[48], offset);
      w[52] = hc_bytealign (w[46], w[47], offset);
      w[51] = hc_bytealign (w[45], w[46], offset);
      w[50] = hc_bytealign (w[44], w[45], offset);
      w[49] = hc_bytealign (w[43], w[44], offset);
      w[48] = hc_bytealign (w[42], w[43], offset);
      w[47] = hc_bytealign (w[41], w[42], offset);
      w[46] = hc_bytealign (w[40], w[41], offset);
      w[45] = hc_bytealign (w[39], w[40], offset);
      w[44] = hc_bytealign (w[38], w[39], offset);
      w[43] = hc_bytealign (w[37], w[38], offset);
      w[42] = hc_bytealign (w[36], w[37], offset);
      w[41] = hc_bytealign (w[35], w[36], offset);
      w[40] = hc_bytealign (w[34], w[35], offset);
      w[39] = hc_bytealign (w[33], w[34], offset);
      w[38] = hc_bytealign (w[32], w[33], offset);
      w[37] = hc_bytealign (w[31], w[32], offset);
      w[36] = hc_bytealign (w[30], w[31], offset);
      w[35] = hc_bytealign (w[29], w[30], offset);
      w[34] = hc_bytealign (w[28], w[29], offset);
      w[33] = hc_bytealign (w[27], w[28], offset);
      w[32] = hc_bytealign (w[26], w[27], offset);
      w[31] = hc_bytealign (w[25], w[26], offset);
      w[30] = hc_bytealign (w[24], w[25], offset);
      w[29] = hc_bytealign (w[23], w[24], offset);
      w[28] = hc_bytealign (w[22], w[23], offset);
      w[27] = hc_bytealign (w[21], w[22], offset);
      w[26] = hc_bytealign (w[20], w[21], offset);
      w[25] = hc_bytealign (w[19], w[20], offset);
      w[24] = hc_bytealign (w[18], w[19], offset);
      w[23] = hc_bytealign (w[17], w[18], offset);
      w[22] = hc_bytealign (w[16], w[17], offset);
      w[21] = hc_bytealign (w[15], w[16], offset);
      w[20] = hc_bytealign (w[14], w[15], offset);
      w[19] = hc_bytealign (w[13], w[14], offset);
      w[18] = hc_bytealign (w[12], w[13], offset);
      w[17] = hc_bytealign (w[11], w[12], offset);
      w[16] = hc_bytealign (w[10], w[11], offset);
      w[15] = hc_bytealign (w[ 9], w[10], offset);
      w[14] = hc_bytealign (w[ 8], w[ 9], offset);
      w[13] = hc_bytealign (w[ 7], w[ 8], offset);
      w[12] = hc_bytealign (w[ 6], w[ 7], offset);
      w[11] = hc_bytealign (w[ 5], w[ 6], offset);
      w[10] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 9] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 8] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 7] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 6] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 5] = hc_bytealign (    0, w[ 0], offset);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_bytealign (w[56], w[57], offset);
      w[62] = hc_bytealign (w[55], w[56], offset);
      w[61] = hc_bytealign (w[54], w[55], offset);
      w[60] = hc_bytealign (w[53], w[54], offset);
      w[59] = hc_bytealign (w[52], w[53], offset);
      w[58] = hc_bytealign (w[51], w[52], offset);
      w[57] = hc_bytealign (w[50], w[51], offset);
      w[56] = hc_bytealign (w[49], w[50], offset);
      w[55] = hc_bytealign (w[48], w[49], offset);
      w[54] = hc_bytealign (w[47], w[48], offset);
      w[53] = hc_bytealign (w[46], w[47], offset);
      w[52] = hc_bytealign (w[45], w[46], offset);
      w[51] = hc_bytealign (w[44], w[45], offset);
      w[50] = hc_bytealign (w[43], w[44], offset);
      w[49] = hc_bytealign (w[42], w[43], offset);
      w[48] = hc_bytealign (w[41], w[42], offset);
      w[47] = hc_bytealign (w[40], w[41], offset);
      w[46] = hc_bytealign (w[39], w[40], offset);
      w[45] = hc_bytealign (w[38], w[39], offset);
      w[44] = hc_bytealign (w[37], w[38], offset);
      w[43] = hc_bytealign (w[36], w[37], offset);
      w[42] = hc_bytealign (w[35], w[36], offset);
      w[41] = hc_bytealign (w[34], w[35], offset);
      w[40] = hc_bytealign (w[33], w[34], offset);
      w[39] = hc_bytealign (w[32], w[33], offset);
      w[38] = hc_bytealign (w[31], w[32], offset);
      w[37] = hc_bytealign (w[30], w[31], offset);
      w[36] = hc_bytealign (w[29], w[30], offset);
      w[35] = hc_bytealign (w[28], w[29], offset);
      w[34] = hc_bytealign (w[27], w[28], offset);
      w[33] = hc_bytealign (w[26], w[27], offset);
      w[32] = hc_bytealign (w[25], w[26], offset);
      w[31] = hc_bytealign (w[24], w[25], offset);
      w[30] = hc_bytealign (w[23], w[24], offset);
      w[29] = hc_bytealign (w[22], w[23], offset);
      w[28] = hc_bytealign (w[21], w[22], offset);
      w[27] = hc_bytealign (w[20], w[21], offset);
      w[26] = hc_bytealign (w[19], w[20], offset);
      w[25] = hc_bytealign (w[18], w[19], offset);
      w[24] = hc_bytealign (w[17], w[18], offset);
      w[23] = hc_bytealign (w[16], w[17], offset);
      w[22] = hc_bytealign (w[15], w[16], offset);
      w[21] = hc_bytealign (w[14], w[15], offset);
      w[20] = hc_bytealign (w[13], w[14], offset);
      w[19] = hc_bytealign (w[12], w[13], offset);
      w[18] = hc_bytealign (w[11], w[12], offset);
      w[17] = hc_bytealign (w[10], w[11], offset);
      w[16] = hc_bytealign (w[ 9], w[10], offset);
      w[15] = hc_bytealign (w[ 8], w[ 9], offset);
      w[14] = hc_bytealign (w[ 7], w[ 8], offset);
      w[13] = hc_bytealign (w[ 6], w[ 7], offset);
      w[12] = hc_bytealign (w[ 5], w[ 6], offset);
      w[11] = hc_bytealign (w[ 4], w[ 5], offset);
      w[10] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 9] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 8] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 7] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 6] = hc_bytealign (    0, w[ 0], offset);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_bytealign (w[55], w[56], offset);
      w[62] = hc_bytealign (w[54], w[55], offset);
      w[61] = hc_bytealign (w[53], w[54], offset);
      w[60] = hc_bytealign (w[52], w[53], offset);
      w[59] = hc_bytealign (w[51], w[52], offset);
      w[58] = hc_bytealign (w[50], w[51], offset);
      w[57] = hc_bytealign (w[49], w[50], offset);
      w[56] = hc_bytealign (w[48], w[49], offset);
      w[55] = hc_bytealign (w[47], w[48], offset);
      w[54] = hc_bytealign (w[46], w[47], offset);
      w[53] = hc_bytealign (w[45], w[46], offset);
      w[52] = hc_bytealign (w[44], w[45], offset);
      w[51] = hc_bytealign (w[43], w[44], offset);
      w[50] = hc_bytealign (w[42], w[43], offset);
      w[49] = hc_bytealign (w[41], w[42], offset);
      w[48] = hc_bytealign (w[40], w[41], offset);
      w[47] = hc_bytealign (w[39], w[40], offset);
      w[46] = hc_bytealign (w[38], w[39], offset);
      w[45] = hc_bytealign (w[37], w[38], offset);
      w[44] = hc_bytealign (w[36], w[37], offset);
      w[43] = hc_bytealign (w[35], w[36], offset);
      w[42] = hc_bytealign (w[34], w[35], offset);
      w[41] = hc_bytealign (w[33], w[34], offset);
      w[40] = hc_bytealign (w[32], w[33], offset);
      w[39] = hc_bytealign (w[31], w[32], offset);
      w[38] = hc_bytealign (w[30], w[31], offset);
      w[37] = hc_bytealign (w[29], w[30], offset);
      w[36] = hc_bytealign (w[28], w[29], offset);
      w[35] = hc_bytealign (w[27], w[28], offset);
      w[34] = hc_bytealign (w[26], w[27], offset);
      w[33] = hc_bytealign (w[25], w[26], offset);
      w[32] = hc_bytealign (w[24], w[25], offset);
      w[31] = hc_bytealign (w[23], w[24], offset);
      w[30] = hc_bytealign (w[22], w[23], offset);
      w[29] = hc_bytealign (w[21], w[22], offset);
      w[28] = hc_bytealign (w[20], w[21], offset);
      w[27] = hc_bytealign (w[19], w[20], offset);
      w[26] = hc_bytealign (w[18], w[19], offset);
      w[25] = hc_bytealign (w[17], w[18], offset);
      w[24] = hc_bytealign (w[16], w[17], offset);
      w[23] = hc_bytealign (w[15], w[16], offset);
      w[22] = hc_bytealign (w[14], w[15], offset);
      w[21] = hc_bytealign (w[13], w[14], offset);
      w[20] = hc_bytealign (w[12], w[13], offset);
      w[19] = hc_bytealign (w[11], w[12], offset);
      w[18] = hc_bytealign (w[10], w[11], offset);
      w[17] = hc_bytealign (w[ 9], w[10], offset);
      w[16] = hc_bytealign (w[ 8], w[ 9], offset);
      w[15] = hc_bytealign (w[ 7], w[ 8], offset);
      w[14] = hc_bytealign (w[ 6], w[ 7], offset);
      w[13] = hc_bytealign (w[ 5], w[ 6], offset);
      w[12] = hc_bytealign (w[ 4], w[ 5], offset);
      w[11] = hc_bytealign (w[ 3], w[ 4], offset);
      w[10] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 9] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 8] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 7] = hc_bytealign (    0, w[ 0], offset);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_bytealign (w[54], w[55], offset);
      w[62] = hc_bytealign (w[53], w[54], offset);
      w[61] = hc_bytealign (w[52], w[53], offset);
      w[60] = hc_bytealign (w[51], w[52], offset);
      w[59] = hc_bytealign (w[50], w[51], offset);
      w[58] = hc_bytealign (w[49], w[50], offset);
      w[57] = hc_bytealign (w[48], w[49], offset);
      w[56] = hc_bytealign (w[47], w[48], offset);
      w[55] = hc_bytealign (w[46], w[47], offset);
      w[54] = hc_bytealign (w[45], w[46], offset);
      w[53] = hc_bytealign (w[44], w[45], offset);
      w[52] = hc_bytealign (w[43], w[44], offset);
      w[51] = hc_bytealign (w[42], w[43], offset);
      w[50] = hc_bytealign (w[41], w[42], offset);
      w[49] = hc_bytealign (w[40], w[41], offset);
      w[48] = hc_bytealign (w[39], w[40], offset);
      w[47] = hc_bytealign (w[38], w[39], offset);
      w[46] = hc_bytealign (w[37], w[38], offset);
      w[45] = hc_bytealign (w[36], w[37], offset);
      w[44] = hc_bytealign (w[35], w[36], offset);
      w[43] = hc_bytealign (w[34], w[35], offset);
      w[42] = hc_bytealign (w[33], w[34], offset);
      w[41] = hc_bytealign (w[32], w[33], offset);
      w[40] = hc_bytealign (w[31], w[32], offset);
      w[39] = hc_bytealign (w[30], w[31], offset);
      w[38] = hc_bytealign (w[29], w[30], offset);
      w[37] = hc_bytealign (w[28], w[29], offset);
      w[36] = hc_bytealign (w[27], w[28], offset);
      w[35] = hc_bytealign (w[26], w[27], offset);
      w[34] = hc_bytealign (w[25], w[26], offset);
      w[33] = hc_bytealign (w[24], w[25], offset);
      w[32] = hc_bytealign (w[23], w[24], offset);
      w[31] = hc_bytealign (w[22], w[23], offset);
      w[30] = hc_bytealign (w[21], w[22], offset);
      w[29] = hc_bytealign (w[20], w[21], offset);
      w[28] = hc_bytealign (w[19], w[20], offset);
      w[27] = hc_bytealign (w[18], w[19], offset);
      w[26] = hc_bytealign (w[17], w[18], offset);
      w[25] = hc_bytealign (w[16], w[17], offset);
      w[24] = hc_bytealign (w[15], w[16], offset);
      w[23] = hc_bytealign (w[14], w[15], offset);
      w[22] = hc_bytealign (w[13], w[14], offset);
      w[21] = hc_bytealign (w[12], w[13], offset);
      w[20] = hc_bytealign (w[11], w[12], offset);
      w[19] = hc_bytealign (w[10], w[11], offset);
      w[18] = hc_bytealign (w[ 9], w[10], offset);
      w[17] = hc_bytealign (w[ 8], w[ 9], offset);
      w[16] = hc_bytealign (w[ 7], w[ 8], offset);
      w[15] = hc_bytealign (w[ 6], w[ 7], offset);
      w[14] = hc_bytealign (w[ 5], w[ 6], offset);
      w[13] = hc_bytealign (w[ 4], w[ 5], offset);
      w[12] = hc_bytealign (w[ 3], w[ 4], offset);
      w[11] = hc_bytealign (w[ 2], w[ 3], offset);
      w[10] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 9] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 8] = hc_bytealign (    0, w[ 0], offset);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_bytealign (w[53], w[54], offset);
      w[62] = hc_bytealign (w[52], w[53], offset);
      w[61] = hc_bytealign (w[51], w[52], offset);
      w[60] = hc_bytealign (w[50], w[51], offset);
      w[59] = hc_bytealign (w[49], w[50], offset);
      w[58] = hc_bytealign (w[48], w[49], offset);
      w[57] = hc_bytealign (w[47], w[48], offset);
      w[56] = hc_bytealign (w[46], w[47], offset);
      w[55] = hc_bytealign (w[45], w[46], offset);
      w[54] = hc_bytealign (w[44], w[45], offset);
      w[53] = hc_bytealign (w[43], w[44], offset);
      w[52] = hc_bytealign (w[42], w[43], offset);
      w[51] = hc_bytealign (w[41], w[42], offset);
      w[50] = hc_bytealign (w[40], w[41], offset);
      w[49] = hc_bytealign (w[39], w[40], offset);
      w[48] = hc_bytealign (w[38], w[39], offset);
      w[47] = hc_bytealign (w[37], w[38], offset);
      w[46] = hc_bytealign (w[36], w[37], offset);
      w[45] = hc_bytealign (w[35], w[36], offset);
      w[44] = hc_bytealign (w[34], w[35], offset);
      w[43] = hc_bytealign (w[33], w[34], offset);
      w[42] = hc_bytealign (w[32], w[33], offset);
      w[41] = hc_bytealign (w[31], w[32], offset);
      w[40] = hc_bytealign (w[30], w[31], offset);
      w[39] = hc_bytealign (w[29], w[30], offset);
      w[38] = hc_bytealign (w[28], w[29], offset);
      w[37] = hc_bytealign (w[27], w[28], offset);
      w[36] = hc_bytealign (w[26], w[27], offset);
      w[35] = hc_bytealign (w[25], w[26], offset);
      w[34] = hc_bytealign (w[24], w[25], offset);
      w[33] = hc_bytealign (w[23], w[24], offset);
      w[32] = hc_bytealign (w[22], w[23], offset);
      w[31] = hc_bytealign (w[21], w[22], offset);
      w[30] = hc_bytealign (w[20], w[21], offset);
      w[29] = hc_bytealign (w[19], w[20], offset);
      w[28] = hc_bytealign (w[18], w[19], offset);
      w[27] = hc_bytealign (w[17], w[18], offset);
      w[26] = hc_bytealign (w[16], w[17], offset);
      w[25] = hc_bytealign (w[15], w[16], offset);
      w[24] = hc_bytealign (w[14], w[15], offset);
      w[23] = hc_bytealign (w[13], w[14], offset);
      w[22] = hc_bytealign (w[12], w[13], offset);
      w[21] = hc_bytealign (w[11], w[12], offset);
      w[20] = hc_bytealign (w[10], w[11], offset);
      w[19] = hc_bytealign (w[ 9], w[10], offset);
      w[18] = hc_bytealign (w[ 8], w[ 9], offset);
      w[17] = hc_bytealign (w[ 7], w[ 8], offset);
      w[16] = hc_bytealign (w[ 6], w[ 7], offset);
      w[15] = hc_bytealign (w[ 5], w[ 6], offset);
      w[14] = hc_bytealign (w[ 4], w[ 5], offset);
      w[13] = hc_bytealign (w[ 3], w[ 4], offset);
      w[12] = hc_bytealign (w[ 2], w[ 3], offset);
      w[11] = hc_bytealign (w[ 1], w[ 2], offset);
      w[10] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 9] = hc_bytealign (    0, w[ 0], offset);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_bytealign (w[52], w[53], offset);
      w[62] = hc_bytealign (w[51], w[52], offset);
      w[61] = hc_bytealign (w[50], w[51], offset);
      w[60] = hc_bytealign (w[49], w[50], offset);
      w[59] = hc_bytealign (w[48], w[49], offset);
      w[58] = hc_bytealign (w[47], w[48], offset);
      w[57] = hc_bytealign (w[46], w[47], offset);
      w[56] = hc_bytealign (w[45], w[46], offset);
      w[55] = hc_bytealign (w[44], w[45], offset);
      w[54] = hc_bytealign (w[43], w[44], offset);
      w[53] = hc_bytealign (w[42], w[43], offset);
      w[52] = hc_bytealign (w[41], w[42], offset);
      w[51] = hc_bytealign (w[40], w[41], offset);
      w[50] = hc_bytealign (w[39], w[40], offset);
      w[49] = hc_bytealign (w[38], w[39], offset);
      w[48] = hc_bytealign (w[37], w[38], offset);
      w[47] = hc_bytealign (w[36], w[37], offset);
      w[46] = hc_bytealign (w[35], w[36], offset);
      w[45] = hc_bytealign (w[34], w[35], offset);
      w[44] = hc_bytealign (w[33], w[34], offset);
      w[43] = hc_bytealign (w[32], w[33], offset);
      w[42] = hc_bytealign (w[31], w[32], offset);
      w[41] = hc_bytealign (w[30], w[31], offset);
      w[40] = hc_bytealign (w[29], w[30], offset);
      w[39] = hc_bytealign (w[28], w[29], offset);
      w[38] = hc_bytealign (w[27], w[28], offset);
      w[37] = hc_bytealign (w[26], w[27], offset);
      w[36] = hc_bytealign (w[25], w[26], offset);
      w[35] = hc_bytealign (w[24], w[25], offset);
      w[34] = hc_bytealign (w[23], w[24], offset);
      w[33] = hc_bytealign (w[22], w[23], offset);
      w[32] = hc_bytealign (w[21], w[22], offset);
      w[31] = hc_bytealign (w[20], w[21], offset);
      w[30] = hc_bytealign (w[19], w[20], offset);
      w[29] = hc_bytealign (w[18], w[19], offset);
      w[28] = hc_bytealign (w[17], w[18], offset);
      w[27] = hc_bytealign (w[16], w[17], offset);
      w[26] = hc_bytealign (w[15], w[16], offset);
      w[25] = hc_bytealign (w[14], w[15], offset);
      w[24] = hc_bytealign (w[13], w[14], offset);
      w[23] = hc_bytealign (w[12], w[13], offset);
      w[22] = hc_bytealign (w[11], w[12], offset);
      w[21] = hc_bytealign (w[10], w[11], offset);
      w[20] = hc_bytealign (w[ 9], w[10], offset);
      w[19] = hc_bytealign (w[ 8], w[ 9], offset);
      w[18] = hc_bytealign (w[ 7], w[ 8], offset);
      w[17] = hc_bytealign (w[ 6], w[ 7], offset);
      w[16] = hc_bytealign (w[ 5], w[ 6], offset);
      w[15] = hc_bytealign (w[ 4], w[ 5], offset);
      w[14] = hc_bytealign (w[ 3], w[ 4], offset);
      w[13] = hc_bytealign (w[ 2], w[ 3], offset);
      w[12] = hc_bytealign (w[ 1], w[ 2], offset);
      w[11] = hc_bytealign (w[ 0], w[ 1], offset);
      w[10] = hc_bytealign (    0, w[ 0], offset);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_bytealign (w[51], w[52], offset);
      w[62] = hc_bytealign (w[50], w[51], offset);
      w[61] = hc_bytealign (w[49], w[50], offset);
      w[60] = hc_bytealign (w[48], w[49], offset);
      w[59] = hc_bytealign (w[47], w[48], offset);
      w[58] = hc_bytealign (w[46], w[47], offset);
      w[57] = hc_bytealign (w[45], w[46], offset);
      w[56] = hc_bytealign (w[44], w[45], offset);
      w[55] = hc_bytealign (w[43], w[44], offset);
      w[54] = hc_bytealign (w[42], w[43], offset);
      w[53] = hc_bytealign (w[41], w[42], offset);
      w[52] = hc_bytealign (w[40], w[41], offset);
      w[51] = hc_bytealign (w[39], w[40], offset);
      w[50] = hc_bytealign (w[38], w[39], offset);
      w[49] = hc_bytealign (w[37], w[38], offset);
      w[48] = hc_bytealign (w[36], w[37], offset);
      w[47] = hc_bytealign (w[35], w[36], offset);
      w[46] = hc_bytealign (w[34], w[35], offset);
      w[45] = hc_bytealign (w[33], w[34], offset);
      w[44] = hc_bytealign (w[32], w[33], offset);
      w[43] = hc_bytealign (w[31], w[32], offset);
      w[42] = hc_bytealign (w[30], w[31], offset);
      w[41] = hc_bytealign (w[29], w[30], offset);
      w[40] = hc_bytealign (w[28], w[29], offset);
      w[39] = hc_bytealign (w[27], w[28], offset);
      w[38] = hc_bytealign (w[26], w[27], offset);
      w[37] = hc_bytealign (w[25], w[26], offset);
      w[36] = hc_bytealign (w[24], w[25], offset);
      w[35] = hc_bytealign (w[23], w[24], offset);
      w[34] = hc_bytealign (w[22], w[23], offset);
      w[33] = hc_bytealign (w[21], w[22], offset);
      w[32] = hc_bytealign (w[20], w[21], offset);
      w[31] = hc_bytealign (w[19], w[20], offset);
      w[30] = hc_bytealign (w[18], w[19], offset);
      w[29] = hc_bytealign (w[17], w[18], offset);
      w[28] = hc_bytealign (w[16], w[17], offset);
      w[27] = hc_bytealign (w[15], w[16], offset);
      w[26] = hc_bytealign (w[14], w[15], offset);
      w[25] = hc_bytealign (w[13], w[14], offset);
      w[24] = hc_bytealign (w[12], w[13], offset);
      w[23] = hc_bytealign (w[11], w[12], offset);
      w[22] = hc_bytealign (w[10], w[11], offset);
      w[21] = hc_bytealign (w[ 9], w[10], offset);
      w[20] = hc_bytealign (w[ 8], w[ 9], offset);
      w[19] = hc_bytealign (w[ 7], w[ 8], offset);
      w[18] = hc_bytealign (w[ 6], w[ 7], offset);
      w[17] = hc_bytealign (w[ 5], w[ 6], offset);
      w[16] = hc_bytealign (w[ 4], w[ 5], offset);
      w[15] = hc_bytealign (w[ 3], w[ 4], offset);
      w[14] = hc_bytealign (w[ 2], w[ 3], offset);
      w[13] = hc_bytealign (w[ 1], w[ 2], offset);
      w[12] = hc_bytealign (w[ 0], w[ 1], offset);
      w[11] = hc_bytealign (    0, w[ 0], offset);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_bytealign (w[50], w[51], offset);
      w[62] = hc_bytealign (w[49], w[50], offset);
      w[61] = hc_bytealign (w[48], w[49], offset);
      w[60] = hc_bytealign (w[47], w[48], offset);
      w[59] = hc_bytealign (w[46], w[47], offset);
      w[58] = hc_bytealign (w[45], w[46], offset);
      w[57] = hc_bytealign (w[44], w[45], offset);
      w[56] = hc_bytealign (w[43], w[44], offset);
      w[55] = hc_bytealign (w[42], w[43], offset);
      w[54] = hc_bytealign (w[41], w[42], offset);
      w[53] = hc_bytealign (w[40], w[41], offset);
      w[52] = hc_bytealign (w[39], w[40], offset);
      w[51] = hc_bytealign (w[38], w[39], offset);
      w[50] = hc_bytealign (w[37], w[38], offset);
      w[49] = hc_bytealign (w[36], w[37], offset);
      w[48] = hc_bytealign (w[35], w[36], offset);
      w[47] = hc_bytealign (w[34], w[35], offset);
      w[46] = hc_bytealign (w[33], w[34], offset);
      w[45] = hc_bytealign (w[32], w[33], offset);
      w[44] = hc_bytealign (w[31], w[32], offset);
      w[43] = hc_bytealign (w[30], w[31], offset);
      w[42] = hc_bytealign (w[29], w[30], offset);
      w[41] = hc_bytealign (w[28], w[29], offset);
      w[40] = hc_bytealign (w[27], w[28], offset);
      w[39] = hc_bytealign (w[26], w[27], offset);
      w[38] = hc_bytealign (w[25], w[26], offset);
      w[37] = hc_bytealign (w[24], w[25], offset);
      w[36] = hc_bytealign (w[23], w[24], offset);
      w[35] = hc_bytealign (w[22], w[23], offset);
      w[34] = hc_bytealign (w[21], w[22], offset);
      w[33] = hc_bytealign (w[20], w[21], offset);
      w[32] = hc_bytealign (w[19], w[20], offset);
      w[31] = hc_bytealign (w[18], w[19], offset);
      w[30] = hc_bytealign (w[17], w[18], offset);
      w[29] = hc_bytealign (w[16], w[17], offset);
      w[28] = hc_bytealign (w[15], w[16], offset);
      w[27] = hc_bytealign (w[14], w[15], offset);
      w[26] = hc_bytealign (w[13], w[14], offset);
      w[25] = hc_bytealign (w[12], w[13], offset);
      w[24] = hc_bytealign (w[11], w[12], offset);
      w[23] = hc_bytealign (w[10], w[11], offset);
      w[22] = hc_bytealign (w[ 9], w[10], offset);
      w[21] = hc_bytealign (w[ 8], w[ 9], offset);
      w[20] = hc_bytealign (w[ 7], w[ 8], offset);
      w[19] = hc_bytealign (w[ 6], w[ 7], offset);
      w[18] = hc_bytealign (w[ 5], w[ 6], offset);
      w[17] = hc_bytealign (w[ 4], w[ 5], offset);
      w[16] = hc_bytealign (w[ 3], w[ 4], offset);
      w[15] = hc_bytealign (w[ 2], w[ 3], offset);
      w[14] = hc_bytealign (w[ 1], w[ 2], offset);
      w[13] = hc_bytealign (w[ 0], w[ 1], offset);
      w[12] = hc_bytealign (    0, w[ 0], offset);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_bytealign (w[49], w[50], offset);
      w[62] = hc_bytealign (w[48], w[49], offset);
      w[61] = hc_bytealign (w[47], w[48], offset);
      w[60] = hc_bytealign (w[46], w[47], offset);
      w[59] = hc_bytealign (w[45], w[46], offset);
      w[58] = hc_bytealign (w[44], w[45], offset);
      w[57] = hc_bytealign (w[43], w[44], offset);
      w[56] = hc_bytealign (w[42], w[43], offset);
      w[55] = hc_bytealign (w[41], w[42], offset);
      w[54] = hc_bytealign (w[40], w[41], offset);
      w[53] = hc_bytealign (w[39], w[40], offset);
      w[52] = hc_bytealign (w[38], w[39], offset);
      w[51] = hc_bytealign (w[37], w[38], offset);
      w[50] = hc_bytealign (w[36], w[37], offset);
      w[49] = hc_bytealign (w[35], w[36], offset);
      w[48] = hc_bytealign (w[34], w[35], offset);
      w[47] = hc_bytealign (w[33], w[34], offset);
      w[46] = hc_bytealign (w[32], w[33], offset);
      w[45] = hc_bytealign (w[31], w[32], offset);
      w[44] = hc_bytealign (w[30], w[31], offset);
      w[43] = hc_bytealign (w[29], w[30], offset);
      w[42] = hc_bytealign (w[28], w[29], offset);
      w[41] = hc_bytealign (w[27], w[28], offset);
      w[40] = hc_bytealign (w[26], w[27], offset);
      w[39] = hc_bytealign (w[25], w[26], offset);
      w[38] = hc_bytealign (w[24], w[25], offset);
      w[37] = hc_bytealign (w[23], w[24], offset);
      w[36] = hc_bytealign (w[22], w[23], offset);
      w[35] = hc_bytealign (w[21], w[22], offset);
      w[34] = hc_bytealign (w[20], w[21], offset);
      w[33] = hc_bytealign (w[19], w[20], offset);
      w[32] = hc_bytealign (w[18], w[19], offset);
      w[31] = hc_bytealign (w[17], w[18], offset);
      w[30] = hc_bytealign (w[16], w[17], offset);
      w[29] = hc_bytealign (w[15], w[16], offset);
      w[28] = hc_bytealign (w[14], w[15], offset);
      w[27] = hc_bytealign (w[13], w[14], offset);
      w[26] = hc_bytealign (w[12], w[13], offset);
      w[25] = hc_bytealign (w[11], w[12], offset);
      w[24] = hc_bytealign (w[10], w[11], offset);
      w[23] = hc_bytealign (w[ 9], w[10], offset);
      w[22] = hc_bytealign (w[ 8], w[ 9], offset);
      w[21] = hc_bytealign (w[ 7], w[ 8], offset);
      w[20] = hc_bytealign (w[ 6], w[ 7], offset);
      w[19] = hc_bytealign (w[ 5], w[ 6], offset);
      w[18] = hc_bytealign (w[ 4], w[ 5], offset);
      w[17] = hc_bytealign (w[ 3], w[ 4], offset);
      w[16] = hc_bytealign (w[ 2], w[ 3], offset);
      w[15] = hc_bytealign (w[ 1], w[ 2], offset);
      w[14] = hc_bytealign (w[ 0], w[ 1], offset);
      w[13] = hc_bytealign (    0, w[ 0], offset);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_bytealign (w[48], w[49], offset);
      w[62] = hc_bytealign (w[47], w[48], offset);
      w[61] = hc_bytealign (w[46], w[47], offset);
      w[60] = hc_bytealign (w[45], w[46], offset);
      w[59] = hc_bytealign (w[44], w[45], offset);
      w[58] = hc_bytealign (w[43], w[44], offset);
      w[57] = hc_bytealign (w[42], w[43], offset);
      w[56] = hc_bytealign (w[41], w[42], offset);
      w[55] = hc_bytealign (w[40], w[41], offset);
      w[54] = hc_bytealign (w[39], w[40], offset);
      w[53] = hc_bytealign (w[38], w[39], offset);
      w[52] = hc_bytealign (w[37], w[38], offset);
      w[51] = hc_bytealign (w[36], w[37], offset);
      w[50] = hc_bytealign (w[35], w[36], offset);
      w[49] = hc_bytealign (w[34], w[35], offset);
      w[48] = hc_bytealign (w[33], w[34], offset);
      w[47] = hc_bytealign (w[32], w[33], offset);
      w[46] = hc_bytealign (w[31], w[32], offset);
      w[45] = hc_bytealign (w[30], w[31], offset);
      w[44] = hc_bytealign (w[29], w[30], offset);
      w[43] = hc_bytealign (w[28], w[29], offset);
      w[42] = hc_bytealign (w[27], w[28], offset);
      w[41] = hc_bytealign (w[26], w[27], offset);
      w[40] = hc_bytealign (w[25], w[26], offset);
      w[39] = hc_bytealign (w[24], w[25], offset);
      w[38] = hc_bytealign (w[23], w[24], offset);
      w[37] = hc_bytealign (w[22], w[23], offset);
      w[36] = hc_bytealign (w[21], w[22], offset);
      w[35] = hc_bytealign (w[20], w[21], offset);
      w[34] = hc_bytealign (w[19], w[20], offset);
      w[33] = hc_bytealign (w[18], w[19], offset);
      w[32] = hc_bytealign (w[17], w[18], offset);
      w[31] = hc_bytealign (w[16], w[17], offset);
      w[30] = hc_bytealign (w[15], w[16], offset);
      w[29] = hc_bytealign (w[14], w[15], offset);
      w[28] = hc_bytealign (w[13], w[14], offset);
      w[27] = hc_bytealign (w[12], w[13], offset);
      w[26] = hc_bytealign (w[11], w[12], offset);
      w[25] = hc_bytealign (w[10], w[11], offset);
      w[24] = hc_bytealign (w[ 9], w[10], offset);
      w[23] = hc_bytealign (w[ 8], w[ 9], offset);
      w[22] = hc_bytealign (w[ 7], w[ 8], offset);
      w[21] = hc_bytealign (w[ 6], w[ 7], offset);
      w[20] = hc_bytealign (w[ 5], w[ 6], offset);
      w[19] = hc_bytealign (w[ 4], w[ 5], offset);
      w[18] = hc_bytealign (w[ 3], w[ 4], offset);
      w[17] = hc_bytealign (w[ 2], w[ 3], offset);
      w[16] = hc_bytealign (w[ 1], w[ 2], offset);
      w[15] = hc_bytealign (w[ 0], w[ 1], offset);
      w[14] = hc_bytealign (    0, w[ 0], offset);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_bytealign (w[47], w[48], offset);
      w[62] = hc_bytealign (w[46], w[47], offset);
      w[61] = hc_bytealign (w[45], w[46], offset);
      w[60] = hc_bytealign (w[44], w[45], offset);
      w[59] = hc_bytealign (w[43], w[44], offset);
      w[58] = hc_bytealign (w[42], w[43], offset);
      w[57] = hc_bytealign (w[41], w[42], offset);
      w[56] = hc_bytealign (w[40], w[41], offset);
      w[55] = hc_bytealign (w[39], w[40], offset);
      w[54] = hc_bytealign (w[38], w[39], offset);
      w[53] = hc_bytealign (w[37], w[38], offset);
      w[52] = hc_bytealign (w[36], w[37], offset);
      w[51] = hc_bytealign (w[35], w[36], offset);
      w[50] = hc_bytealign (w[34], w[35], offset);
      w[49] = hc_bytealign (w[33], w[34], offset);
      w[48] = hc_bytealign (w[32], w[33], offset);
      w[47] = hc_bytealign (w[31], w[32], offset);
      w[46] = hc_bytealign (w[30], w[31], offset);
      w[45] = hc_bytealign (w[29], w[30], offset);
      w[44] = hc_bytealign (w[28], w[29], offset);
      w[43] = hc_bytealign (w[27], w[28], offset);
      w[42] = hc_bytealign (w[26], w[27], offset);
      w[41] = hc_bytealign (w[25], w[26], offset);
      w[40] = hc_bytealign (w[24], w[25], offset);
      w[39] = hc_bytealign (w[23], w[24], offset);
      w[38] = hc_bytealign (w[22], w[23], offset);
      w[37] = hc_bytealign (w[21], w[22], offset);
      w[36] = hc_bytealign (w[20], w[21], offset);
      w[35] = hc_bytealign (w[19], w[20], offset);
      w[34] = hc_bytealign (w[18], w[19], offset);
      w[33] = hc_bytealign (w[17], w[18], offset);
      w[32] = hc_bytealign (w[16], w[17], offset);
      w[31] = hc_bytealign (w[15], w[16], offset);
      w[30] = hc_bytealign (w[14], w[15], offset);
      w[29] = hc_bytealign (w[13], w[14], offset);
      w[28] = hc_bytealign (w[12], w[13], offset);
      w[27] = hc_bytealign (w[11], w[12], offset);
      w[26] = hc_bytealign (w[10], w[11], offset);
      w[25] = hc_bytealign (w[ 9], w[10], offset);
      w[24] = hc_bytealign (w[ 8], w[ 9], offset);
      w[23] = hc_bytealign (w[ 7], w[ 8], offset);
      w[22] = hc_bytealign (w[ 6], w[ 7], offset);
      w[21] = hc_bytealign (w[ 5], w[ 6], offset);
      w[20] = hc_bytealign (w[ 4], w[ 5], offset);
      w[19] = hc_bytealign (w[ 3], w[ 4], offset);
      w[18] = hc_bytealign (w[ 2], w[ 3], offset);
      w[17] = hc_bytealign (w[ 1], w[ 2], offset);
      w[16] = hc_bytealign (w[ 0], w[ 1], offset);
      w[15] = hc_bytealign (    0, w[ 0], offset);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_bytealign (w[46], w[47], offset);
      w[62] = hc_bytealign (w[45], w[46], offset);
      w[61] = hc_bytealign (w[44], w[45], offset);
      w[60] = hc_bytealign (w[43], w[44], offset);
      w[59] = hc_bytealign (w[42], w[43], offset);
      w[58] = hc_bytealign (w[41], w[42], offset);
      w[57] = hc_bytealign (w[40], w[41], offset);
      w[56] = hc_bytealign (w[39], w[40], offset);
      w[55] = hc_bytealign (w[38], w[39], offset);
      w[54] = hc_bytealign (w[37], w[38], offset);
      w[53] = hc_bytealign (w[36], w[37], offset);
      w[52] = hc_bytealign (w[35], w[36], offset);
      w[51] = hc_bytealign (w[34], w[35], offset);
      w[50] = hc_bytealign (w[33], w[34], offset);
      w[49] = hc_bytealign (w[32], w[33], offset);
      w[48] = hc_bytealign (w[31], w[32], offset);
      w[47] = hc_bytealign (w[30], w[31], offset);
      w[46] = hc_bytealign (w[29], w[30], offset);
      w[45] = hc_bytealign (w[28], w[29], offset);
      w[44] = hc_bytealign (w[27], w[28], offset);
      w[43] = hc_bytealign (w[26], w[27], offset);
      w[42] = hc_bytealign (w[25], w[26], offset);
      w[41] = hc_bytealign (w[24], w[25], offset);
      w[40] = hc_bytealign (w[23], w[24], offset);
      w[39] = hc_bytealign (w[22], w[23], offset);
      w[38] = hc_bytealign (w[21], w[22], offset);
      w[37] = hc_bytealign (w[20], w[21], offset);
      w[36] = hc_bytealign (w[19], w[20], offset);
      w[35] = hc_bytealign (w[18], w[19], offset);
      w[34] = hc_bytealign (w[17], w[18], offset);
      w[33] = hc_bytealign (w[16], w[17], offset);
      w[32] = hc_bytealign (w[15], w[16], offset);
      w[31] = hc_bytealign (w[14], w[15], offset);
      w[30] = hc_bytealign (w[13], w[14], offset);
      w[29] = hc_bytealign (w[12], w[13], offset);
      w[28] = hc_bytealign (w[11], w[12], offset);
      w[27] = hc_bytealign (w[10], w[11], offset);
      w[26] = hc_bytealign (w[ 9], w[10], offset);
      w[25] = hc_bytealign (w[ 8], w[ 9], offset);
      w[24] = hc_bytealign (w[ 7], w[ 8], offset);
      w[23] = hc_bytealign (w[ 6], w[ 7], offset);
      w[22] = hc_bytealign (w[ 5], w[ 6], offset);
      w[21] = hc_bytealign (w[ 4], w[ 5], offset);
      w[20] = hc_bytealign (w[ 3], w[ 4], offset);
      w[19] = hc_bytealign (w[ 2], w[ 3], offset);
      w[18] = hc_bytealign (w[ 1], w[ 2], offset);
      w[17] = hc_bytealign (w[ 0], w[ 1], offset);
      w[16] = hc_bytealign (    0, w[ 0], offset);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_bytealign (w[45], w[46], offset);
      w[62] = hc_bytealign (w[44], w[45], offset);
      w[61] = hc_bytealign (w[43], w[44], offset);
      w[60] = hc_bytealign (w[42], w[43], offset);
      w[59] = hc_bytealign (w[41], w[42], offset);
      w[58] = hc_bytealign (w[40], w[41], offset);
      w[57] = hc_bytealign (w[39], w[40], offset);
      w[56] = hc_bytealign (w[38], w[39], offset);
      w[55] = hc_bytealign (w[37], w[38], offset);
      w[54] = hc_bytealign (w[36], w[37], offset);
      w[53] = hc_bytealign (w[35], w[36], offset);
      w[52] = hc_bytealign (w[34], w[35], offset);
      w[51] = hc_bytealign (w[33], w[34], offset);
      w[50] = hc_bytealign (w[32], w[33], offset);
      w[49] = hc_bytealign (w[31], w[32], offset);
      w[48] = hc_bytealign (w[30], w[31], offset);
      w[47] = hc_bytealign (w[29], w[30], offset);
      w[46] = hc_bytealign (w[28], w[29], offset);
      w[45] = hc_bytealign (w[27], w[28], offset);
      w[44] = hc_bytealign (w[26], w[27], offset);
      w[43] = hc_bytealign (w[25], w[26], offset);
      w[42] = hc_bytealign (w[24], w[25], offset);
      w[41] = hc_bytealign (w[23], w[24], offset);
      w[40] = hc_bytealign (w[22], w[23], offset);
      w[39] = hc_bytealign (w[21], w[22], offset);
      w[38] = hc_bytealign (w[20], w[21], offset);
      w[37] = hc_bytealign (w[19], w[20], offset);
      w[36] = hc_bytealign (w[18], w[19], offset);
      w[35] = hc_bytealign (w[17], w[18], offset);
      w[34] = hc_bytealign (w[16], w[17], offset);
      w[33] = hc_bytealign (w[15], w[16], offset);
      w[32] = hc_bytealign (w[14], w[15], offset);
      w[31] = hc_bytealign (w[13], w[14], offset);
      w[30] = hc_bytealign (w[12], w[13], offset);
      w[29] = hc_bytealign (w[11], w[12], offset);
      w[28] = hc_bytealign (w[10], w[11], offset);
      w[27] = hc_bytealign (w[ 9], w[10], offset);
      w[26] = hc_bytealign (w[ 8], w[ 9], offset);
      w[25] = hc_bytealign (w[ 7], w[ 8], offset);
      w[24] = hc_bytealign (w[ 6], w[ 7], offset);
      w[23] = hc_bytealign (w[ 5], w[ 6], offset);
      w[22] = hc_bytealign (w[ 4], w[ 5], offset);
      w[21] = hc_bytealign (w[ 3], w[ 4], offset);
      w[20] = hc_bytealign (w[ 2], w[ 3], offset);
      w[19] = hc_bytealign (w[ 1], w[ 2], offset);
      w[18] = hc_bytealign (w[ 0], w[ 1], offset);
      w[17] = hc_bytealign (    0, w[ 0], offset);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_bytealign (w[44], w[45], offset);
      w[62] = hc_bytealign (w[43], w[44], offset);
      w[61] = hc_bytealign (w[42], w[43], offset);
      w[60] = hc_bytealign (w[41], w[42], offset);
      w[59] = hc_bytealign (w[40], w[41], offset);
      w[58] = hc_bytealign (w[39], w[40], offset);
      w[57] = hc_bytealign (w[38], w[39], offset);
      w[56] = hc_bytealign (w[37], w[38], offset);
      w[55] = hc_bytealign (w[36], w[37], offset);
      w[54] = hc_bytealign (w[35], w[36], offset);
      w[53] = hc_bytealign (w[34], w[35], offset);
      w[52] = hc_bytealign (w[33], w[34], offset);
      w[51] = hc_bytealign (w[32], w[33], offset);
      w[50] = hc_bytealign (w[31], w[32], offset);
      w[49] = hc_bytealign (w[30], w[31], offset);
      w[48] = hc_bytealign (w[29], w[30], offset);
      w[47] = hc_bytealign (w[28], w[29], offset);
      w[46] = hc_bytealign (w[27], w[28], offset);
      w[45] = hc_bytealign (w[26], w[27], offset);
      w[44] = hc_bytealign (w[25], w[26], offset);
      w[43] = hc_bytealign (w[24], w[25], offset);
      w[42] = hc_bytealign (w[23], w[24], offset);
      w[41] = hc_bytealign (w[22], w[23], offset);
      w[40] = hc_bytealign (w[21], w[22], offset);
      w[39] = hc_bytealign (w[20], w[21], offset);
      w[38] = hc_bytealign (w[19], w[20], offset);
      w[37] = hc_bytealign (w[18], w[19], offset);
      w[36] = hc_bytealign (w[17], w[18], offset);
      w[35] = hc_bytealign (w[16], w[17], offset);
      w[34] = hc_bytealign (w[15], w[16], offset);
      w[33] = hc_bytealign (w[14], w[15], offset);
      w[32] = hc_bytealign (w[13], w[14], offset);
      w[31] = hc_bytealign (w[12], w[13], offset);
      w[30] = hc_bytealign (w[11], w[12], offset);
      w[29] = hc_bytealign (w[10], w[11], offset);
      w[28] = hc_bytealign (w[ 9], w[10], offset);
      w[27] = hc_bytealign (w[ 8], w[ 9], offset);
      w[26] = hc_bytealign (w[ 7], w[ 8], offset);
      w[25] = hc_bytealign (w[ 6], w[ 7], offset);
      w[24] = hc_bytealign (w[ 5], w[ 6], offset);
      w[23] = hc_bytealign (w[ 4], w[ 5], offset);
      w[22] = hc_bytealign (w[ 3], w[ 4], offset);
      w[21] = hc_bytealign (w[ 2], w[ 3], offset);
      w[20] = hc_bytealign (w[ 1], w[ 2], offset);
      w[19] = hc_bytealign (w[ 0], w[ 1], offset);
      w[18] = hc_bytealign (    0, w[ 0], offset);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_bytealign (w[43], w[44], offset);
      w[62] = hc_bytealign (w[42], w[43], offset);
      w[61] = hc_bytealign (w[41], w[42], offset);
      w[60] = hc_bytealign (w[40], w[41], offset);
      w[59] = hc_bytealign (w[39], w[40], offset);
      w[58] = hc_bytealign (w[38], w[39], offset);
      w[57] = hc_bytealign (w[37], w[38], offset);
      w[56] = hc_bytealign (w[36], w[37], offset);
      w[55] = hc_bytealign (w[35], w[36], offset);
      w[54] = hc_bytealign (w[34], w[35], offset);
      w[53] = hc_bytealign (w[33], w[34], offset);
      w[52] = hc_bytealign (w[32], w[33], offset);
      w[51] = hc_bytealign (w[31], w[32], offset);
      w[50] = hc_bytealign (w[30], w[31], offset);
      w[49] = hc_bytealign (w[29], w[30], offset);
      w[48] = hc_bytealign (w[28], w[29], offset);
      w[47] = hc_bytealign (w[27], w[28], offset);
      w[46] = hc_bytealign (w[26], w[27], offset);
      w[45] = hc_bytealign (w[25], w[26], offset);
      w[44] = hc_bytealign (w[24], w[25], offset);
      w[43] = hc_bytealign (w[23], w[24], offset);
      w[42] = hc_bytealign (w[22], w[23], offset);
      w[41] = hc_bytealign (w[21], w[22], offset);
      w[40] = hc_bytealign (w[20], w[21], offset);
      w[39] = hc_bytealign (w[19], w[20], offset);
      w[38] = hc_bytealign (w[18], w[19], offset);
      w[37] = hc_bytealign (w[17], w[18], offset);
      w[36] = hc_bytealign (w[16], w[17], offset);
      w[35] = hc_bytealign (w[15], w[16], offset);
      w[34] = hc_bytealign (w[14], w[15], offset);
      w[33] = hc_bytealign (w[13], w[14], offset);
      w[32] = hc_bytealign (w[12], w[13], offset);
      w[31] = hc_bytealign (w[11], w[12], offset);
      w[30] = hc_bytealign (w[10], w[11], offset);
      w[29] = hc_bytealign (w[ 9], w[10], offset);
      w[28] = hc_bytealign (w[ 8], w[ 9], offset);
      w[27] = hc_bytealign (w[ 7], w[ 8], offset);
      w[26] = hc_bytealign (w[ 6], w[ 7], offset);
      w[25] = hc_bytealign (w[ 5], w[ 6], offset);
      w[24] = hc_bytealign (w[ 4], w[ 5], offset);
      w[23] = hc_bytealign (w[ 3], w[ 4], offset);
      w[22] = hc_bytealign (w[ 2], w[ 3], offset);
      w[21] = hc_bytealign (w[ 1], w[ 2], offset);
      w[20] = hc_bytealign (w[ 0], w[ 1], offset);
      w[19] = hc_bytealign (    0, w[ 0], offset);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_bytealign (w[42], w[43], offset);
      w[62] = hc_bytealign (w[41], w[42], offset);
      w[61] = hc_bytealign (w[40], w[41], offset);
      w[60] = hc_bytealign (w[39], w[40], offset);
      w[59] = hc_bytealign (w[38], w[39], offset);
      w[58] = hc_bytealign (w[37], w[38], offset);
      w[57] = hc_bytealign (w[36], w[37], offset);
      w[56] = hc_bytealign (w[35], w[36], offset);
      w[55] = hc_bytealign (w[34], w[35], offset);
      w[54] = hc_bytealign (w[33], w[34], offset);
      w[53] = hc_bytealign (w[32], w[33], offset);
      w[52] = hc_bytealign (w[31], w[32], offset);
      w[51] = hc_bytealign (w[30], w[31], offset);
      w[50] = hc_bytealign (w[29], w[30], offset);
      w[49] = hc_bytealign (w[28], w[29], offset);
      w[48] = hc_bytealign (w[27], w[28], offset);
      w[47] = hc_bytealign (w[26], w[27], offset);
      w[46] = hc_bytealign (w[25], w[26], offset);
      w[45] = hc_bytealign (w[24], w[25], offset);
      w[44] = hc_bytealign (w[23], w[24], offset);
      w[43] = hc_bytealign (w[22], w[23], offset);
      w[42] = hc_bytealign (w[21], w[22], offset);
      w[41] = hc_bytealign (w[20], w[21], offset);
      w[40] = hc_bytealign (w[19], w[20], offset);
      w[39] = hc_bytealign (w[18], w[19], offset);
      w[38] = hc_bytealign (w[17], w[18], offset);
      w[37] = hc_bytealign (w[16], w[17], offset);
      w[36] = hc_bytealign (w[15], w[16], offset);
      w[35] = hc_bytealign (w[14], w[15], offset);
      w[34] = hc_bytealign (w[13], w[14], offset);
      w[33] = hc_bytealign (w[12], w[13], offset);
      w[32] = hc_bytealign (w[11], w[12], offset);
      w[31] = hc_bytealign (w[10], w[11], offset);
      w[30] = hc_bytealign (w[ 9], w[10], offset);
      w[29] = hc_bytealign (w[ 8], w[ 9], offset);
      w[28] = hc_bytealign (w[ 7], w[ 8], offset);
      w[27] = hc_bytealign (w[ 6], w[ 7], offset);
      w[26] = hc_bytealign (w[ 5], w[ 6], offset);
      w[25] = hc_bytealign (w[ 4], w[ 5], offset);
      w[24] = hc_bytealign (w[ 3], w[ 4], offset);
      w[23] = hc_bytealign (w[ 2], w[ 3], offset);
      w[22] = hc_bytealign (w[ 1], w[ 2], offset);
      w[21] = hc_bytealign (w[ 0], w[ 1], offset);
      w[20] = hc_bytealign (    0, w[ 0], offset);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_bytealign (w[41], w[42], offset);
      w[62] = hc_bytealign (w[40], w[41], offset);
      w[61] = hc_bytealign (w[39], w[40], offset);
      w[60] = hc_bytealign (w[38], w[39], offset);
      w[59] = hc_bytealign (w[37], w[38], offset);
      w[58] = hc_bytealign (w[36], w[37], offset);
      w[57] = hc_bytealign (w[35], w[36], offset);
      w[56] = hc_bytealign (w[34], w[35], offset);
      w[55] = hc_bytealign (w[33], w[34], offset);
      w[54] = hc_bytealign (w[32], w[33], offset);
      w[53] = hc_bytealign (w[31], w[32], offset);
      w[52] = hc_bytealign (w[30], w[31], offset);
      w[51] = hc_bytealign (w[29], w[30], offset);
      w[50] = hc_bytealign (w[28], w[29], offset);
      w[49] = hc_bytealign (w[27], w[28], offset);
      w[48] = hc_bytealign (w[26], w[27], offset);
      w[47] = hc_bytealign (w[25], w[26], offset);
      w[46] = hc_bytealign (w[24], w[25], offset);
      w[45] = hc_bytealign (w[23], w[24], offset);
      w[44] = hc_bytealign (w[22], w[23], offset);
      w[43] = hc_bytealign (w[21], w[22], offset);
      w[42] = hc_bytealign (w[20], w[21], offset);
      w[41] = hc_bytealign (w[19], w[20], offset);
      w[40] = hc_bytealign (w[18], w[19], offset);
      w[39] = hc_bytealign (w[17], w[18], offset);
      w[38] = hc_bytealign (w[16], w[17], offset);
      w[37] = hc_bytealign (w[15], w[16], offset);
      w[36] = hc_bytealign (w[14], w[15], offset);
      w[35] = hc_bytealign (w[13], w[14], offset);
      w[34] = hc_bytealign (w[12], w[13], offset);
      w[33] = hc_bytealign (w[11], w[12], offset);
      w[32] = hc_bytealign (w[10], w[11], offset);
      w[31] = hc_bytealign (w[ 9], w[10], offset);
      w[30] = hc_bytealign (w[ 8], w[ 9], offset);
      w[29] = hc_bytealign (w[ 7], w[ 8], offset);
      w[28] = hc_bytealign (w[ 6], w[ 7], offset);
      w[27] = hc_bytealign (w[ 5], w[ 6], offset);
      w[26] = hc_bytealign (w[ 4], w[ 5], offset);
      w[25] = hc_bytealign (w[ 3], w[ 4], offset);
      w[24] = hc_bytealign (w[ 2], w[ 3], offset);
      w[23] = hc_bytealign (w[ 1], w[ 2], offset);
      w[22] = hc_bytealign (w[ 0], w[ 1], offset);
      w[21] = hc_bytealign (    0, w[ 0], offset);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_bytealign (w[40], w[41], offset);
      w[62] = hc_bytealign (w[39], w[40], offset);
      w[61] = hc_bytealign (w[38], w[39], offset);
      w[60] = hc_bytealign (w[37], w[38], offset);
      w[59] = hc_bytealign (w[36], w[37], offset);
      w[58] = hc_bytealign (w[35], w[36], offset);
      w[57] = hc_bytealign (w[34], w[35], offset);
      w[56] = hc_bytealign (w[33], w[34], offset);
      w[55] = hc_bytealign (w[32], w[33], offset);
      w[54] = hc_bytealign (w[31], w[32], offset);
      w[53] = hc_bytealign (w[30], w[31], offset);
      w[52] = hc_bytealign (w[29], w[30], offset);
      w[51] = hc_bytealign (w[28], w[29], offset);
      w[50] = hc_bytealign (w[27], w[28], offset);
      w[49] = hc_bytealign (w[26], w[27], offset);
      w[48] = hc_bytealign (w[25], w[26], offset);
      w[47] = hc_bytealign (w[24], w[25], offset);
      w[46] = hc_bytealign (w[23], w[24], offset);
      w[45] = hc_bytealign (w[22], w[23], offset);
      w[44] = hc_bytealign (w[21], w[22], offset);
      w[43] = hc_bytealign (w[20], w[21], offset);
      w[42] = hc_bytealign (w[19], w[20], offset);
      w[41] = hc_bytealign (w[18], w[19], offset);
      w[40] = hc_bytealign (w[17], w[18], offset);
      w[39] = hc_bytealign (w[16], w[17], offset);
      w[38] = hc_bytealign (w[15], w[16], offset);
      w[37] = hc_bytealign (w[14], w[15], offset);
      w[36] = hc_bytealign (w[13], w[14], offset);
      w[35] = hc_bytealign (w[12], w[13], offset);
      w[34] = hc_bytealign (w[11], w[12], offset);
      w[33] = hc_bytealign (w[10], w[11], offset);
      w[32] = hc_bytealign (w[ 9], w[10], offset);
      w[31] = hc_bytealign (w[ 8], w[ 9], offset);
      w[30] = hc_bytealign (w[ 7], w[ 8], offset);
      w[29] = hc_bytealign (w[ 6], w[ 7], offset);
      w[28] = hc_bytealign (w[ 5], w[ 6], offset);
      w[27] = hc_bytealign (w[ 4], w[ 5], offset);
      w[26] = hc_bytealign (w[ 3], w[ 4], offset);
      w[25] = hc_bytealign (w[ 2], w[ 3], offset);
      w[24] = hc_bytealign (w[ 1], w[ 2], offset);
      w[23] = hc_bytealign (w[ 0], w[ 1], offset);
      w[22] = hc_bytealign (    0, w[ 0], offset);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_bytealign (w[39], w[40], offset);
      w[62] = hc_bytealign (w[38], w[39], offset);
      w[61] = hc_bytealign (w[37], w[38], offset);
      w[60] = hc_bytealign (w[36], w[37], offset);
      w[59] = hc_bytealign (w[35], w[36], offset);
      w[58] = hc_bytealign (w[34], w[35], offset);
      w[57] = hc_bytealign (w[33], w[34], offset);
      w[56] = hc_bytealign (w[32], w[33], offset);
      w[55] = hc_bytealign (w[31], w[32], offset);
      w[54] = hc_bytealign (w[30], w[31], offset);
      w[53] = hc_bytealign (w[29], w[30], offset);
      w[52] = hc_bytealign (w[28], w[29], offset);
      w[51] = hc_bytealign (w[27], w[28], offset);
      w[50] = hc_bytealign (w[26], w[27], offset);
      w[49] = hc_bytealign (w[25], w[26], offset);
      w[48] = hc_bytealign (w[24], w[25], offset);
      w[47] = hc_bytealign (w[23], w[24], offset);
      w[46] = hc_bytealign (w[22], w[23], offset);
      w[45] = hc_bytealign (w[21], w[22], offset);
      w[44] = hc_bytealign (w[20], w[21], offset);
      w[43] = hc_bytealign (w[19], w[20], offset);
      w[42] = hc_bytealign (w[18], w[19], offset);
      w[41] = hc_bytealign (w[17], w[18], offset);
      w[40] = hc_bytealign (w[16], w[17], offset);
      w[39] = hc_bytealign (w[15], w[16], offset);
      w[38] = hc_bytealign (w[14], w[15], offset);
      w[37] = hc_bytealign (w[13], w[14], offset);
      w[36] = hc_bytealign (w[12], w[13], offset);
      w[35] = hc_bytealign (w[11], w[12], offset);
      w[34] = hc_bytealign (w[10], w[11], offset);
      w[33] = hc_bytealign (w[ 9], w[10], offset);
      w[32] = hc_bytealign (w[ 8], w[ 9], offset);
      w[31] = hc_bytealign (w[ 7], w[ 8], offset);
      w[30] = hc_bytealign (w[ 6], w[ 7], offset);
      w[29] = hc_bytealign (w[ 5], w[ 6], offset);
      w[28] = hc_bytealign (w[ 4], w[ 5], offset);
      w[27] = hc_bytealign (w[ 3], w[ 4], offset);
      w[26] = hc_bytealign (w[ 2], w[ 3], offset);
      w[25] = hc_bytealign (w[ 1], w[ 2], offset);
      w[24] = hc_bytealign (w[ 0], w[ 1], offset);
      w[23] = hc_bytealign (    0, w[ 0], offset);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_bytealign (w[38], w[39], offset);
      w[62] = hc_bytealign (w[37], w[38], offset);
      w[61] = hc_bytealign (w[36], w[37], offset);
      w[60] = hc_bytealign (w[35], w[36], offset);
      w[59] = hc_bytealign (w[34], w[35], offset);
      w[58] = hc_bytealign (w[33], w[34], offset);
      w[57] = hc_bytealign (w[32], w[33], offset);
      w[56] = hc_bytealign (w[31], w[32], offset);
      w[55] = hc_bytealign (w[30], w[31], offset);
      w[54] = hc_bytealign (w[29], w[30], offset);
      w[53] = hc_bytealign (w[28], w[29], offset);
      w[52] = hc_bytealign (w[27], w[28], offset);
      w[51] = hc_bytealign (w[26], w[27], offset);
      w[50] = hc_bytealign (w[25], w[26], offset);
      w[49] = hc_bytealign (w[24], w[25], offset);
      w[48] = hc_bytealign (w[23], w[24], offset);
      w[47] = hc_bytealign (w[22], w[23], offset);
      w[46] = hc_bytealign (w[21], w[22], offset);
      w[45] = hc_bytealign (w[20], w[21], offset);
      w[44] = hc_bytealign (w[19], w[20], offset);
      w[43] = hc_bytealign (w[18], w[19], offset);
      w[42] = hc_bytealign (w[17], w[18], offset);
      w[41] = hc_bytealign (w[16], w[17], offset);
      w[40] = hc_bytealign (w[15], w[16], offset);
      w[39] = hc_bytealign (w[14], w[15], offset);
      w[38] = hc_bytealign (w[13], w[14], offset);
      w[37] = hc_bytealign (w[12], w[13], offset);
      w[36] = hc_bytealign (w[11], w[12], offset);
      w[35] = hc_bytealign (w[10], w[11], offset);
      w[34] = hc_bytealign (w[ 9], w[10], offset);
      w[33] = hc_bytealign (w[ 8], w[ 9], offset);
      w[32] = hc_bytealign (w[ 7], w[ 8], offset);
      w[31] = hc_bytealign (w[ 6], w[ 7], offset);
      w[30] = hc_bytealign (w[ 5], w[ 6], offset);
      w[29] = hc_bytealign (w[ 4], w[ 5], offset);
      w[28] = hc_bytealign (w[ 3], w[ 4], offset);
      w[27] = hc_bytealign (w[ 2], w[ 3], offset);
      w[26] = hc_bytealign (w[ 1], w[ 2], offset);
      w[25] = hc_bytealign (w[ 0], w[ 1], offset);
      w[24] = hc_bytealign (    0, w[ 0], offset);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_bytealign (w[37], w[38], offset);
      w[62] = hc_bytealign (w[36], w[37], offset);
      w[61] = hc_bytealign (w[35], w[36], offset);
      w[60] = hc_bytealign (w[34], w[35], offset);
      w[59] = hc_bytealign (w[33], w[34], offset);
      w[58] = hc_bytealign (w[32], w[33], offset);
      w[57] = hc_bytealign (w[31], w[32], offset);
      w[56] = hc_bytealign (w[30], w[31], offset);
      w[55] = hc_bytealign (w[29], w[30], offset);
      w[54] = hc_bytealign (w[28], w[29], offset);
      w[53] = hc_bytealign (w[27], w[28], offset);
      w[52] = hc_bytealign (w[26], w[27], offset);
      w[51] = hc_bytealign (w[25], w[26], offset);
      w[50] = hc_bytealign (w[24], w[25], offset);
      w[49] = hc_bytealign (w[23], w[24], offset);
      w[48] = hc_bytealign (w[22], w[23], offset);
      w[47] = hc_bytealign (w[21], w[22], offset);
      w[46] = hc_bytealign (w[20], w[21], offset);
      w[45] = hc_bytealign (w[19], w[20], offset);
      w[44] = hc_bytealign (w[18], w[19], offset);
      w[43] = hc_bytealign (w[17], w[18], offset);
      w[42] = hc_bytealign (w[16], w[17], offset);
      w[41] = hc_bytealign (w[15], w[16], offset);
      w[40] = hc_bytealign (w[14], w[15], offset);
      w[39] = hc_bytealign (w[13], w[14], offset);
      w[38] = hc_bytealign (w[12], w[13], offset);
      w[37] = hc_bytealign (w[11], w[12], offset);
      w[36] = hc_bytealign (w[10], w[11], offset);
      w[35] = hc_bytealign (w[ 9], w[10], offset);
      w[34] = hc_bytealign (w[ 8], w[ 9], offset);
      w[33] = hc_bytealign (w[ 7], w[ 8], offset);
      w[32] = hc_bytealign (w[ 6], w[ 7], offset);
      w[31] = hc_bytealign (w[ 5], w[ 6], offset);
      w[30] = hc_bytealign (w[ 4], w[ 5], offset);
      w[29] = hc_bytealign (w[ 3], w[ 4], offset);
      w[28] = hc_bytealign (w[ 2], w[ 3], offset);
      w[27] = hc_bytealign (w[ 1], w[ 2], offset);
      w[26] = hc_bytealign (w[ 0], w[ 1], offset);
      w[25] = hc_bytealign (    0, w[ 0], offset);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_bytealign (w[36], w[37], offset);
      w[62] = hc_bytealign (w[35], w[36], offset);
      w[61] = hc_bytealign (w[34], w[35], offset);
      w[60] = hc_bytealign (w[33], w[34], offset);
      w[59] = hc_bytealign (w[32], w[33], offset);
      w[58] = hc_bytealign (w[31], w[32], offset);
      w[57] = hc_bytealign (w[30], w[31], offset);
      w[56] = hc_bytealign (w[29], w[30], offset);
      w[55] = hc_bytealign (w[28], w[29], offset);
      w[54] = hc_bytealign (w[27], w[28], offset);
      w[53] = hc_bytealign (w[26], w[27], offset);
      w[52] = hc_bytealign (w[25], w[26], offset);
      w[51] = hc_bytealign (w[24], w[25], offset);
      w[50] = hc_bytealign (w[23], w[24], offset);
      w[49] = hc_bytealign (w[22], w[23], offset);
      w[48] = hc_bytealign (w[21], w[22], offset);
      w[47] = hc_bytealign (w[20], w[21], offset);
      w[46] = hc_bytealign (w[19], w[20], offset);
      w[45] = hc_bytealign (w[18], w[19], offset);
      w[44] = hc_bytealign (w[17], w[18], offset);
      w[43] = hc_bytealign (w[16], w[17], offset);
      w[42] = hc_bytealign (w[15], w[16], offset);
      w[41] = hc_bytealign (w[14], w[15], offset);
      w[40] = hc_bytealign (w[13], w[14], offset);
      w[39] = hc_bytealign (w[12], w[13], offset);
      w[38] = hc_bytealign (w[11], w[12], offset);
      w[37] = hc_bytealign (w[10], w[11], offset);
      w[36] = hc_bytealign (w[ 9], w[10], offset);
      w[35] = hc_bytealign (w[ 8], w[ 9], offset);
      w[34] = hc_bytealign (w[ 7], w[ 8], offset);
      w[33] = hc_bytealign (w[ 6], w[ 7], offset);
      w[32] = hc_bytealign (w[ 5], w[ 6], offset);
      w[31] = hc_bytealign (w[ 4], w[ 5], offset);
      w[30] = hc_bytealign (w[ 3], w[ 4], offset);
      w[29] = hc_bytealign (w[ 2], w[ 3], offset);
      w[28] = hc_bytealign (w[ 1], w[ 2], offset);
      w[27] = hc_bytealign (w[ 0], w[ 1], offset);
      w[26] = hc_bytealign (    0, w[ 0], offset);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_bytealign (w[35], w[36], offset);
      w[62] = hc_bytealign (w[34], w[35], offset);
      w[61] = hc_bytealign (w[33], w[34], offset);
      w[60] = hc_bytealign (w[32], w[33], offset);
      w[59] = hc_bytealign (w[31], w[32], offset);
      w[58] = hc_bytealign (w[30], w[31], offset);
      w[57] = hc_bytealign (w[29], w[30], offset);
      w[56] = hc_bytealign (w[28], w[29], offset);
      w[55] = hc_bytealign (w[27], w[28], offset);
      w[54] = hc_bytealign (w[26], w[27], offset);
      w[53] = hc_bytealign (w[25], w[26], offset);
      w[52] = hc_bytealign (w[24], w[25], offset);
      w[51] = hc_bytealign (w[23], w[24], offset);
      w[50] = hc_bytealign (w[22], w[23], offset);
      w[49] = hc_bytealign (w[21], w[22], offset);
      w[48] = hc_bytealign (w[20], w[21], offset);
      w[47] = hc_bytealign (w[19], w[20], offset);
      w[46] = hc_bytealign (w[18], w[19], offset);
      w[45] = hc_bytealign (w[17], w[18], offset);
      w[44] = hc_bytealign (w[16], w[17], offset);
      w[43] = hc_bytealign (w[15], w[16], offset);
      w[42] = hc_bytealign (w[14], w[15], offset);
      w[41] = hc_bytealign (w[13], w[14], offset);
      w[40] = hc_bytealign (w[12], w[13], offset);
      w[39] = hc_bytealign (w[11], w[12], offset);
      w[38] = hc_bytealign (w[10], w[11], offset);
      w[37] = hc_bytealign (w[ 9], w[10], offset);
      w[36] = hc_bytealign (w[ 8], w[ 9], offset);
      w[35] = hc_bytealign (w[ 7], w[ 8], offset);
      w[34] = hc_bytealign (w[ 6], w[ 7], offset);
      w[33] = hc_bytealign (w[ 5], w[ 6], offset);
      w[32] = hc_bytealign (w[ 4], w[ 5], offset);
      w[31] = hc_bytealign (w[ 3], w[ 4], offset);
      w[30] = hc_bytealign (w[ 2], w[ 3], offset);
      w[29] = hc_bytealign (w[ 1], w[ 2], offset);
      w[28] = hc_bytealign (w[ 0], w[ 1], offset);
      w[27] = hc_bytealign (    0, w[ 0], offset);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_bytealign (w[34], w[35], offset);
      w[62] = hc_bytealign (w[33], w[34], offset);
      w[61] = hc_bytealign (w[32], w[33], offset);
      w[60] = hc_bytealign (w[31], w[32], offset);
      w[59] = hc_bytealign (w[30], w[31], offset);
      w[58] = hc_bytealign (w[29], w[30], offset);
      w[57] = hc_bytealign (w[28], w[29], offset);
      w[56] = hc_bytealign (w[27], w[28], offset);
      w[55] = hc_bytealign (w[26], w[27], offset);
      w[54] = hc_bytealign (w[25], w[26], offset);
      w[53] = hc_bytealign (w[24], w[25], offset);
      w[52] = hc_bytealign (w[23], w[24], offset);
      w[51] = hc_bytealign (w[22], w[23], offset);
      w[50] = hc_bytealign (w[21], w[22], offset);
      w[49] = hc_bytealign (w[20], w[21], offset);
      w[48] = hc_bytealign (w[19], w[20], offset);
      w[47] = hc_bytealign (w[18], w[19], offset);
      w[46] = hc_bytealign (w[17], w[18], offset);
      w[45] = hc_bytealign (w[16], w[17], offset);
      w[44] = hc_bytealign (w[15], w[16], offset);
      w[43] = hc_bytealign (w[14], w[15], offset);
      w[42] = hc_bytealign (w[13], w[14], offset);
      w[41] = hc_bytealign (w[12], w[13], offset);
      w[40] = hc_bytealign (w[11], w[12], offset);
      w[39] = hc_bytealign (w[10], w[11], offset);
      w[38] = hc_bytealign (w[ 9], w[10], offset);
      w[37] = hc_bytealign (w[ 8], w[ 9], offset);
      w[36] = hc_bytealign (w[ 7], w[ 8], offset);
      w[35] = hc_bytealign (w[ 6], w[ 7], offset);
      w[34] = hc_bytealign (w[ 5], w[ 6], offset);
      w[33] = hc_bytealign (w[ 4], w[ 5], offset);
      w[32] = hc_bytealign (w[ 3], w[ 4], offset);
      w[31] = hc_bytealign (w[ 2], w[ 3], offset);
      w[30] = hc_bytealign (w[ 1], w[ 2], offset);
      w[29] = hc_bytealign (w[ 0], w[ 1], offset);
      w[28] = hc_bytealign (    0, w[ 0], offset);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_bytealign (w[33], w[34], offset);
      w[62] = hc_bytealign (w[32], w[33], offset);
      w[61] = hc_bytealign (w[31], w[32], offset);
      w[60] = hc_bytealign (w[30], w[31], offset);
      w[59] = hc_bytealign (w[29], w[30], offset);
      w[58] = hc_bytealign (w[28], w[29], offset);
      w[57] = hc_bytealign (w[27], w[28], offset);
      w[56] = hc_bytealign (w[26], w[27], offset);
      w[55] = hc_bytealign (w[25], w[26], offset);
      w[54] = hc_bytealign (w[24], w[25], offset);
      w[53] = hc_bytealign (w[23], w[24], offset);
      w[52] = hc_bytealign (w[22], w[23], offset);
      w[51] = hc_bytealign (w[21], w[22], offset);
      w[50] = hc_bytealign (w[20], w[21], offset);
      w[49] = hc_bytealign (w[19], w[20], offset);
      w[48] = hc_bytealign (w[18], w[19], offset);
      w[47] = hc_bytealign (w[17], w[18], offset);
      w[46] = hc_bytealign (w[16], w[17], offset);
      w[45] = hc_bytealign (w[15], w[16], offset);
      w[44] = hc_bytealign (w[14], w[15], offset);
      w[43] = hc_bytealign (w[13], w[14], offset);
      w[42] = hc_bytealign (w[12], w[13], offset);
      w[41] = hc_bytealign (w[11], w[12], offset);
      w[40] = hc_bytealign (w[10], w[11], offset);
      w[39] = hc_bytealign (w[ 9], w[10], offset);
      w[38] = hc_bytealign (w[ 8], w[ 9], offset);
      w[37] = hc_bytealign (w[ 7], w[ 8], offset);
      w[36] = hc_bytealign (w[ 6], w[ 7], offset);
      w[35] = hc_bytealign (w[ 5], w[ 6], offset);
      w[34] = hc_bytealign (w[ 4], w[ 5], offset);
      w[33] = hc_bytealign (w[ 3], w[ 4], offset);
      w[32] = hc_bytealign (w[ 2], w[ 3], offset);
      w[31] = hc_bytealign (w[ 1], w[ 2], offset);
      w[30] = hc_bytealign (w[ 0], w[ 1], offset);
      w[29] = hc_bytealign (    0, w[ 0], offset);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_bytealign (w[32], w[33], offset);
      w[62] = hc_bytealign (w[31], w[32], offset);
      w[61] = hc_bytealign (w[30], w[31], offset);
      w[60] = hc_bytealign (w[29], w[30], offset);
      w[59] = hc_bytealign (w[28], w[29], offset);
      w[58] = hc_bytealign (w[27], w[28], offset);
      w[57] = hc_bytealign (w[26], w[27], offset);
      w[56] = hc_bytealign (w[25], w[26], offset);
      w[55] = hc_bytealign (w[24], w[25], offset);
      w[54] = hc_bytealign (w[23], w[24], offset);
      w[53] = hc_bytealign (w[22], w[23], offset);
      w[52] = hc_bytealign (w[21], w[22], offset);
      w[51] = hc_bytealign (w[20], w[21], offset);
      w[50] = hc_bytealign (w[19], w[20], offset);
      w[49] = hc_bytealign (w[18], w[19], offset);
      w[48] = hc_bytealign (w[17], w[18], offset);
      w[47] = hc_bytealign (w[16], w[17], offset);
      w[46] = hc_bytealign (w[15], w[16], offset);
      w[45] = hc_bytealign (w[14], w[15], offset);
      w[44] = hc_bytealign (w[13], w[14], offset);
      w[43] = hc_bytealign (w[12], w[13], offset);
      w[42] = hc_bytealign (w[11], w[12], offset);
      w[41] = hc_bytealign (w[10], w[11], offset);
      w[40] = hc_bytealign (w[ 9], w[10], offset);
      w[39] = hc_bytealign (w[ 8], w[ 9], offset);
      w[38] = hc_bytealign (w[ 7], w[ 8], offset);
      w[37] = hc_bytealign (w[ 6], w[ 7], offset);
      w[36] = hc_bytealign (w[ 5], w[ 6], offset);
      w[35] = hc_bytealign (w[ 4], w[ 5], offset);
      w[34] = hc_bytealign (w[ 3], w[ 4], offset);
      w[33] = hc_bytealign (w[ 2], w[ 3], offset);
      w[32] = hc_bytealign (w[ 1], w[ 2], offset);
      w[31] = hc_bytealign (w[ 0], w[ 1], offset);
      w[30] = hc_bytealign (    0, w[ 0], offset);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_bytealign (w[31], w[32], offset);
      w[62] = hc_bytealign (w[30], w[31], offset);
      w[61] = hc_bytealign (w[29], w[30], offset);
      w[60] = hc_bytealign (w[28], w[29], offset);
      w[59] = hc_bytealign (w[27], w[28], offset);
      w[58] = hc_bytealign (w[26], w[27], offset);
      w[57] = hc_bytealign (w[25], w[26], offset);
      w[56] = hc_bytealign (w[24], w[25], offset);
      w[55] = hc_bytealign (w[23], w[24], offset);
      w[54] = hc_bytealign (w[22], w[23], offset);
      w[53] = hc_bytealign (w[21], w[22], offset);
      w[52] = hc_bytealign (w[20], w[21], offset);
      w[51] = hc_bytealign (w[19], w[20], offset);
      w[50] = hc_bytealign (w[18], w[19], offset);
      w[49] = hc_bytealign (w[17], w[18], offset);
      w[48] = hc_bytealign (w[16], w[17], offset);
      w[47] = hc_bytealign (w[15], w[16], offset);
      w[46] = hc_bytealign (w[14], w[15], offset);
      w[45] = hc_bytealign (w[13], w[14], offset);
      w[44] = hc_bytealign (w[12], w[13], offset);
      w[43] = hc_bytealign (w[11], w[12], offset);
      w[42] = hc_bytealign (w[10], w[11], offset);
      w[41] = hc_bytealign (w[ 9], w[10], offset);
      w[40] = hc_bytealign (w[ 8], w[ 9], offset);
      w[39] = hc_bytealign (w[ 7], w[ 8], offset);
      w[38] = hc_bytealign (w[ 6], w[ 7], offset);
      w[37] = hc_bytealign (w[ 5], w[ 6], offset);
      w[36] = hc_bytealign (w[ 4], w[ 5], offset);
      w[35] = hc_bytealign (w[ 3], w[ 4], offset);
      w[34] = hc_bytealign (w[ 2], w[ 3], offset);
      w[33] = hc_bytealign (w[ 1], w[ 2], offset);
      w[32] = hc_bytealign (w[ 0], w[ 1], offset);
      w[31] = hc_bytealign (    0, w[ 0], offset);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_bytealign (w[30], w[31], offset);
      w[62] = hc_bytealign (w[29], w[30], offset);
      w[61] = hc_bytealign (w[28], w[29], offset);
      w[60] = hc_bytealign (w[27], w[28], offset);
      w[59] = hc_bytealign (w[26], w[27], offset);
      w[58] = hc_bytealign (w[25], w[26], offset);
      w[57] = hc_bytealign (w[24], w[25], offset);
      w[56] = hc_bytealign (w[23], w[24], offset);
      w[55] = hc_bytealign (w[22], w[23], offset);
      w[54] = hc_bytealign (w[21], w[22], offset);
      w[53] = hc_bytealign (w[20], w[21], offset);
      w[52] = hc_bytealign (w[19], w[20], offset);
      w[51] = hc_bytealign (w[18], w[19], offset);
      w[50] = hc_bytealign (w[17], w[18], offset);
      w[49] = hc_bytealign (w[16], w[17], offset);
      w[48] = hc_bytealign (w[15], w[16], offset);
      w[47] = hc_bytealign (w[14], w[15], offset);
      w[46] = hc_bytealign (w[13], w[14], offset);
      w[45] = hc_bytealign (w[12], w[13], offset);
      w[44] = hc_bytealign (w[11], w[12], offset);
      w[43] = hc_bytealign (w[10], w[11], offset);
      w[42] = hc_bytealign (w[ 9], w[10], offset);
      w[41] = hc_bytealign (w[ 8], w[ 9], offset);
      w[40] = hc_bytealign (w[ 7], w[ 8], offset);
      w[39] = hc_bytealign (w[ 6], w[ 7], offset);
      w[38] = hc_bytealign (w[ 5], w[ 6], offset);
      w[37] = hc_bytealign (w[ 4], w[ 5], offset);
      w[36] = hc_bytealign (w[ 3], w[ 4], offset);
      w[35] = hc_bytealign (w[ 2], w[ 3], offset);
      w[34] = hc_bytealign (w[ 1], w[ 2], offset);
      w[33] = hc_bytealign (w[ 0], w[ 1], offset);
      w[32] = hc_bytealign (    0, w[ 0], offset);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_bytealign (w[29], w[30], offset);
      w[62] = hc_bytealign (w[28], w[29], offset);
      w[61] = hc_bytealign (w[27], w[28], offset);
      w[60] = hc_bytealign (w[26], w[27], offset);
      w[59] = hc_bytealign (w[25], w[26], offset);
      w[58] = hc_bytealign (w[24], w[25], offset);
      w[57] = hc_bytealign (w[23], w[24], offset);
      w[56] = hc_bytealign (w[22], w[23], offset);
      w[55] = hc_bytealign (w[21], w[22], offset);
      w[54] = hc_bytealign (w[20], w[21], offset);
      w[53] = hc_bytealign (w[19], w[20], offset);
      w[52] = hc_bytealign (w[18], w[19], offset);
      w[51] = hc_bytealign (w[17], w[18], offset);
      w[50] = hc_bytealign (w[16], w[17], offset);
      w[49] = hc_bytealign (w[15], w[16], offset);
      w[48] = hc_bytealign (w[14], w[15], offset);
      w[47] = hc_bytealign (w[13], w[14], offset);
      w[46] = hc_bytealign (w[12], w[13], offset);
      w[45] = hc_bytealign (w[11], w[12], offset);
      w[44] = hc_bytealign (w[10], w[11], offset);
      w[43] = hc_bytealign (w[ 9], w[10], offset);
      w[42] = hc_bytealign (w[ 8], w[ 9], offset);
      w[41] = hc_bytealign (w[ 7], w[ 8], offset);
      w[40] = hc_bytealign (w[ 6], w[ 7], offset);
      w[39] = hc_bytealign (w[ 5], w[ 6], offset);
      w[38] = hc_bytealign (w[ 4], w[ 5], offset);
      w[37] = hc_bytealign (w[ 3], w[ 4], offset);
      w[36] = hc_bytealign (w[ 2], w[ 3], offset);
      w[35] = hc_bytealign (w[ 1], w[ 2], offset);
      w[34] = hc_bytealign (w[ 0], w[ 1], offset);
      w[33] = hc_bytealign (    0, w[ 0], offset);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_bytealign (w[28], w[29], offset);
      w[62] = hc_bytealign (w[27], w[28], offset);
      w[61] = hc_bytealign (w[26], w[27], offset);
      w[60] = hc_bytealign (w[25], w[26], offset);
      w[59] = hc_bytealign (w[24], w[25], offset);
      w[58] = hc_bytealign (w[23], w[24], offset);
      w[57] = hc_bytealign (w[22], w[23], offset);
      w[56] = hc_bytealign (w[21], w[22], offset);
      w[55] = hc_bytealign (w[20], w[21], offset);
      w[54] = hc_bytealign (w[19], w[20], offset);
      w[53] = hc_bytealign (w[18], w[19], offset);
      w[52] = hc_bytealign (w[17], w[18], offset);
      w[51] = hc_bytealign (w[16], w[17], offset);
      w[50] = hc_bytealign (w[15], w[16], offset);
      w[49] = hc_bytealign (w[14], w[15], offset);
      w[48] = hc_bytealign (w[13], w[14], offset);
      w[47] = hc_bytealign (w[12], w[13], offset);
      w[46] = hc_bytealign (w[11], w[12], offset);
      w[45] = hc_bytealign (w[10], w[11], offset);
      w[44] = hc_bytealign (w[ 9], w[10], offset);
      w[43] = hc_bytealign (w[ 8], w[ 9], offset);
      w[42] = hc_bytealign (w[ 7], w[ 8], offset);
      w[41] = hc_bytealign (w[ 6], w[ 7], offset);
      w[40] = hc_bytealign (w[ 5], w[ 6], offset);
      w[39] = hc_bytealign (w[ 4], w[ 5], offset);
      w[38] = hc_bytealign (w[ 3], w[ 4], offset);
      w[37] = hc_bytealign (w[ 2], w[ 3], offset);
      w[36] = hc_bytealign (w[ 1], w[ 2], offset);
      w[35] = hc_bytealign (w[ 0], w[ 1], offset);
      w[34] = hc_bytealign (    0, w[ 0], offset);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_bytealign (w[27], w[28], offset);
      w[62] = hc_bytealign (w[26], w[27], offset);
      w[61] = hc_bytealign (w[25], w[26], offset);
      w[60] = hc_bytealign (w[24], w[25], offset);
      w[59] = hc_bytealign (w[23], w[24], offset);
      w[58] = hc_bytealign (w[22], w[23], offset);
      w[57] = hc_bytealign (w[21], w[22], offset);
      w[56] = hc_bytealign (w[20], w[21], offset);
      w[55] = hc_bytealign (w[19], w[20], offset);
      w[54] = hc_bytealign (w[18], w[19], offset);
      w[53] = hc_bytealign (w[17], w[18], offset);
      w[52] = hc_bytealign (w[16], w[17], offset);
      w[51] = hc_bytealign (w[15], w[16], offset);
      w[50] = hc_bytealign (w[14], w[15], offset);
      w[49] = hc_bytealign (w[13], w[14], offset);
      w[48] = hc_bytealign (w[12], w[13], offset);
      w[47] = hc_bytealign (w[11], w[12], offset);
      w[46] = hc_bytealign (w[10], w[11], offset);
      w[45] = hc_bytealign (w[ 9], w[10], offset);
      w[44] = hc_bytealign (w[ 8], w[ 9], offset);
      w[43] = hc_bytealign (w[ 7], w[ 8], offset);
      w[42] = hc_bytealign (w[ 6], w[ 7], offset);
      w[41] = hc_bytealign (w[ 5], w[ 6], offset);
      w[40] = hc_bytealign (w[ 4], w[ 5], offset);
      w[39] = hc_bytealign (w[ 3], w[ 4], offset);
      w[38] = hc_bytealign (w[ 2], w[ 3], offset);
      w[37] = hc_bytealign (w[ 1], w[ 2], offset);
      w[36] = hc_bytealign (w[ 0], w[ 1], offset);
      w[35] = hc_bytealign (    0, w[ 0], offset);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_bytealign (w[26], w[27], offset);
      w[62] = hc_bytealign (w[25], w[26], offset);
      w[61] = hc_bytealign (w[24], w[25], offset);
      w[60] = hc_bytealign (w[23], w[24], offset);
      w[59] = hc_bytealign (w[22], w[23], offset);
      w[58] = hc_bytealign (w[21], w[22], offset);
      w[57] = hc_bytealign (w[20], w[21], offset);
      w[56] = hc_bytealign (w[19], w[20], offset);
      w[55] = hc_bytealign (w[18], w[19], offset);
      w[54] = hc_bytealign (w[17], w[18], offset);
      w[53] = hc_bytealign (w[16], w[17], offset);
      w[52] = hc_bytealign (w[15], w[16], offset);
      w[51] = hc_bytealign (w[14], w[15], offset);
      w[50] = hc_bytealign (w[13], w[14], offset);
      w[49] = hc_bytealign (w[12], w[13], offset);
      w[48] = hc_bytealign (w[11], w[12], offset);
      w[47] = hc_bytealign (w[10], w[11], offset);
      w[46] = hc_bytealign (w[ 9], w[10], offset);
      w[45] = hc_bytealign (w[ 8], w[ 9], offset);
      w[44] = hc_bytealign (w[ 7], w[ 8], offset);
      w[43] = hc_bytealign (w[ 6], w[ 7], offset);
      w[42] = hc_bytealign (w[ 5], w[ 6], offset);
      w[41] = hc_bytealign (w[ 4], w[ 5], offset);
      w[40] = hc_bytealign (w[ 3], w[ 4], offset);
      w[39] = hc_bytealign (w[ 2], w[ 3], offset);
      w[38] = hc_bytealign (w[ 1], w[ 2], offset);
      w[37] = hc_bytealign (w[ 0], w[ 1], offset);
      w[36] = hc_bytealign (    0, w[ 0], offset);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_bytealign (w[25], w[26], offset);
      w[62] = hc_bytealign (w[24], w[25], offset);
      w[61] = hc_bytealign (w[23], w[24], offset);
      w[60] = hc_bytealign (w[22], w[23], offset);
      w[59] = hc_bytealign (w[21], w[22], offset);
      w[58] = hc_bytealign (w[20], w[21], offset);
      w[57] = hc_bytealign (w[19], w[20], offset);
      w[56] = hc_bytealign (w[18], w[19], offset);
      w[55] = hc_bytealign (w[17], w[18], offset);
      w[54] = hc_bytealign (w[16], w[17], offset);
      w[53] = hc_bytealign (w[15], w[16], offset);
      w[52] = hc_bytealign (w[14], w[15], offset);
      w[51] = hc_bytealign (w[13], w[14], offset);
      w[50] = hc_bytealign (w[12], w[13], offset);
      w[49] = hc_bytealign (w[11], w[12], offset);
      w[48] = hc_bytealign (w[10], w[11], offset);
      w[47] = hc_bytealign (w[ 9], w[10], offset);
      w[46] = hc_bytealign (w[ 8], w[ 9], offset);
      w[45] = hc_bytealign (w[ 7], w[ 8], offset);
      w[44] = hc_bytealign (w[ 6], w[ 7], offset);
      w[43] = hc_bytealign (w[ 5], w[ 6], offset);
      w[42] = hc_bytealign (w[ 4], w[ 5], offset);
      w[41] = hc_bytealign (w[ 3], w[ 4], offset);
      w[40] = hc_bytealign (w[ 2], w[ 3], offset);
      w[39] = hc_bytealign (w[ 1], w[ 2], offset);
      w[38] = hc_bytealign (w[ 0], w[ 1], offset);
      w[37] = hc_bytealign (    0, w[ 0], offset);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_bytealign (w[24], w[25], offset);
      w[62] = hc_bytealign (w[23], w[24], offset);
      w[61] = hc_bytealign (w[22], w[23], offset);
      w[60] = hc_bytealign (w[21], w[22], offset);
      w[59] = hc_bytealign (w[20], w[21], offset);
      w[58] = hc_bytealign (w[19], w[20], offset);
      w[57] = hc_bytealign (w[18], w[19], offset);
      w[56] = hc_bytealign (w[17], w[18], offset);
      w[55] = hc_bytealign (w[16], w[17], offset);
      w[54] = hc_bytealign (w[15], w[16], offset);
      w[53] = hc_bytealign (w[14], w[15], offset);
      w[52] = hc_bytealign (w[13], w[14], offset);
      w[51] = hc_bytealign (w[12], w[13], offset);
      w[50] = hc_bytealign (w[11], w[12], offset);
      w[49] = hc_bytealign (w[10], w[11], offset);
      w[48] = hc_bytealign (w[ 9], w[10], offset);
      w[47] = hc_bytealign (w[ 8], w[ 9], offset);
      w[46] = hc_bytealign (w[ 7], w[ 8], offset);
      w[45] = hc_bytealign (w[ 6], w[ 7], offset);
      w[44] = hc_bytealign (w[ 5], w[ 6], offset);
      w[43] = hc_bytealign (w[ 4], w[ 5], offset);
      w[42] = hc_bytealign (w[ 3], w[ 4], offset);
      w[41] = hc_bytealign (w[ 2], w[ 3], offset);
      w[40] = hc_bytealign (w[ 1], w[ 2], offset);
      w[39] = hc_bytealign (w[ 0], w[ 1], offset);
      w[38] = hc_bytealign (    0, w[ 0], offset);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_bytealign (w[23], w[24], offset);
      w[62] = hc_bytealign (w[22], w[23], offset);
      w[61] = hc_bytealign (w[21], w[22], offset);
      w[60] = hc_bytealign (w[20], w[21], offset);
      w[59] = hc_bytealign (w[19], w[20], offset);
      w[58] = hc_bytealign (w[18], w[19], offset);
      w[57] = hc_bytealign (w[17], w[18], offset);
      w[56] = hc_bytealign (w[16], w[17], offset);
      w[55] = hc_bytealign (w[15], w[16], offset);
      w[54] = hc_bytealign (w[14], w[15], offset);
      w[53] = hc_bytealign (w[13], w[14], offset);
      w[52] = hc_bytealign (w[12], w[13], offset);
      w[51] = hc_bytealign (w[11], w[12], offset);
      w[50] = hc_bytealign (w[10], w[11], offset);
      w[49] = hc_bytealign (w[ 9], w[10], offset);
      w[48] = hc_bytealign (w[ 8], w[ 9], offset);
      w[47] = hc_bytealign (w[ 7], w[ 8], offset);
      w[46] = hc_bytealign (w[ 6], w[ 7], offset);
      w[45] = hc_bytealign (w[ 5], w[ 6], offset);
      w[44] = hc_bytealign (w[ 4], w[ 5], offset);
      w[43] = hc_bytealign (w[ 3], w[ 4], offset);
      w[42] = hc_bytealign (w[ 2], w[ 3], offset);
      w[41] = hc_bytealign (w[ 1], w[ 2], offset);
      w[40] = hc_bytealign (w[ 0], w[ 1], offset);
      w[39] = hc_bytealign (    0, w[ 0], offset);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_bytealign (w[22], w[23], offset);
      w[62] = hc_bytealign (w[21], w[22], offset);
      w[61] = hc_bytealign (w[20], w[21], offset);
      w[60] = hc_bytealign (w[19], w[20], offset);
      w[59] = hc_bytealign (w[18], w[19], offset);
      w[58] = hc_bytealign (w[17], w[18], offset);
      w[57] = hc_bytealign (w[16], w[17], offset);
      w[56] = hc_bytealign (w[15], w[16], offset);
      w[55] = hc_bytealign (w[14], w[15], offset);
      w[54] = hc_bytealign (w[13], w[14], offset);
      w[53] = hc_bytealign (w[12], w[13], offset);
      w[52] = hc_bytealign (w[11], w[12], offset);
      w[51] = hc_bytealign (w[10], w[11], offset);
      w[50] = hc_bytealign (w[ 9], w[10], offset);
      w[49] = hc_bytealign (w[ 8], w[ 9], offset);
      w[48] = hc_bytealign (w[ 7], w[ 8], offset);
      w[47] = hc_bytealign (w[ 6], w[ 7], offset);
      w[46] = hc_bytealign (w[ 5], w[ 6], offset);
      w[45] = hc_bytealign (w[ 4], w[ 5], offset);
      w[44] = hc_bytealign (w[ 3], w[ 4], offset);
      w[43] = hc_bytealign (w[ 2], w[ 3], offset);
      w[42] = hc_bytealign (w[ 1], w[ 2], offset);
      w[41] = hc_bytealign (w[ 0], w[ 1], offset);
      w[40] = hc_bytealign (    0, w[ 0], offset);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_bytealign (w[21], w[22], offset);
      w[62] = hc_bytealign (w[20], w[21], offset);
      w[61] = hc_bytealign (w[19], w[20], offset);
      w[60] = hc_bytealign (w[18], w[19], offset);
      w[59] = hc_bytealign (w[17], w[18], offset);
      w[58] = hc_bytealign (w[16], w[17], offset);
      w[57] = hc_bytealign (w[15], w[16], offset);
      w[56] = hc_bytealign (w[14], w[15], offset);
      w[55] = hc_bytealign (w[13], w[14], offset);
      w[54] = hc_bytealign (w[12], w[13], offset);
      w[53] = hc_bytealign (w[11], w[12], offset);
      w[52] = hc_bytealign (w[10], w[11], offset);
      w[51] = hc_bytealign (w[ 9], w[10], offset);
      w[50] = hc_bytealign (w[ 8], w[ 9], offset);
      w[49] = hc_bytealign (w[ 7], w[ 8], offset);
      w[48] = hc_bytealign (w[ 6], w[ 7], offset);
      w[47] = hc_bytealign (w[ 5], w[ 6], offset);
      w[46] = hc_bytealign (w[ 4], w[ 5], offset);
      w[45] = hc_bytealign (w[ 3], w[ 4], offset);
      w[44] = hc_bytealign (w[ 2], w[ 3], offset);
      w[43] = hc_bytealign (w[ 1], w[ 2], offset);
      w[42] = hc_bytealign (w[ 0], w[ 1], offset);
      w[41] = hc_bytealign (    0, w[ 0], offset);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_bytealign (w[20], w[21], offset);
      w[62] = hc_bytealign (w[19], w[20], offset);
      w[61] = hc_bytealign (w[18], w[19], offset);
      w[60] = hc_bytealign (w[17], w[18], offset);
      w[59] = hc_bytealign (w[16], w[17], offset);
      w[58] = hc_bytealign (w[15], w[16], offset);
      w[57] = hc_bytealign (w[14], w[15], offset);
      w[56] = hc_bytealign (w[13], w[14], offset);
      w[55] = hc_bytealign (w[12], w[13], offset);
      w[54] = hc_bytealign (w[11], w[12], offset);
      w[53] = hc_bytealign (w[10], w[11], offset);
      w[52] = hc_bytealign (w[ 9], w[10], offset);
      w[51] = hc_bytealign (w[ 8], w[ 9], offset);
      w[50] = hc_bytealign (w[ 7], w[ 8], offset);
      w[49] = hc_bytealign (w[ 6], w[ 7], offset);
      w[48] = hc_bytealign (w[ 5], w[ 6], offset);
      w[47] = hc_bytealign (w[ 4], w[ 5], offset);
      w[46] = hc_bytealign (w[ 3], w[ 4], offset);
      w[45] = hc_bytealign (w[ 2], w[ 3], offset);
      w[44] = hc_bytealign (w[ 1], w[ 2], offset);
      w[43] = hc_bytealign (w[ 0], w[ 1], offset);
      w[42] = hc_bytealign (    0, w[ 0], offset);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_bytealign (w[19], w[20], offset);
      w[62] = hc_bytealign (w[18], w[19], offset);
      w[61] = hc_bytealign (w[17], w[18], offset);
      w[60] = hc_bytealign (w[16], w[17], offset);
      w[59] = hc_bytealign (w[15], w[16], offset);
      w[58] = hc_bytealign (w[14], w[15], offset);
      w[57] = hc_bytealign (w[13], w[14], offset);
      w[56] = hc_bytealign (w[12], w[13], offset);
      w[55] = hc_bytealign (w[11], w[12], offset);
      w[54] = hc_bytealign (w[10], w[11], offset);
      w[53] = hc_bytealign (w[ 9], w[10], offset);
      w[52] = hc_bytealign (w[ 8], w[ 9], offset);
      w[51] = hc_bytealign (w[ 7], w[ 8], offset);
      w[50] = hc_bytealign (w[ 6], w[ 7], offset);
      w[49] = hc_bytealign (w[ 5], w[ 6], offset);
      w[48] = hc_bytealign (w[ 4], w[ 5], offset);
      w[47] = hc_bytealign (w[ 3], w[ 4], offset);
      w[46] = hc_bytealign (w[ 2], w[ 3], offset);
      w[45] = hc_bytealign (w[ 1], w[ 2], offset);
      w[44] = hc_bytealign (w[ 0], w[ 1], offset);
      w[43] = hc_bytealign (    0, w[ 0], offset);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_bytealign (w[18], w[19], offset);
      w[62] = hc_bytealign (w[17], w[18], offset);
      w[61] = hc_bytealign (w[16], w[17], offset);
      w[60] = hc_bytealign (w[15], w[16], offset);
      w[59] = hc_bytealign (w[14], w[15], offset);
      w[58] = hc_bytealign (w[13], w[14], offset);
      w[57] = hc_bytealign (w[12], w[13], offset);
      w[56] = hc_bytealign (w[11], w[12], offset);
      w[55] = hc_bytealign (w[10], w[11], offset);
      w[54] = hc_bytealign (w[ 9], w[10], offset);
      w[53] = hc_bytealign (w[ 8], w[ 9], offset);
      w[52] = hc_bytealign (w[ 7], w[ 8], offset);
      w[51] = hc_bytealign (w[ 6], w[ 7], offset);
      w[50] = hc_bytealign (w[ 5], w[ 6], offset);
      w[49] = hc_bytealign (w[ 4], w[ 5], offset);
      w[48] = hc_bytealign (w[ 3], w[ 4], offset);
      w[47] = hc_bytealign (w[ 2], w[ 3], offset);
      w[46] = hc_bytealign (w[ 1], w[ 2], offset);
      w[45] = hc_bytealign (w[ 0], w[ 1], offset);
      w[44] = hc_bytealign (    0, w[ 0], offset);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_bytealign (w[17], w[18], offset);
      w[62] = hc_bytealign (w[16], w[17], offset);
      w[61] = hc_bytealign (w[15], w[16], offset);
      w[60] = hc_bytealign (w[14], w[15], offset);
      w[59] = hc_bytealign (w[13], w[14], offset);
      w[58] = hc_bytealign (w[12], w[13], offset);
      w[57] = hc_bytealign (w[11], w[12], offset);
      w[56] = hc_bytealign (w[10], w[11], offset);
      w[55] = hc_bytealign (w[ 9], w[10], offset);
      w[54] = hc_bytealign (w[ 8], w[ 9], offset);
      w[53] = hc_bytealign (w[ 7], w[ 8], offset);
      w[52] = hc_bytealign (w[ 6], w[ 7], offset);
      w[51] = hc_bytealign (w[ 5], w[ 6], offset);
      w[50] = hc_bytealign (w[ 4], w[ 5], offset);
      w[49] = hc_bytealign (w[ 3], w[ 4], offset);
      w[48] = hc_bytealign (w[ 2], w[ 3], offset);
      w[47] = hc_bytealign (w[ 1], w[ 2], offset);
      w[46] = hc_bytealign (w[ 0], w[ 1], offset);
      w[45] = hc_bytealign (    0, w[ 0], offset);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_bytealign (w[16], w[17], offset);
      w[62] = hc_bytealign (w[15], w[16], offset);
      w[61] = hc_bytealign (w[14], w[15], offset);
      w[60] = hc_bytealign (w[13], w[14], offset);
      w[59] = hc_bytealign (w[12], w[13], offset);
      w[58] = hc_bytealign (w[11], w[12], offset);
      w[57] = hc_bytealign (w[10], w[11], offset);
      w[56] = hc_bytealign (w[ 9], w[10], offset);
      w[55] = hc_bytealign (w[ 8], w[ 9], offset);
      w[54] = hc_bytealign (w[ 7], w[ 8], offset);
      w[53] = hc_bytealign (w[ 6], w[ 7], offset);
      w[52] = hc_bytealign (w[ 5], w[ 6], offset);
      w[51] = hc_bytealign (w[ 4], w[ 5], offset);
      w[50] = hc_bytealign (w[ 3], w[ 4], offset);
      w[49] = hc_bytealign (w[ 2], w[ 3], offset);
      w[48] = hc_bytealign (w[ 1], w[ 2], offset);
      w[47] = hc_bytealign (w[ 0], w[ 1], offset);
      w[46] = hc_bytealign (    0, w[ 0], offset);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_bytealign (w[15], w[16], offset);
      w[62] = hc_bytealign (w[14], w[15], offset);
      w[61] = hc_bytealign (w[13], w[14], offset);
      w[60] = hc_bytealign (w[12], w[13], offset);
      w[59] = hc_bytealign (w[11], w[12], offset);
      w[58] = hc_bytealign (w[10], w[11], offset);
      w[57] = hc_bytealign (w[ 9], w[10], offset);
      w[56] = hc_bytealign (w[ 8], w[ 9], offset);
      w[55] = hc_bytealign (w[ 7], w[ 8], offset);
      w[54] = hc_bytealign (w[ 6], w[ 7], offset);
      w[53] = hc_bytealign (w[ 5], w[ 6], offset);
      w[52] = hc_bytealign (w[ 4], w[ 5], offset);
      w[51] = hc_bytealign (w[ 3], w[ 4], offset);
      w[50] = hc_bytealign (w[ 2], w[ 3], offset);
      w[49] = hc_bytealign (w[ 1], w[ 2], offset);
      w[48] = hc_bytealign (w[ 0], w[ 1], offset);
      w[47] = hc_bytealign (    0, w[ 0], offset);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_bytealign (w[14], w[15], offset);
      w[62] = hc_bytealign (w[13], w[14], offset);
      w[61] = hc_bytealign (w[12], w[13], offset);
      w[60] = hc_bytealign (w[11], w[12], offset);
      w[59] = hc_bytealign (w[10], w[11], offset);
      w[58] = hc_bytealign (w[ 9], w[10], offset);
      w[57] = hc_bytealign (w[ 8], w[ 9], offset);
      w[56] = hc_bytealign (w[ 7], w[ 8], offset);
      w[55] = hc_bytealign (w[ 6], w[ 7], offset);
      w[54] = hc_bytealign (w[ 5], w[ 6], offset);
      w[53] = hc_bytealign (w[ 4], w[ 5], offset);
      w[52] = hc_bytealign (w[ 3], w[ 4], offset);
      w[51] = hc_bytealign (w[ 2], w[ 3], offset);
      w[50] = hc_bytealign (w[ 1], w[ 2], offset);
      w[49] = hc_bytealign (w[ 0], w[ 1], offset);
      w[48] = hc_bytealign (    0, w[ 0], offset);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_bytealign (w[13], w[14], offset);
      w[62] = hc_bytealign (w[12], w[13], offset);
      w[61] = hc_bytealign (w[11], w[12], offset);
      w[60] = hc_bytealign (w[10], w[11], offset);
      w[59] = hc_bytealign (w[ 9], w[10], offset);
      w[58] = hc_bytealign (w[ 8], w[ 9], offset);
      w[57] = hc_bytealign (w[ 7], w[ 8], offset);
      w[56] = hc_bytealign (w[ 6], w[ 7], offset);
      w[55] = hc_bytealign (w[ 5], w[ 6], offset);
      w[54] = hc_bytealign (w[ 4], w[ 5], offset);
      w[53] = hc_bytealign (w[ 3], w[ 4], offset);
      w[52] = hc_bytealign (w[ 2], w[ 3], offset);
      w[51] = hc_bytealign (w[ 1], w[ 2], offset);
      w[50] = hc_bytealign (w[ 0], w[ 1], offset);
      w[49] = hc_bytealign (    0, w[ 0], offset);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_bytealign (w[12], w[13], offset);
      w[62] = hc_bytealign (w[11], w[12], offset);
      w[61] = hc_bytealign (w[10], w[11], offset);
      w[60] = hc_bytealign (w[ 9], w[10], offset);
      w[59] = hc_bytealign (w[ 8], w[ 9], offset);
      w[58] = hc_bytealign (w[ 7], w[ 8], offset);
      w[57] = hc_bytealign (w[ 6], w[ 7], offset);
      w[56] = hc_bytealign (w[ 5], w[ 6], offset);
      w[55] = hc_bytealign (w[ 4], w[ 5], offset);
      w[54] = hc_bytealign (w[ 3], w[ 4], offset);
      w[53] = hc_bytealign (w[ 2], w[ 3], offset);
      w[52] = hc_bytealign (w[ 1], w[ 2], offset);
      w[51] = hc_bytealign (w[ 0], w[ 1], offset);
      w[50] = hc_bytealign (    0, w[ 0], offset);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_bytealign (w[11], w[12], offset);
      w[62] = hc_bytealign (w[10], w[11], offset);
      w[61] = hc_bytealign (w[ 9], w[10], offset);
      w[60] = hc_bytealign (w[ 8], w[ 9], offset);
      w[59] = hc_bytealign (w[ 7], w[ 8], offset);
      w[58] = hc_bytealign (w[ 6], w[ 7], offset);
      w[57] = hc_bytealign (w[ 5], w[ 6], offset);
      w[56] = hc_bytealign (w[ 4], w[ 5], offset);
      w[55] = hc_bytealign (w[ 3], w[ 4], offset);
      w[54] = hc_bytealign (w[ 2], w[ 3], offset);
      w[53] = hc_bytealign (w[ 1], w[ 2], offset);
      w[52] = hc_bytealign (w[ 0], w[ 1], offset);
      w[51] = hc_bytealign (    0, w[ 0], offset);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_bytealign (w[10], w[11], offset);
      w[62] = hc_bytealign (w[ 9], w[10], offset);
      w[61] = hc_bytealign (w[ 8], w[ 9], offset);
      w[60] = hc_bytealign (w[ 7], w[ 8], offset);
      w[59] = hc_bytealign (w[ 6], w[ 7], offset);
      w[58] = hc_bytealign (w[ 5], w[ 6], offset);
      w[57] = hc_bytealign (w[ 4], w[ 5], offset);
      w[56] = hc_bytealign (w[ 3], w[ 4], offset);
      w[55] = hc_bytealign (w[ 2], w[ 3], offset);
      w[54] = hc_bytealign (w[ 1], w[ 2], offset);
      w[53] = hc_bytealign (w[ 0], w[ 1], offset);
      w[52] = hc_bytealign (    0, w[ 0], offset);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_bytealign (w[ 9], w[10], offset);
      w[62] = hc_bytealign (w[ 8], w[ 9], offset);
      w[61] = hc_bytealign (w[ 7], w[ 8], offset);
      w[60] = hc_bytealign (w[ 6], w[ 7], offset);
      w[59] = hc_bytealign (w[ 5], w[ 6], offset);
      w[58] = hc_bytealign (w[ 4], w[ 5], offset);
      w[57] = hc_bytealign (w[ 3], w[ 4], offset);
      w[56] = hc_bytealign (w[ 2], w[ 3], offset);
      w[55] = hc_bytealign (w[ 1], w[ 2], offset);
      w[54] = hc_bytealign (w[ 0], w[ 1], offset);
      w[53] = hc_bytealign (    0, w[ 0], offset);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_bytealign (w[ 8], w[ 9], offset);
      w[62] = hc_bytealign (w[ 7], w[ 8], offset);
      w[61] = hc_bytealign (w[ 6], w[ 7], offset);
      w[60] = hc_bytealign (w[ 5], w[ 6], offset);
      w[59] = hc_bytealign (w[ 4], w[ 5], offset);
      w[58] = hc_bytealign (w[ 3], w[ 4], offset);
      w[57] = hc_bytealign (w[ 2], w[ 3], offset);
      w[56] = hc_bytealign (w[ 1], w[ 2], offset);
      w[55] = hc_bytealign (w[ 0], w[ 1], offset);
      w[54] = hc_bytealign (    0, w[ 0], offset);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_bytealign (w[ 7], w[ 8], offset);
      w[62] = hc_bytealign (w[ 6], w[ 7], offset);
      w[61] = hc_bytealign (w[ 5], w[ 6], offset);
      w[60] = hc_bytealign (w[ 4], w[ 5], offset);
      w[59] = hc_bytealign (w[ 3], w[ 4], offset);
      w[58] = hc_bytealign (w[ 2], w[ 3], offset);
      w[57] = hc_bytealign (w[ 1], w[ 2], offset);
      w[56] = hc_bytealign (w[ 0], w[ 1], offset);
      w[55] = hc_bytealign (    0, w[ 0], offset);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_bytealign (w[ 6], w[ 7], offset);
      w[62] = hc_bytealign (w[ 5], w[ 6], offset);
      w[61] = hc_bytealign (w[ 4], w[ 5], offset);
      w[60] = hc_bytealign (w[ 3], w[ 4], offset);
      w[59] = hc_bytealign (w[ 2], w[ 3], offset);
      w[58] = hc_bytealign (w[ 1], w[ 2], offset);
      w[57] = hc_bytealign (w[ 0], w[ 1], offset);
      w[56] = hc_bytealign (    0, w[ 0], offset);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_bytealign (w[ 5], w[ 6], offset);
      w[62] = hc_bytealign (w[ 4], w[ 5], offset);
      w[61] = hc_bytealign (w[ 3], w[ 4], offset);
      w[60] = hc_bytealign (w[ 2], w[ 3], offset);
      w[59] = hc_bytealign (w[ 1], w[ 2], offset);
      w[58] = hc_bytealign (w[ 0], w[ 1], offset);
      w[57] = hc_bytealign (    0, w[ 0], offset);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_bytealign (w[ 4], w[ 5], offset);
      w[62] = hc_bytealign (w[ 3], w[ 4], offset);
      w[61] = hc_bytealign (w[ 2], w[ 3], offset);
      w[60] = hc_bytealign (w[ 1], w[ 2], offset);
      w[59] = hc_bytealign (w[ 0], w[ 1], offset);
      w[58] = hc_bytealign (    0, w[ 0], offset);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_bytealign (w[ 3], w[ 4], offset);
      w[62] = hc_bytealign (w[ 2], w[ 3], offset);
      w[61] = hc_bytealign (w[ 1], w[ 2], offset);
      w[60] = hc_bytealign (w[ 0], w[ 1], offset);
      w[59] = hc_bytealign (    0, w[ 0], offset);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_bytealign (w[ 2], w[ 3], offset);
      w[62] = hc_bytealign (w[ 1], w[ 2], offset);
      w[61] = hc_bytealign (w[ 0], w[ 1], offset);
      w[60] = hc_bytealign (    0, w[ 0], offset);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_bytealign (w[ 1], w[ 2], offset);
      w[62] = hc_bytealign (w[ 0], w[ 1], offset);
      w[61] = hc_bytealign (    0, w[ 0], offset);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_bytealign (w[ 0], w[ 1], offset);
      w[62] = hc_bytealign (    0, w[ 0], offset);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_bytealign (    0, w[ 0], offset);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }

  #pragma unroll
  for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]);

  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w[63] = hc_byte_perm (w[62], w[63], selector);
      w[62] = hc_byte_perm (w[61], w[62], selector);
      w[61] = hc_byte_perm (w[60], w[61], selector);
      w[60] = hc_byte_perm (w[59], w[60], selector);
      w[59] = hc_byte_perm (w[58], w[59], selector);
      w[58] = hc_byte_perm (w[57], w[58], selector);
      w[57] = hc_byte_perm (w[56], w[57], selector);
      w[56] = hc_byte_perm (w[55], w[56], selector);
      w[55] = hc_byte_perm (w[54], w[55], selector);
      w[54] = hc_byte_perm (w[53], w[54], selector);
      w[53] = hc_byte_perm (w[52], w[53], selector);
      w[52] = hc_byte_perm (w[51], w[52], selector);
      w[51] = hc_byte_perm (w[50], w[51], selector);
      w[50] = hc_byte_perm (w[49], w[50], selector);
      w[49] = hc_byte_perm (w[48], w[49], selector);
      w[48] = hc_byte_perm (w[47], w[48], selector);
      w[47] = hc_byte_perm (w[46], w[47], selector);
      w[46] = hc_byte_perm (w[45], w[46], selector);
      w[45] = hc_byte_perm (w[44], w[45], selector);
      w[44] = hc_byte_perm (w[43], w[44], selector);
      w[43] = hc_byte_perm (w[42], w[43], selector);
      w[42] = hc_byte_perm (w[41], w[42], selector);
      w[41] = hc_byte_perm (w[40], w[41], selector);
      w[40] = hc_byte_perm (w[39], w[40], selector);
      w[39] = hc_byte_perm (w[38], w[39], selector);
      w[38] = hc_byte_perm (w[37], w[38], selector);
      w[37] = hc_byte_perm (w[36], w[37], selector);
      w[36] = hc_byte_perm (w[35], w[36], selector);
      w[35] = hc_byte_perm (w[34], w[35], selector);
      w[34] = hc_byte_perm (w[33], w[34], selector);
      w[33] = hc_byte_perm (w[32], w[33], selector);
      w[32] = hc_byte_perm (w[31], w[32], selector);
      w[31] = hc_byte_perm (w[30], w[31], selector);
      w[30] = hc_byte_perm (w[29], w[30], selector);
      w[29] = hc_byte_perm (w[28], w[29], selector);
      w[28] = hc_byte_perm (w[27], w[28], selector);
      w[27] = hc_byte_perm (w[26], w[27], selector);
      w[26] = hc_byte_perm (w[25], w[26], selector);
      w[25] = hc_byte_perm (w[24], w[25], selector);
      w[24] = hc_byte_perm (w[23], w[24], selector);
      w[23] = hc_byte_perm (w[22], w[23], selector);
      w[22] = hc_byte_perm (w[21], w[22], selector);
      w[21] = hc_byte_perm (w[20], w[21], selector);
      w[20] = hc_byte_perm (w[19], w[20], selector);
      w[19] = hc_byte_perm (w[18], w[19], selector);
      w[18] = hc_byte_perm (w[17], w[18], selector);
      w[17] = hc_byte_perm (w[16], w[17], selector);
      w[16] = hc_byte_perm (w[15], w[16], selector);
      w[15] = hc_byte_perm (w[14], w[15], selector);
      w[14] = hc_byte_perm (w[13], w[14], selector);
      w[13] = hc_byte_perm (w[12], w[13], selector);
      w[12] = hc_byte_perm (w[11], w[12], selector);
      w[11] = hc_byte_perm (w[10], w[11], selector);
      w[10] = hc_byte_perm (w[ 9], w[10], selector);
      w[ 9] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[ 8] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[ 7] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 6] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 5] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 4] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 3] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 2] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 1] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 0] = hc_byte_perm (    0, w[ 0], selector);

      break;

    case  1:
      w[63] = hc_byte_perm (w[61], w[62], selector);
      w[62] = hc_byte_perm (w[60], w[61], selector);
      w[61] = hc_byte_perm (w[59], w[60], selector);
      w[60] = hc_byte_perm (w[58], w[59], selector);
      w[59] = hc_byte_perm (w[57], w[58], selector);
      w[58] = hc_byte_perm (w[56], w[57], selector);
      w[57] = hc_byte_perm (w[55], w[56], selector);
      w[56] = hc_byte_perm (w[54], w[55], selector);
      w[55] = hc_byte_perm (w[53], w[54], selector);
      w[54] = hc_byte_perm (w[52], w[53], selector);
      w[53] = hc_byte_perm (w[51], w[52], selector);
      w[52] = hc_byte_perm (w[50], w[51], selector);
      w[51] = hc_byte_perm (w[49], w[50], selector);
      w[50] = hc_byte_perm (w[48], w[49], selector);
      w[49] = hc_byte_perm (w[47], w[48], selector);
      w[48] = hc_byte_perm (w[46], w[47], selector);
      w[47] = hc_byte_perm (w[45], w[46], selector);
      w[46] = hc_byte_perm (w[44], w[45], selector);
      w[45] = hc_byte_perm (w[43], w[44], selector);
      w[44] = hc_byte_perm (w[42], w[43], selector);
      w[43] = hc_byte_perm (w[41], w[42], selector);
      w[42] = hc_byte_perm (w[40], w[41], selector);
      w[41] = hc_byte_perm (w[39], w[40], selector);
      w[40] = hc_byte_perm (w[38], w[39], selector);
      w[39] = hc_byte_perm (w[37], w[38], selector);
      w[38] = hc_byte_perm (w[36], w[37], selector);
      w[37] = hc_byte_perm (w[35], w[36], selector);
      w[36] = hc_byte_perm (w[34], w[35], selector);
      w[35] = hc_byte_perm (w[33], w[34], selector);
      w[34] = hc_byte_perm (w[32], w[33], selector);
      w[33] = hc_byte_perm (w[31], w[32], selector);
      w[32] = hc_byte_perm (w[30], w[31], selector);
      w[31] = hc_byte_perm (w[29], w[30], selector);
      w[30] = hc_byte_perm (w[28], w[29], selector);
      w[29] = hc_byte_perm (w[27], w[28], selector);
      w[28] = hc_byte_perm (w[26], w[27], selector);
      w[27] = hc_byte_perm (w[25], w[26], selector);
      w[26] = hc_byte_perm (w[24], w[25], selector);
      w[25] = hc_byte_perm (w[23], w[24], selector);
      w[24] = hc_byte_perm (w[22], w[23], selector);
      w[23] = hc_byte_perm (w[21], w[22], selector);
      w[22] = hc_byte_perm (w[20], w[21], selector);
      w[21] = hc_byte_perm (w[19], w[20], selector);
      w[20] = hc_byte_perm (w[18], w[19], selector);
      w[19] = hc_byte_perm (w[17], w[18], selector);
      w[18] = hc_byte_perm (w[16], w[17], selector);
      w[17] = hc_byte_perm (w[15], w[16], selector);
      w[16] = hc_byte_perm (w[14], w[15], selector);
      w[15] = hc_byte_perm (w[13], w[14], selector);
      w[14] = hc_byte_perm (w[12], w[13], selector);
      w[13] = hc_byte_perm (w[11], w[12], selector);
      w[12] = hc_byte_perm (w[10], w[11], selector);
      w[11] = hc_byte_perm (w[ 9], w[10], selector);
      w[10] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[ 9] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[ 8] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 7] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 6] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 5] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 4] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 3] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 2] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 1] = hc_byte_perm (    0, w[ 0], selector);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_byte_perm (w[60], w[61], selector);
      w[62] = hc_byte_perm (w[59], w[60], selector);
      w[61] = hc_byte_perm (w[58], w[59], selector);
      w[60] = hc_byte_perm (w[57], w[58], selector);
      w[59] = hc_byte_perm (w[56], w[57], selector);
      w[58] = hc_byte_perm (w[55], w[56], selector);
      w[57] = hc_byte_perm (w[54], w[55], selector);
      w[56] = hc_byte_perm (w[53], w[54], selector);
      w[55] = hc_byte_perm (w[52], w[53], selector);
      w[54] = hc_byte_perm (w[51], w[52], selector);
      w[53] = hc_byte_perm (w[50], w[51], selector);
      w[52] = hc_byte_perm (w[49], w[50], selector);
      w[51] = hc_byte_perm (w[48], w[49], selector);
      w[50] = hc_byte_perm (w[47], w[48], selector);
      w[49] = hc_byte_perm (w[46], w[47], selector);
      w[48] = hc_byte_perm (w[45], w[46], selector);
      w[47] = hc_byte_perm (w[44], w[45], selector);
      w[46] = hc_byte_perm (w[43], w[44], selector);
      w[45] = hc_byte_perm (w[42], w[43], selector);
      w[44] = hc_byte_perm (w[41], w[42], selector);
      w[43] = hc_byte_perm (w[40], w[41], selector);
      w[42] = hc_byte_perm (w[39], w[40], selector);
      w[41] = hc_byte_perm (w[38], w[39], selector);
      w[40] = hc_byte_perm (w[37], w[38], selector);
      w[39] = hc_byte_perm (w[36], w[37], selector);
      w[38] = hc_byte_perm (w[35], w[36], selector);
      w[37] = hc_byte_perm (w[34], w[35], selector);
      w[36] = hc_byte_perm (w[33], w[34], selector);
      w[35] = hc_byte_perm (w[32], w[33], selector);
      w[34] = hc_byte_perm (w[31], w[32], selector);
      w[33] = hc_byte_perm (w[30], w[31], selector);
      w[32] = hc_byte_perm (w[29], w[30], selector);
      w[31] = hc_byte_perm (w[28], w[29], selector);
      w[30] = hc_byte_perm (w[27], w[28], selector);
      w[29] = hc_byte_perm (w[26], w[27], selector);
      w[28] = hc_byte_perm (w[25], w[26], selector);
      w[27] = hc_byte_perm (w[24], w[25], selector);
      w[26] = hc_byte_perm (w[23], w[24], selector);
      w[25] = hc_byte_perm (w[22], w[23], selector);
      w[24] = hc_byte_perm (w[21], w[22], selector);
      w[23] = hc_byte_perm (w[20], w[21], selector);
      w[22] = hc_byte_perm (w[19], w[20], selector);
      w[21] = hc_byte_perm (w[18], w[19], selector);
      w[20] = hc_byte_perm (w[17], w[18], selector);
      w[19] = hc_byte_perm (w[16], w[17], selector);
      w[18] = hc_byte_perm (w[15], w[16], selector);
      w[17] = hc_byte_perm (w[14], w[15], selector);
      w[16] = hc_byte_perm (w[13], w[14], selector);
      w[15] = hc_byte_perm (w[12], w[13], selector);
      w[14] = hc_byte_perm (w[11], w[12], selector);
      w[13] = hc_byte_perm (w[10], w[11], selector);
      w[12] = hc_byte_perm (w[ 9], w[10], selector);
      w[11] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[10] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[ 9] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 8] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 7] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 6] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 5] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 4] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 3] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 2] = hc_byte_perm (    0, w[ 0], selector);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_byte_perm (w[59], w[60], selector);
      w[62] = hc_byte_perm (w[58], w[59], selector);
      w[61] = hc_byte_perm (w[57], w[58], selector);
      w[60] = hc_byte_perm (w[56], w[57], selector);
      w[59] = hc_byte_perm (w[55], w[56], selector);
      w[58] = hc_byte_perm (w[54], w[55], selector);
      w[57] = hc_byte_perm (w[53], w[54], selector);
      w[56] = hc_byte_perm (w[52], w[53], selector);
      w[55] = hc_byte_perm (w[51], w[52], selector);
      w[54] = hc_byte_perm (w[50], w[51], selector);
      w[53] = hc_byte_perm (w[49], w[50], selector);
      w[52] = hc_byte_perm (w[48], w[49], selector);
      w[51] = hc_byte_perm (w[47], w[48], selector);
      w[50] = hc_byte_perm (w[46], w[47], selector);
      w[49] = hc_byte_perm (w[45], w[46], selector);
      w[48] = hc_byte_perm (w[44], w[45], selector);
      w[47] = hc_byte_perm (w[43], w[44], selector);
      w[46] = hc_byte_perm (w[42], w[43], selector);
      w[45] = hc_byte_perm (w[41], w[42], selector);
      w[44] = hc_byte_perm (w[40], w[41], selector);
      w[43] = hc_byte_perm (w[39], w[40], selector);
      w[42] = hc_byte_perm (w[38], w[39], selector);
      w[41] = hc_byte_perm (w[37], w[38], selector);
      w[40] = hc_byte_perm (w[36], w[37], selector);
      w[39] = hc_byte_perm (w[35], w[36], selector);
      w[38] = hc_byte_perm (w[34], w[35], selector);
      w[37] = hc_byte_perm (w[33], w[34], selector);
      w[36] = hc_byte_perm (w[32], w[33], selector);
      w[35] = hc_byte_perm (w[31], w[32], selector);
      w[34] = hc_byte_perm (w[30], w[31], selector);
      w[33] = hc_byte_perm (w[29], w[30], selector);
      w[32] = hc_byte_perm (w[28], w[29], selector);
      w[31] = hc_byte_perm (w[27], w[28], selector);
      w[30] = hc_byte_perm (w[26], w[27], selector);
      w[29] = hc_byte_perm (w[25], w[26], selector);
      w[28] = hc_byte_perm (w[24], w[25], selector);
      w[27] = hc_byte_perm (w[23], w[24], selector);
      w[26] = hc_byte_perm (w[22], w[23], selector);
      w[25] = hc_byte_perm (w[21], w[22], selector);
      w[24] = hc_byte_perm (w[20], w[21], selector);
      w[23] = hc_byte_perm (w[19], w[20], selector);
      w[22] = hc_byte_perm (w[18], w[19], selector);
      w[21] = hc_byte_perm (w[17], w[18], selector);
      w[20] = hc_byte_perm (w[16], w[17], selector);
      w[19] = hc_byte_perm (w[15], w[16], selector);
      w[18] = hc_byte_perm (w[14], w[15], selector);
      w[17] = hc_byte_perm (w[13], w[14], selector);
      w[16] = hc_byte_perm (w[12], w[13], selector);
      w[15] = hc_byte_perm (w[11], w[12], selector);
      w[14] = hc_byte_perm (w[10], w[11], selector);
      w[13] = hc_byte_perm (w[ 9], w[10], selector);
      w[12] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[11] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[10] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 9] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 8] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 7] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 6] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 5] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 4] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 3] = hc_byte_perm (    0, w[ 0], selector);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_byte_perm (w[58], w[59], selector);
      w[62] = hc_byte_perm (w[57], w[58], selector);
      w[61] = hc_byte_perm (w[56], w[57], selector);
      w[60] = hc_byte_perm (w[55], w[56], selector);
      w[59] = hc_byte_perm (w[54], w[55], selector);
      w[58] = hc_byte_perm (w[53], w[54], selector);
      w[57] = hc_byte_perm (w[52], w[53], selector);
      w[56] = hc_byte_perm (w[51], w[52], selector);
      w[55] = hc_byte_perm (w[50], w[51], selector);
      w[54] = hc_byte_perm (w[49], w[50], selector);
      w[53] = hc_byte_perm (w[48], w[49], selector);
      w[52] = hc_byte_perm (w[47], w[48], selector);
      w[51] = hc_byte_perm (w[46], w[47], selector);
      w[50] = hc_byte_perm (w[45], w[46], selector);
      w[49] = hc_byte_perm (w[44], w[45], selector);
      w[48] = hc_byte_perm (w[43], w[44], selector);
      w[47] = hc_byte_perm (w[42], w[43], selector);
      w[46] = hc_byte_perm (w[41], w[42], selector);
      w[45] = hc_byte_perm (w[40], w[41], selector);
      w[44] = hc_byte_perm (w[39], w[40], selector);
      w[43] = hc_byte_perm (w[38], w[39], selector);
      w[42] = hc_byte_perm (w[37], w[38], selector);
      w[41] = hc_byte_perm (w[36], w[37], selector);
      w[40] = hc_byte_perm (w[35], w[36], selector);
      w[39] = hc_byte_perm (w[34], w[35], selector);
      w[38] = hc_byte_perm (w[33], w[34], selector);
      w[37] = hc_byte_perm (w[32], w[33], selector);
      w[36] = hc_byte_perm (w[31], w[32], selector);
      w[35] = hc_byte_perm (w[30], w[31], selector);
      w[34] = hc_byte_perm (w[29], w[30], selector);
      w[33] = hc_byte_perm (w[28], w[29], selector);
      w[32] = hc_byte_perm (w[27], w[28], selector);
      w[31] = hc_byte_perm (w[26], w[27], selector);
      w[30] = hc_byte_perm (w[25], w[26], selector);
      w[29] = hc_byte_perm (w[24], w[25], selector);
      w[28] = hc_byte_perm (w[23], w[24], selector);
      w[27] = hc_byte_perm (w[22], w[23], selector);
      w[26] = hc_byte_perm (w[21], w[22], selector);
      w[25] = hc_byte_perm (w[20], w[21], selector);
      w[24] = hc_byte_perm (w[19], w[20], selector);
      w[23] = hc_byte_perm (w[18], w[19], selector);
      w[22] = hc_byte_perm (w[17], w[18], selector);
      w[21] = hc_byte_perm (w[16], w[17], selector);
      w[20] = hc_byte_perm (w[15], w[16], selector);
      w[19] = hc_byte_perm (w[14], w[15], selector);
      w[18] = hc_byte_perm (w[13], w[14], selector);
      w[17] = hc_byte_perm (w[12], w[13], selector);
      w[16] = hc_byte_perm (w[11], w[12], selector);
      w[15] = hc_byte_perm (w[10], w[11], selector);
      w[14] = hc_byte_perm (w[ 9], w[10], selector);
      w[13] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[12] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[11] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[10] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 9] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 8] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 7] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 6] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 5] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 4] = hc_byte_perm (    0, w[ 0], selector);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_byte_perm (w[57], w[58], selector);
      w[62] = hc_byte_perm (w[56], w[57], selector);
      w[61] = hc_byte_perm (w[55], w[56], selector);
      w[60] = hc_byte_perm (w[54], w[55], selector);
      w[59] = hc_byte_perm (w[53], w[54], selector);
      w[58] = hc_byte_perm (w[52], w[53], selector);
      w[57] = hc_byte_perm (w[51], w[52], selector);
      w[56] = hc_byte_perm (w[50], w[51], selector);
      w[55] = hc_byte_perm (w[49], w[50], selector);
      w[54] = hc_byte_perm (w[48], w[49], selector);
      w[53] = hc_byte_perm (w[47], w[48], selector);
      w[52] = hc_byte_perm (w[46], w[47], selector);
      w[51] = hc_byte_perm (w[45], w[46], selector);
      w[50] = hc_byte_perm (w[44], w[45], selector);
      w[49] = hc_byte_perm (w[43], w[44], selector);
      w[48] = hc_byte_perm (w[42], w[43], selector);
      w[47] = hc_byte_perm (w[41], w[42], selector);
      w[46] = hc_byte_perm (w[40], w[41], selector);
      w[45] = hc_byte_perm (w[39], w[40], selector);
      w[44] = hc_byte_perm (w[38], w[39], selector);
      w[43] = hc_byte_perm (w[37], w[38], selector);
      w[42] = hc_byte_perm (w[36], w[37], selector);
      w[41] = hc_byte_perm (w[35], w[36], selector);
      w[40] = hc_byte_perm (w[34], w[35], selector);
      w[39] = hc_byte_perm (w[33], w[34], selector);
      w[38] = hc_byte_perm (w[32], w[33], selector);
      w[37] = hc_byte_perm (w[31], w[32], selector);
      w[36] = hc_byte_perm (w[30], w[31], selector);
      w[35] = hc_byte_perm (w[29], w[30], selector);
      w[34] = hc_byte_perm (w[28], w[29], selector);
      w[33] = hc_byte_perm (w[27], w[28], selector);
      w[32] = hc_byte_perm (w[26], w[27], selector);
      w[31] = hc_byte_perm (w[25], w[26], selector);
      w[30] = hc_byte_perm (w[24], w[25], selector);
      w[29] = hc_byte_perm (w[23], w[24], selector);
      w[28] = hc_byte_perm (w[22], w[23], selector);
      w[27] = hc_byte_perm (w[21], w[22], selector);
      w[26] = hc_byte_perm (w[20], w[21], selector);
      w[25] = hc_byte_perm (w[19], w[20], selector);
      w[24] = hc_byte_perm (w[18], w[19], selector);
      w[23] = hc_byte_perm (w[17], w[18], selector);
      w[22] = hc_byte_perm (w[16], w[17], selector);
      w[21] = hc_byte_perm (w[15], w[16], selector);
      w[20] = hc_byte_perm (w[14], w[15], selector);
      w[19] = hc_byte_perm (w[13], w[14], selector);
      w[18] = hc_byte_perm (w[12], w[13], selector);
      w[17] = hc_byte_perm (w[11], w[12], selector);
      w[16] = hc_byte_perm (w[10], w[11], selector);
      w[15] = hc_byte_perm (w[ 9], w[10], selector);
      w[14] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[13] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[12] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[11] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[10] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 9] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 8] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 7] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 6] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 5] = hc_byte_perm (    0, w[ 0], selector);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_byte_perm (w[56], w[57], selector);
      w[62] = hc_byte_perm (w[55], w[56], selector);
      w[61] = hc_byte_perm (w[54], w[55], selector);
      w[60] = hc_byte_perm (w[53], w[54], selector);
      w[59] = hc_byte_perm (w[52], w[53], selector);
      w[58] = hc_byte_perm (w[51], w[52], selector);
      w[57] = hc_byte_perm (w[50], w[51], selector);
      w[56] = hc_byte_perm (w[49], w[50], selector);
      w[55] = hc_byte_perm (w[48], w[49], selector);
      w[54] = hc_byte_perm (w[47], w[48], selector);
      w[53] = hc_byte_perm (w[46], w[47], selector);
      w[52] = hc_byte_perm (w[45], w[46], selector);
      w[51] = hc_byte_perm (w[44], w[45], selector);
      w[50] = hc_byte_perm (w[43], w[44], selector);
      w[49] = hc_byte_perm (w[42], w[43], selector);
      w[48] = hc_byte_perm (w[41], w[42], selector);
      w[47] = hc_byte_perm (w[40], w[41], selector);
      w[46] = hc_byte_perm (w[39], w[40], selector);
      w[45] = hc_byte_perm (w[38], w[39], selector);
      w[44] = hc_byte_perm (w[37], w[38], selector);
      w[43] = hc_byte_perm (w[36], w[37], selector);
      w[42] = hc_byte_perm (w[35], w[36], selector);
      w[41] = hc_byte_perm (w[34], w[35], selector);
      w[40] = hc_byte_perm (w[33], w[34], selector);
      w[39] = hc_byte_perm (w[32], w[33], selector);
      w[38] = hc_byte_perm (w[31], w[32], selector);
      w[37] = hc_byte_perm (w[30], w[31], selector);
      w[36] = hc_byte_perm (w[29], w[30], selector);
      w[35] = hc_byte_perm (w[28], w[29], selector);
      w[34] = hc_byte_perm (w[27], w[28], selector);
      w[33] = hc_byte_perm (w[26], w[27], selector);
      w[32] = hc_byte_perm (w[25], w[26], selector);
      w[31] = hc_byte_perm (w[24], w[25], selector);
      w[30] = hc_byte_perm (w[23], w[24], selector);
      w[29] = hc_byte_perm (w[22], w[23], selector);
      w[28] = hc_byte_perm (w[21], w[22], selector);
      w[27] = hc_byte_perm (w[20], w[21], selector);
      w[26] = hc_byte_perm (w[19], w[20], selector);
      w[25] = hc_byte_perm (w[18], w[19], selector);
      w[24] = hc_byte_perm (w[17], w[18], selector);
      w[23] = hc_byte_perm (w[16], w[17], selector);
      w[22] = hc_byte_perm (w[15], w[16], selector);
      w[21] = hc_byte_perm (w[14], w[15], selector);
      w[20] = hc_byte_perm (w[13], w[14], selector);
      w[19] = hc_byte_perm (w[12], w[13], selector);
      w[18] = hc_byte_perm (w[11], w[12], selector);
      w[17] = hc_byte_perm (w[10], w[11], selector);
      w[16] = hc_byte_perm (w[ 9], w[10], selector);
      w[15] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[14] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[13] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[12] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[11] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[10] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 9] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 8] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 7] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 6] = hc_byte_perm (    0, w[ 0], selector);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_byte_perm (w[55], w[56], selector);
      w[62] = hc_byte_perm (w[54], w[55], selector);
      w[61] = hc_byte_perm (w[53], w[54], selector);
      w[60] = hc_byte_perm (w[52], w[53], selector);
      w[59] = hc_byte_perm (w[51], w[52], selector);
      w[58] = hc_byte_perm (w[50], w[51], selector);
      w[57] = hc_byte_perm (w[49], w[50], selector);
      w[56] = hc_byte_perm (w[48], w[49], selector);
      w[55] = hc_byte_perm (w[47], w[48], selector);
      w[54] = hc_byte_perm (w[46], w[47], selector);
      w[53] = hc_byte_perm (w[45], w[46], selector);
      w[52] = hc_byte_perm (w[44], w[45], selector);
      w[51] = hc_byte_perm (w[43], w[44], selector);
      w[50] = hc_byte_perm (w[42], w[43], selector);
      w[49] = hc_byte_perm (w[41], w[42], selector);
      w[48] = hc_byte_perm (w[40], w[41], selector);
      w[47] = hc_byte_perm (w[39], w[40], selector);
      w[46] = hc_byte_perm (w[38], w[39], selector);
      w[45] = hc_byte_perm (w[37], w[38], selector);
      w[44] = hc_byte_perm (w[36], w[37], selector);
      w[43] = hc_byte_perm (w[35], w[36], selector);
      w[42] = hc_byte_perm (w[34], w[35], selector);
      w[41] = hc_byte_perm (w[33], w[34], selector);
      w[40] = hc_byte_perm (w[32], w[33], selector);
      w[39] = hc_byte_perm (w[31], w[32], selector);
      w[38] = hc_byte_perm (w[30], w[31], selector);
      w[37] = hc_byte_perm (w[29], w[30], selector);
      w[36] = hc_byte_perm (w[28], w[29], selector);
      w[35] = hc_byte_perm (w[27], w[28], selector);
      w[34] = hc_byte_perm (w[26], w[27], selector);
      w[33] = hc_byte_perm (w[25], w[26], selector);
      w[32] = hc_byte_perm (w[24], w[25], selector);
      w[31] = hc_byte_perm (w[23], w[24], selector);
      w[30] = hc_byte_perm (w[22], w[23], selector);
      w[29] = hc_byte_perm (w[21], w[22], selector);
      w[28] = hc_byte_perm (w[20], w[21], selector);
      w[27] = hc_byte_perm (w[19], w[20], selector);
      w[26] = hc_byte_perm (w[18], w[19], selector);
      w[25] = hc_byte_perm (w[17], w[18], selector);
      w[24] = hc_byte_perm (w[16], w[17], selector);
      w[23] = hc_byte_perm (w[15], w[16], selector);
      w[22] = hc_byte_perm (w[14], w[15], selector);
      w[21] = hc_byte_perm (w[13], w[14], selector);
      w[20] = hc_byte_perm (w[12], w[13], selector);
      w[19] = hc_byte_perm (w[11], w[12], selector);
      w[18] = hc_byte_perm (w[10], w[11], selector);
      w[17] = hc_byte_perm (w[ 9], w[10], selector);
      w[16] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[15] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[14] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[13] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[12] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[11] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[10] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 9] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 8] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 7] = hc_byte_perm (    0, w[ 0], selector);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_byte_perm (w[54], w[55], selector);
      w[62] = hc_byte_perm (w[53], w[54], selector);
      w[61] = hc_byte_perm (w[52], w[53], selector);
      w[60] = hc_byte_perm (w[51], w[52], selector);
      w[59] = hc_byte_perm (w[50], w[51], selector);
      w[58] = hc_byte_perm (w[49], w[50], selector);
      w[57] = hc_byte_perm (w[48], w[49], selector);
      w[56] = hc_byte_perm (w[47], w[48], selector);
      w[55] = hc_byte_perm (w[46], w[47], selector);
      w[54] = hc_byte_perm (w[45], w[46], selector);
      w[53] = hc_byte_perm (w[44], w[45], selector);
      w[52] = hc_byte_perm (w[43], w[44], selector);
      w[51] = hc_byte_perm (w[42], w[43], selector);
      w[50] = hc_byte_perm (w[41], w[42], selector);
      w[49] = hc_byte_perm (w[40], w[41], selector);
      w[48] = hc_byte_perm (w[39], w[40], selector);
      w[47] = hc_byte_perm (w[38], w[39], selector);
      w[46] = hc_byte_perm (w[37], w[38], selector);
      w[45] = hc_byte_perm (w[36], w[37], selector);
      w[44] = hc_byte_perm (w[35], w[36], selector);
      w[43] = hc_byte_perm (w[34], w[35], selector);
      w[42] = hc_byte_perm (w[33], w[34], selector);
      w[41] = hc_byte_perm (w[32], w[33], selector);
      w[40] = hc_byte_perm (w[31], w[32], selector);
      w[39] = hc_byte_perm (w[30], w[31], selector);
      w[38] = hc_byte_perm (w[29], w[30], selector);
      w[37] = hc_byte_perm (w[28], w[29], selector);
      w[36] = hc_byte_perm (w[27], w[28], selector);
      w[35] = hc_byte_perm (w[26], w[27], selector);
      w[34] = hc_byte_perm (w[25], w[26], selector);
      w[33] = hc_byte_perm (w[24], w[25], selector);
      w[32] = hc_byte_perm (w[23], w[24], selector);
      w[31] = hc_byte_perm (w[22], w[23], selector);
      w[30] = hc_byte_perm (w[21], w[22], selector);
      w[29] = hc_byte_perm (w[20], w[21], selector);
      w[28] = hc_byte_perm (w[19], w[20], selector);
      w[27] = hc_byte_perm (w[18], w[19], selector);
      w[26] = hc_byte_perm (w[17], w[18], selector);
      w[25] = hc_byte_perm (w[16], w[17], selector);
      w[24] = hc_byte_perm (w[15], w[16], selector);
      w[23] = hc_byte_perm (w[14], w[15], selector);
      w[22] = hc_byte_perm (w[13], w[14], selector);
      w[21] = hc_byte_perm (w[12], w[13], selector);
      w[20] = hc_byte_perm (w[11], w[12], selector);
      w[19] = hc_byte_perm (w[10], w[11], selector);
      w[18] = hc_byte_perm (w[ 9], w[10], selector);
      w[17] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[16] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[15] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[14] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[13] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[12] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[11] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[10] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 9] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 8] = hc_byte_perm (    0, w[ 0], selector);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_byte_perm (w[53], w[54], selector);
      w[62] = hc_byte_perm (w[52], w[53], selector);
      w[61] = hc_byte_perm (w[51], w[52], selector);
      w[60] = hc_byte_perm (w[50], w[51], selector);
      w[59] = hc_byte_perm (w[49], w[50], selector);
      w[58] = hc_byte_perm (w[48], w[49], selector);
      w[57] = hc_byte_perm (w[47], w[48], selector);
      w[56] = hc_byte_perm (w[46], w[47], selector);
      w[55] = hc_byte_perm (w[45], w[46], selector);
      w[54] = hc_byte_perm (w[44], w[45], selector);
      w[53] = hc_byte_perm (w[43], w[44], selector);
      w[52] = hc_byte_perm (w[42], w[43], selector);
      w[51] = hc_byte_perm (w[41], w[42], selector);
      w[50] = hc_byte_perm (w[40], w[41], selector);
      w[49] = hc_byte_perm (w[39], w[40], selector);
      w[48] = hc_byte_perm (w[38], w[39], selector);
      w[47] = hc_byte_perm (w[37], w[38], selector);
      w[46] = hc_byte_perm (w[36], w[37], selector);
      w[45] = hc_byte_perm (w[35], w[36], selector);
      w[44] = hc_byte_perm (w[34], w[35], selector);
      w[43] = hc_byte_perm (w[33], w[34], selector);
      w[42] = hc_byte_perm (w[32], w[33], selector);
      w[41] = hc_byte_perm (w[31], w[32], selector);
      w[40] = hc_byte_perm (w[30], w[31], selector);
      w[39] = hc_byte_perm (w[29], w[30], selector);
      w[38] = hc_byte_perm (w[28], w[29], selector);
      w[37] = hc_byte_perm (w[27], w[28], selector);
      w[36] = hc_byte_perm (w[26], w[27], selector);
      w[35] = hc_byte_perm (w[25], w[26], selector);
      w[34] = hc_byte_perm (w[24], w[25], selector);
      w[33] = hc_byte_perm (w[23], w[24], selector);
      w[32] = hc_byte_perm (w[22], w[23], selector);
      w[31] = hc_byte_perm (w[21], w[22], selector);
      w[30] = hc_byte_perm (w[20], w[21], selector);
      w[29] = hc_byte_perm (w[19], w[20], selector);
      w[28] = hc_byte_perm (w[18], w[19], selector);
      w[27] = hc_byte_perm (w[17], w[18], selector);
      w[26] = hc_byte_perm (w[16], w[17], selector);
      w[25] = hc_byte_perm (w[15], w[16], selector);
      w[24] = hc_byte_perm (w[14], w[15], selector);
      w[23] = hc_byte_perm (w[13], w[14], selector);
      w[22] = hc_byte_perm (w[12], w[13], selector);
      w[21] = hc_byte_perm (w[11], w[12], selector);
      w[20] = hc_byte_perm (w[10], w[11], selector);
      w[19] = hc_byte_perm (w[ 9], w[10], selector);
      w[18] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[17] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[16] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[15] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[14] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[13] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[12] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[11] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[10] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 9] = hc_byte_perm (    0, w[ 0], selector);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_byte_perm (w[52], w[53], selector);
      w[62] = hc_byte_perm (w[51], w[52], selector);
      w[61] = hc_byte_perm (w[50], w[51], selector);
      w[60] = hc_byte_perm (w[49], w[50], selector);
      w[59] = hc_byte_perm (w[48], w[49], selector);
      w[58] = hc_byte_perm (w[47], w[48], selector);
      w[57] = hc_byte_perm (w[46], w[47], selector);
      w[56] = hc_byte_perm (w[45], w[46], selector);
      w[55] = hc_byte_perm (w[44], w[45], selector);
      w[54] = hc_byte_perm (w[43], w[44], selector);
      w[53] = hc_byte_perm (w[42], w[43], selector);
      w[52] = hc_byte_perm (w[41], w[42], selector);
      w[51] = hc_byte_perm (w[40], w[41], selector);
      w[50] = hc_byte_perm (w[39], w[40], selector);
      w[49] = hc_byte_perm (w[38], w[39], selector);
      w[48] = hc_byte_perm (w[37], w[38], selector);
      w[47] = hc_byte_perm (w[36], w[37], selector);
      w[46] = hc_byte_perm (w[35], w[36], selector);
      w[45] = hc_byte_perm (w[34], w[35], selector);
      w[44] = hc_byte_perm (w[33], w[34], selector);
      w[43] = hc_byte_perm (w[32], w[33], selector);
      w[42] = hc_byte_perm (w[31], w[32], selector);
      w[41] = hc_byte_perm (w[30], w[31], selector);
      w[40] = hc_byte_perm (w[29], w[30], selector);
      w[39] = hc_byte_perm (w[28], w[29], selector);
      w[38] = hc_byte_perm (w[27], w[28], selector);
      w[37] = hc_byte_perm (w[26], w[27], selector);
      w[36] = hc_byte_perm (w[25], w[26], selector);
      w[35] = hc_byte_perm (w[24], w[25], selector);
      w[34] = hc_byte_perm (w[23], w[24], selector);
      w[33] = hc_byte_perm (w[22], w[23], selector);
      w[32] = hc_byte_perm (w[21], w[22], selector);
      w[31] = hc_byte_perm (w[20], w[21], selector);
      w[30] = hc_byte_perm (w[19], w[20], selector);
      w[29] = hc_byte_perm (w[18], w[19], selector);
      w[28] = hc_byte_perm (w[17], w[18], selector);
      w[27] = hc_byte_perm (w[16], w[17], selector);
      w[26] = hc_byte_perm (w[15], w[16], selector);
      w[25] = hc_byte_perm (w[14], w[15], selector);
      w[24] = hc_byte_perm (w[13], w[14], selector);
      w[23] = hc_byte_perm (w[12], w[13], selector);
      w[22] = hc_byte_perm (w[11], w[12], selector);
      w[21] = hc_byte_perm (w[10], w[11], selector);
      w[20] = hc_byte_perm (w[ 9], w[10], selector);
      w[19] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[18] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[17] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[16] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[15] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[14] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[13] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[12] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[11] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[10] = hc_byte_perm (    0, w[ 0], selector);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_byte_perm (w[51], w[52], selector);
      w[62] = hc_byte_perm (w[50], w[51], selector);
      w[61] = hc_byte_perm (w[49], w[50], selector);
      w[60] = hc_byte_perm (w[48], w[49], selector);
      w[59] = hc_byte_perm (w[47], w[48], selector);
      w[58] = hc_byte_perm (w[46], w[47], selector);
      w[57] = hc_byte_perm (w[45], w[46], selector);
      w[56] = hc_byte_perm (w[44], w[45], selector);
      w[55] = hc_byte_perm (w[43], w[44], selector);
      w[54] = hc_byte_perm (w[42], w[43], selector);
      w[53] = hc_byte_perm (w[41], w[42], selector);
      w[52] = hc_byte_perm (w[40], w[41], selector);
      w[51] = hc_byte_perm (w[39], w[40], selector);
      w[50] = hc_byte_perm (w[38], w[39], selector);
      w[49] = hc_byte_perm (w[37], w[38], selector);
      w[48] = hc_byte_perm (w[36], w[37], selector);
      w[47] = hc_byte_perm (w[35], w[36], selector);
      w[46] = hc_byte_perm (w[34], w[35], selector);
      w[45] = hc_byte_perm (w[33], w[34], selector);
      w[44] = hc_byte_perm (w[32], w[33], selector);
      w[43] = hc_byte_perm (w[31], w[32], selector);
      w[42] = hc_byte_perm (w[30], w[31], selector);
      w[41] = hc_byte_perm (w[29], w[30], selector);
      w[40] = hc_byte_perm (w[28], w[29], selector);
      w[39] = hc_byte_perm (w[27], w[28], selector);
      w[38] = hc_byte_perm (w[26], w[27], selector);
      w[37] = hc_byte_perm (w[25], w[26], selector);
      w[36] = hc_byte_perm (w[24], w[25], selector);
      w[35] = hc_byte_perm (w[23], w[24], selector);
      w[34] = hc_byte_perm (w[22], w[23], selector);
      w[33] = hc_byte_perm (w[21], w[22], selector);
      w[32] = hc_byte_perm (w[20], w[21], selector);
      w[31] = hc_byte_perm (w[19], w[20], selector);
      w[30] = hc_byte_perm (w[18], w[19], selector);
      w[29] = hc_byte_perm (w[17], w[18], selector);
      w[28] = hc_byte_perm (w[16], w[17], selector);
      w[27] = hc_byte_perm (w[15], w[16], selector);
      w[26] = hc_byte_perm (w[14], w[15], selector);
      w[25] = hc_byte_perm (w[13], w[14], selector);
      w[24] = hc_byte_perm (w[12], w[13], selector);
      w[23] = hc_byte_perm (w[11], w[12], selector);
      w[22] = hc_byte_perm (w[10], w[11], selector);
      w[21] = hc_byte_perm (w[ 9], w[10], selector);
      w[20] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[19] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[18] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[17] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[16] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[15] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[14] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[13] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[12] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[11] = hc_byte_perm (    0, w[ 0], selector);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_byte_perm (w[50], w[51], selector);
      w[62] = hc_byte_perm (w[49], w[50], selector);
      w[61] = hc_byte_perm (w[48], w[49], selector);
      w[60] = hc_byte_perm (w[47], w[48], selector);
      w[59] = hc_byte_perm (w[46], w[47], selector);
      w[58] = hc_byte_perm (w[45], w[46], selector);
      w[57] = hc_byte_perm (w[44], w[45], selector);
      w[56] = hc_byte_perm (w[43], w[44], selector);
      w[55] = hc_byte_perm (w[42], w[43], selector);
      w[54] = hc_byte_perm (w[41], w[42], selector);
      w[53] = hc_byte_perm (w[40], w[41], selector);
      w[52] = hc_byte_perm (w[39], w[40], selector);
      w[51] = hc_byte_perm (w[38], w[39], selector);
      w[50] = hc_byte_perm (w[37], w[38], selector);
      w[49] = hc_byte_perm (w[36], w[37], selector);
      w[48] = hc_byte_perm (w[35], w[36], selector);
      w[47] = hc_byte_perm (w[34], w[35], selector);
      w[46] = hc_byte_perm (w[33], w[34], selector);
      w[45] = hc_byte_perm (w[32], w[33], selector);
      w[44] = hc_byte_perm (w[31], w[32], selector);
      w[43] = hc_byte_perm (w[30], w[31], selector);
      w[42] = hc_byte_perm (w[29], w[30], selector);
      w[41] = hc_byte_perm (w[28], w[29], selector);
      w[40] = hc_byte_perm (w[27], w[28], selector);
      w[39] = hc_byte_perm (w[26], w[27], selector);
      w[38] = hc_byte_perm (w[25], w[26], selector);
      w[37] = hc_byte_perm (w[24], w[25], selector);
      w[36] = hc_byte_perm (w[23], w[24], selector);
      w[35] = hc_byte_perm (w[22], w[23], selector);
      w[34] = hc_byte_perm (w[21], w[22], selector);
      w[33] = hc_byte_perm (w[20], w[21], selector);
      w[32] = hc_byte_perm (w[19], w[20], selector);
      w[31] = hc_byte_perm (w[18], w[19], selector);
      w[30] = hc_byte_perm (w[17], w[18], selector);
      w[29] = hc_byte_perm (w[16], w[17], selector);
      w[28] = hc_byte_perm (w[15], w[16], selector);
      w[27] = hc_byte_perm (w[14], w[15], selector);
      w[26] = hc_byte_perm (w[13], w[14], selector);
      w[25] = hc_byte_perm (w[12], w[13], selector);
      w[24] = hc_byte_perm (w[11], w[12], selector);
      w[23] = hc_byte_perm (w[10], w[11], selector);
      w[22] = hc_byte_perm (w[ 9], w[10], selector);
      w[21] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[20] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[19] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[18] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[17] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[16] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[15] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[14] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[13] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[12] = hc_byte_perm (    0, w[ 0], selector);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_byte_perm (w[49], w[50], selector);
      w[62] = hc_byte_perm (w[48], w[49], selector);
      w[61] = hc_byte_perm (w[47], w[48], selector);
      w[60] = hc_byte_perm (w[46], w[47], selector);
      w[59] = hc_byte_perm (w[45], w[46], selector);
      w[58] = hc_byte_perm (w[44], w[45], selector);
      w[57] = hc_byte_perm (w[43], w[44], selector);
      w[56] = hc_byte_perm (w[42], w[43], selector);
      w[55] = hc_byte_perm (w[41], w[42], selector);
      w[54] = hc_byte_perm (w[40], w[41], selector);
      w[53] = hc_byte_perm (w[39], w[40], selector);
      w[52] = hc_byte_perm (w[38], w[39], selector);
      w[51] = hc_byte_perm (w[37], w[38], selector);
      w[50] = hc_byte_perm (w[36], w[37], selector);
      w[49] = hc_byte_perm (w[35], w[36], selector);
      w[48] = hc_byte_perm (w[34], w[35], selector);
      w[47] = hc_byte_perm (w[33], w[34], selector);
      w[46] = hc_byte_perm (w[32], w[33], selector);
      w[45] = hc_byte_perm (w[31], w[32], selector);
      w[44] = hc_byte_perm (w[30], w[31], selector);
      w[43] = hc_byte_perm (w[29], w[30], selector);
      w[42] = hc_byte_perm (w[28], w[29], selector);
      w[41] = hc_byte_perm (w[27], w[28], selector);
      w[40] = hc_byte_perm (w[26], w[27], selector);
      w[39] = hc_byte_perm (w[25], w[26], selector);
      w[38] = hc_byte_perm (w[24], w[25], selector);
      w[37] = hc_byte_perm (w[23], w[24], selector);
      w[36] = hc_byte_perm (w[22], w[23], selector);
      w[35] = hc_byte_perm (w[21], w[22], selector);
      w[34] = hc_byte_perm (w[20], w[21], selector);
      w[33] = hc_byte_perm (w[19], w[20], selector);
      w[32] = hc_byte_perm (w[18], w[19], selector);
      w[31] = hc_byte_perm (w[17], w[18], selector);
      w[30] = hc_byte_perm (w[16], w[17], selector);
      w[29] = hc_byte_perm (w[15], w[16], selector);
      w[28] = hc_byte_perm (w[14], w[15], selector);
      w[27] = hc_byte_perm (w[13], w[14], selector);
      w[26] = hc_byte_perm (w[12], w[13], selector);
      w[25] = hc_byte_perm (w[11], w[12], selector);
      w[24] = hc_byte_perm (w[10], w[11], selector);
      w[23] = hc_byte_perm (w[ 9], w[10], selector);
      w[22] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[21] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[20] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[19] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[18] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[17] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[16] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[15] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[14] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[13] = hc_byte_perm (    0, w[ 0], selector);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_byte_perm (w[48], w[49], selector);
      w[62] = hc_byte_perm (w[47], w[48], selector);
      w[61] = hc_byte_perm (w[46], w[47], selector);
      w[60] = hc_byte_perm (w[45], w[46], selector);
      w[59] = hc_byte_perm (w[44], w[45], selector);
      w[58] = hc_byte_perm (w[43], w[44], selector);
      w[57] = hc_byte_perm (w[42], w[43], selector);
      w[56] = hc_byte_perm (w[41], w[42], selector);
      w[55] = hc_byte_perm (w[40], w[41], selector);
      w[54] = hc_byte_perm (w[39], w[40], selector);
      w[53] = hc_byte_perm (w[38], w[39], selector);
      w[52] = hc_byte_perm (w[37], w[38], selector);
      w[51] = hc_byte_perm (w[36], w[37], selector);
      w[50] = hc_byte_perm (w[35], w[36], selector);
      w[49] = hc_byte_perm (w[34], w[35], selector);
      w[48] = hc_byte_perm (w[33], w[34], selector);
      w[47] = hc_byte_perm (w[32], w[33], selector);
      w[46] = hc_byte_perm (w[31], w[32], selector);
      w[45] = hc_byte_perm (w[30], w[31], selector);
      w[44] = hc_byte_perm (w[29], w[30], selector);
      w[43] = hc_byte_perm (w[28], w[29], selector);
      w[42] = hc_byte_perm (w[27], w[28], selector);
      w[41] = hc_byte_perm (w[26], w[27], selector);
      w[40] = hc_byte_perm (w[25], w[26], selector);
      w[39] = hc_byte_perm (w[24], w[25], selector);
      w[38] = hc_byte_perm (w[23], w[24], selector);
      w[37] = hc_byte_perm (w[22], w[23], selector);
      w[36] = hc_byte_perm (w[21], w[22], selector);
      w[35] = hc_byte_perm (w[20], w[21], selector);
      w[34] = hc_byte_perm (w[19], w[20], selector);
      w[33] = hc_byte_perm (w[18], w[19], selector);
      w[32] = hc_byte_perm (w[17], w[18], selector);
      w[31] = hc_byte_perm (w[16], w[17], selector);
      w[30] = hc_byte_perm (w[15], w[16], selector);
      w[29] = hc_byte_perm (w[14], w[15], selector);
      w[28] = hc_byte_perm (w[13], w[14], selector);
      w[27] = hc_byte_perm (w[12], w[13], selector);
      w[26] = hc_byte_perm (w[11], w[12], selector);
      w[25] = hc_byte_perm (w[10], w[11], selector);
      w[24] = hc_byte_perm (w[ 9], w[10], selector);
      w[23] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[22] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[21] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[20] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[19] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[18] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[17] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[16] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[15] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[14] = hc_byte_perm (    0, w[ 0], selector);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_byte_perm (w[47], w[48], selector);
      w[62] = hc_byte_perm (w[46], w[47], selector);
      w[61] = hc_byte_perm (w[45], w[46], selector);
      w[60] = hc_byte_perm (w[44], w[45], selector);
      w[59] = hc_byte_perm (w[43], w[44], selector);
      w[58] = hc_byte_perm (w[42], w[43], selector);
      w[57] = hc_byte_perm (w[41], w[42], selector);
      w[56] = hc_byte_perm (w[40], w[41], selector);
      w[55] = hc_byte_perm (w[39], w[40], selector);
      w[54] = hc_byte_perm (w[38], w[39], selector);
      w[53] = hc_byte_perm (w[37], w[38], selector);
      w[52] = hc_byte_perm (w[36], w[37], selector);
      w[51] = hc_byte_perm (w[35], w[36], selector);
      w[50] = hc_byte_perm (w[34], w[35], selector);
      w[49] = hc_byte_perm (w[33], w[34], selector);
      w[48] = hc_byte_perm (w[32], w[33], selector);
      w[47] = hc_byte_perm (w[31], w[32], selector);
      w[46] = hc_byte_perm (w[30], w[31], selector);
      w[45] = hc_byte_perm (w[29], w[30], selector);
      w[44] = hc_byte_perm (w[28], w[29], selector);
      w[43] = hc_byte_perm (w[27], w[28], selector);
      w[42] = hc_byte_perm (w[26], w[27], selector);
      w[41] = hc_byte_perm (w[25], w[26], selector);
      w[40] = hc_byte_perm (w[24], w[25], selector);
      w[39] = hc_byte_perm (w[23], w[24], selector);
      w[38] = hc_byte_perm (w[22], w[23], selector);
      w[37] = hc_byte_perm (w[21], w[22], selector);
      w[36] = hc_byte_perm (w[20], w[21], selector);
      w[35] = hc_byte_perm (w[19], w[20], selector);
      w[34] = hc_byte_perm (w[18], w[19], selector);
      w[33] = hc_byte_perm (w[17], w[18], selector);
      w[32] = hc_byte_perm (w[16], w[17], selector);
      w[31] = hc_byte_perm (w[15], w[16], selector);
      w[30] = hc_byte_perm (w[14], w[15], selector);
      w[29] = hc_byte_perm (w[13], w[14], selector);
      w[28] = hc_byte_perm (w[12], w[13], selector);
      w[27] = hc_byte_perm (w[11], w[12], selector);
      w[26] = hc_byte_perm (w[10], w[11], selector);
      w[25] = hc_byte_perm (w[ 9], w[10], selector);
      w[24] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[23] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[22] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[21] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[20] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[19] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[18] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[17] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[16] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[15] = hc_byte_perm (    0, w[ 0], selector);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_byte_perm (w[46], w[47], selector);
      w[62] = hc_byte_perm (w[45], w[46], selector);
      w[61] = hc_byte_perm (w[44], w[45], selector);
      w[60] = hc_byte_perm (w[43], w[44], selector);
      w[59] = hc_byte_perm (w[42], w[43], selector);
      w[58] = hc_byte_perm (w[41], w[42], selector);
      w[57] = hc_byte_perm (w[40], w[41], selector);
      w[56] = hc_byte_perm (w[39], w[40], selector);
      w[55] = hc_byte_perm (w[38], w[39], selector);
      w[54] = hc_byte_perm (w[37], w[38], selector);
      w[53] = hc_byte_perm (w[36], w[37], selector);
      w[52] = hc_byte_perm (w[35], w[36], selector);
      w[51] = hc_byte_perm (w[34], w[35], selector);
      w[50] = hc_byte_perm (w[33], w[34], selector);
      w[49] = hc_byte_perm (w[32], w[33], selector);
      w[48] = hc_byte_perm (w[31], w[32], selector);
      w[47] = hc_byte_perm (w[30], w[31], selector);
      w[46] = hc_byte_perm (w[29], w[30], selector);
      w[45] = hc_byte_perm (w[28], w[29], selector);
      w[44] = hc_byte_perm (w[27], w[28], selector);
      w[43] = hc_byte_perm (w[26], w[27], selector);
      w[42] = hc_byte_perm (w[25], w[26], selector);
      w[41] = hc_byte_perm (w[24], w[25], selector);
      w[40] = hc_byte_perm (w[23], w[24], selector);
      w[39] = hc_byte_perm (w[22], w[23], selector);
      w[38] = hc_byte_perm (w[21], w[22], selector);
      w[37] = hc_byte_perm (w[20], w[21], selector);
      w[36] = hc_byte_perm (w[19], w[20], selector);
      w[35] = hc_byte_perm (w[18], w[19], selector);
      w[34] = hc_byte_perm (w[17], w[18], selector);
      w[33] = hc_byte_perm (w[16], w[17], selector);
      w[32] = hc_byte_perm (w[15], w[16], selector);
      w[31] = hc_byte_perm (w[14], w[15], selector);
      w[30] = hc_byte_perm (w[13], w[14], selector);
      w[29] = hc_byte_perm (w[12], w[13], selector);
      w[28] = hc_byte_perm (w[11], w[12], selector);
      w[27] = hc_byte_perm (w[10], w[11], selector);
      w[26] = hc_byte_perm (w[ 9], w[10], selector);
      w[25] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[24] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[23] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[22] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[21] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[20] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[19] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[18] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[17] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[16] = hc_byte_perm (    0, w[ 0], selector);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_byte_perm (w[45], w[46], selector);
      w[62] = hc_byte_perm (w[44], w[45], selector);
      w[61] = hc_byte_perm (w[43], w[44], selector);
      w[60] = hc_byte_perm (w[42], w[43], selector);
      w[59] = hc_byte_perm (w[41], w[42], selector);
      w[58] = hc_byte_perm (w[40], w[41], selector);
      w[57] = hc_byte_perm (w[39], w[40], selector);
      w[56] = hc_byte_perm (w[38], w[39], selector);
      w[55] = hc_byte_perm (w[37], w[38], selector);
      w[54] = hc_byte_perm (w[36], w[37], selector);
      w[53] = hc_byte_perm (w[35], w[36], selector);
      w[52] = hc_byte_perm (w[34], w[35], selector);
      w[51] = hc_byte_perm (w[33], w[34], selector);
      w[50] = hc_byte_perm (w[32], w[33], selector);
      w[49] = hc_byte_perm (w[31], w[32], selector);
      w[48] = hc_byte_perm (w[30], w[31], selector);
      w[47] = hc_byte_perm (w[29], w[30], selector);
      w[46] = hc_byte_perm (w[28], w[29], selector);
      w[45] = hc_byte_perm (w[27], w[28], selector);
      w[44] = hc_byte_perm (w[26], w[27], selector);
      w[43] = hc_byte_perm (w[25], w[26], selector);
      w[42] = hc_byte_perm (w[24], w[25], selector);
      w[41] = hc_byte_perm (w[23], w[24], selector);
      w[40] = hc_byte_perm (w[22], w[23], selector);
      w[39] = hc_byte_perm (w[21], w[22], selector);
      w[38] = hc_byte_perm (w[20], w[21], selector);
      w[37] = hc_byte_perm (w[19], w[20], selector);
      w[36] = hc_byte_perm (w[18], w[19], selector);
      w[35] = hc_byte_perm (w[17], w[18], selector);
      w[34] = hc_byte_perm (w[16], w[17], selector);
      w[33] = hc_byte_perm (w[15], w[16], selector);
      w[32] = hc_byte_perm (w[14], w[15], selector);
      w[31] = hc_byte_perm (w[13], w[14], selector);
      w[30] = hc_byte_perm (w[12], w[13], selector);
      w[29] = hc_byte_perm (w[11], w[12], selector);
      w[28] = hc_byte_perm (w[10], w[11], selector);
      w[27] = hc_byte_perm (w[ 9], w[10], selector);
      w[26] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[25] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[24] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[23] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[22] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[21] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[20] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[19] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[18] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[17] = hc_byte_perm (    0, w[ 0], selector);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_byte_perm (w[44], w[45], selector);
      w[62] = hc_byte_perm (w[43], w[44], selector);
      w[61] = hc_byte_perm (w[42], w[43], selector);
      w[60] = hc_byte_perm (w[41], w[42], selector);
      w[59] = hc_byte_perm (w[40], w[41], selector);
      w[58] = hc_byte_perm (w[39], w[40], selector);
      w[57] = hc_byte_perm (w[38], w[39], selector);
      w[56] = hc_byte_perm (w[37], w[38], selector);
      w[55] = hc_byte_perm (w[36], w[37], selector);
      w[54] = hc_byte_perm (w[35], w[36], selector);
      w[53] = hc_byte_perm (w[34], w[35], selector);
      w[52] = hc_byte_perm (w[33], w[34], selector);
      w[51] = hc_byte_perm (w[32], w[33], selector);
      w[50] = hc_byte_perm (w[31], w[32], selector);
      w[49] = hc_byte_perm (w[30], w[31], selector);
      w[48] = hc_byte_perm (w[29], w[30], selector);
      w[47] = hc_byte_perm (w[28], w[29], selector);
      w[46] = hc_byte_perm (w[27], w[28], selector);
      w[45] = hc_byte_perm (w[26], w[27], selector);
      w[44] = hc_byte_perm (w[25], w[26], selector);
      w[43] = hc_byte_perm (w[24], w[25], selector);
      w[42] = hc_byte_perm (w[23], w[24], selector);
      w[41] = hc_byte_perm (w[22], w[23], selector);
      w[40] = hc_byte_perm (w[21], w[22], selector);
      w[39] = hc_byte_perm (w[20], w[21], selector);
      w[38] = hc_byte_perm (w[19], w[20], selector);
      w[37] = hc_byte_perm (w[18], w[19], selector);
      w[36] = hc_byte_perm (w[17], w[18], selector);
      w[35] = hc_byte_perm (w[16], w[17], selector);
      w[34] = hc_byte_perm (w[15], w[16], selector);
      w[33] = hc_byte_perm (w[14], w[15], selector);
      w[32] = hc_byte_perm (w[13], w[14], selector);
      w[31] = hc_byte_perm (w[12], w[13], selector);
      w[30] = hc_byte_perm (w[11], w[12], selector);
      w[29] = hc_byte_perm (w[10], w[11], selector);
      w[28] = hc_byte_perm (w[ 9], w[10], selector);
      w[27] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[26] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[25] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[24] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[23] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[22] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[21] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[20] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[19] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[18] = hc_byte_perm (    0, w[ 0], selector);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_byte_perm (w[43], w[44], selector);
      w[62] = hc_byte_perm (w[42], w[43], selector);
      w[61] = hc_byte_perm (w[41], w[42], selector);
      w[60] = hc_byte_perm (w[40], w[41], selector);
      w[59] = hc_byte_perm (w[39], w[40], selector);
      w[58] = hc_byte_perm (w[38], w[39], selector);
      w[57] = hc_byte_perm (w[37], w[38], selector);
      w[56] = hc_byte_perm (w[36], w[37], selector);
      w[55] = hc_byte_perm (w[35], w[36], selector);
      w[54] = hc_byte_perm (w[34], w[35], selector);
      w[53] = hc_byte_perm (w[33], w[34], selector);
      w[52] = hc_byte_perm (w[32], w[33], selector);
      w[51] = hc_byte_perm (w[31], w[32], selector);
      w[50] = hc_byte_perm (w[30], w[31], selector);
      w[49] = hc_byte_perm (w[29], w[30], selector);
      w[48] = hc_byte_perm (w[28], w[29], selector);
      w[47] = hc_byte_perm (w[27], w[28], selector);
      w[46] = hc_byte_perm (w[26], w[27], selector);
      w[45] = hc_byte_perm (w[25], w[26], selector);
      w[44] = hc_byte_perm (w[24], w[25], selector);
      w[43] = hc_byte_perm (w[23], w[24], selector);
      w[42] = hc_byte_perm (w[22], w[23], selector);
      w[41] = hc_byte_perm (w[21], w[22], selector);
      w[40] = hc_byte_perm (w[20], w[21], selector);
      w[39] = hc_byte_perm (w[19], w[20], selector);
      w[38] = hc_byte_perm (w[18], w[19], selector);
      w[37] = hc_byte_perm (w[17], w[18], selector);
      w[36] = hc_byte_perm (w[16], w[17], selector);
      w[35] = hc_byte_perm (w[15], w[16], selector);
      w[34] = hc_byte_perm (w[14], w[15], selector);
      w[33] = hc_byte_perm (w[13], w[14], selector);
      w[32] = hc_byte_perm (w[12], w[13], selector);
      w[31] = hc_byte_perm (w[11], w[12], selector);
      w[30] = hc_byte_perm (w[10], w[11], selector);
      w[29] = hc_byte_perm (w[ 9], w[10], selector);
      w[28] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[27] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[26] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[25] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[24] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[23] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[22] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[21] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[20] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[19] = hc_byte_perm (    0, w[ 0], selector);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_byte_perm (w[42], w[43], selector);
      w[62] = hc_byte_perm (w[41], w[42], selector);
      w[61] = hc_byte_perm (w[40], w[41], selector);
      w[60] = hc_byte_perm (w[39], w[40], selector);
      w[59] = hc_byte_perm (w[38], w[39], selector);
      w[58] = hc_byte_perm (w[37], w[38], selector);
      w[57] = hc_byte_perm (w[36], w[37], selector);
      w[56] = hc_byte_perm (w[35], w[36], selector);
      w[55] = hc_byte_perm (w[34], w[35], selector);
      w[54] = hc_byte_perm (w[33], w[34], selector);
      w[53] = hc_byte_perm (w[32], w[33], selector);
      w[52] = hc_byte_perm (w[31], w[32], selector);
      w[51] = hc_byte_perm (w[30], w[31], selector);
      w[50] = hc_byte_perm (w[29], w[30], selector);
      w[49] = hc_byte_perm (w[28], w[29], selector);
      w[48] = hc_byte_perm (w[27], w[28], selector);
      w[47] = hc_byte_perm (w[26], w[27], selector);
      w[46] = hc_byte_perm (w[25], w[26], selector);
      w[45] = hc_byte_perm (w[24], w[25], selector);
      w[44] = hc_byte_perm (w[23], w[24], selector);
      w[43] = hc_byte_perm (w[22], w[23], selector);
      w[42] = hc_byte_perm (w[21], w[22], selector);
      w[41] = hc_byte_perm (w[20], w[21], selector);
      w[40] = hc_byte_perm (w[19], w[20], selector);
      w[39] = hc_byte_perm (w[18], w[19], selector);
      w[38] = hc_byte_perm (w[17], w[18], selector);
      w[37] = hc_byte_perm (w[16], w[17], selector);
      w[36] = hc_byte_perm (w[15], w[16], selector);
      w[35] = hc_byte_perm (w[14], w[15], selector);
      w[34] = hc_byte_perm (w[13], w[14], selector);
      w[33] = hc_byte_perm (w[12], w[13], selector);
      w[32] = hc_byte_perm (w[11], w[12], selector);
      w[31] = hc_byte_perm (w[10], w[11], selector);
      w[30] = hc_byte_perm (w[ 9], w[10], selector);
      w[29] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[28] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[27] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[26] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[25] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[24] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[23] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[22] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[21] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[20] = hc_byte_perm (    0, w[ 0], selector);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_byte_perm (w[41], w[42], selector);
      w[62] = hc_byte_perm (w[40], w[41], selector);
      w[61] = hc_byte_perm (w[39], w[40], selector);
      w[60] = hc_byte_perm (w[38], w[39], selector);
      w[59] = hc_byte_perm (w[37], w[38], selector);
      w[58] = hc_byte_perm (w[36], w[37], selector);
      w[57] = hc_byte_perm (w[35], w[36], selector);
      w[56] = hc_byte_perm (w[34], w[35], selector);
      w[55] = hc_byte_perm (w[33], w[34], selector);
      w[54] = hc_byte_perm (w[32], w[33], selector);
      w[53] = hc_byte_perm (w[31], w[32], selector);
      w[52] = hc_byte_perm (w[30], w[31], selector);
      w[51] = hc_byte_perm (w[29], w[30], selector);
      w[50] = hc_byte_perm (w[28], w[29], selector);
      w[49] = hc_byte_perm (w[27], w[28], selector);
      w[48] = hc_byte_perm (w[26], w[27], selector);
      w[47] = hc_byte_perm (w[25], w[26], selector);
      w[46] = hc_byte_perm (w[24], w[25], selector);
      w[45] = hc_byte_perm (w[23], w[24], selector);
      w[44] = hc_byte_perm (w[22], w[23], selector);
      w[43] = hc_byte_perm (w[21], w[22], selector);
      w[42] = hc_byte_perm (w[20], w[21], selector);
      w[41] = hc_byte_perm (w[19], w[20], selector);
      w[40] = hc_byte_perm (w[18], w[19], selector);
      w[39] = hc_byte_perm (w[17], w[18], selector);
      w[38] = hc_byte_perm (w[16], w[17], selector);
      w[37] = hc_byte_perm (w[15], w[16], selector);
      w[36] = hc_byte_perm (w[14], w[15], selector);
      w[35] = hc_byte_perm (w[13], w[14], selector);
      w[34] = hc_byte_perm (w[12], w[13], selector);
      w[33] = hc_byte_perm (w[11], w[12], selector);
      w[32] = hc_byte_perm (w[10], w[11], selector);
      w[31] = hc_byte_perm (w[ 9], w[10], selector);
      w[30] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[29] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[28] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[27] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[26] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[25] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[24] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[23] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[22] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[21] = hc_byte_perm (    0, w[ 0], selector);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_byte_perm (w[40], w[41], selector);
      w[62] = hc_byte_perm (w[39], w[40], selector);
      w[61] = hc_byte_perm (w[38], w[39], selector);
      w[60] = hc_byte_perm (w[37], w[38], selector);
      w[59] = hc_byte_perm (w[36], w[37], selector);
      w[58] = hc_byte_perm (w[35], w[36], selector);
      w[57] = hc_byte_perm (w[34], w[35], selector);
      w[56] = hc_byte_perm (w[33], w[34], selector);
      w[55] = hc_byte_perm (w[32], w[33], selector);
      w[54] = hc_byte_perm (w[31], w[32], selector);
      w[53] = hc_byte_perm (w[30], w[31], selector);
      w[52] = hc_byte_perm (w[29], w[30], selector);
      w[51] = hc_byte_perm (w[28], w[29], selector);
      w[50] = hc_byte_perm (w[27], w[28], selector);
      w[49] = hc_byte_perm (w[26], w[27], selector);
      w[48] = hc_byte_perm (w[25], w[26], selector);
      w[47] = hc_byte_perm (w[24], w[25], selector);
      w[46] = hc_byte_perm (w[23], w[24], selector);
      w[45] = hc_byte_perm (w[22], w[23], selector);
      w[44] = hc_byte_perm (w[21], w[22], selector);
      w[43] = hc_byte_perm (w[20], w[21], selector);
      w[42] = hc_byte_perm (w[19], w[20], selector);
      w[41] = hc_byte_perm (w[18], w[19], selector);
      w[40] = hc_byte_perm (w[17], w[18], selector);
      w[39] = hc_byte_perm (w[16], w[17], selector);
      w[38] = hc_byte_perm (w[15], w[16], selector);
      w[37] = hc_byte_perm (w[14], w[15], selector);
      w[36] = hc_byte_perm (w[13], w[14], selector);
      w[35] = hc_byte_perm (w[12], w[13], selector);
      w[34] = hc_byte_perm (w[11], w[12], selector);
      w[33] = hc_byte_perm (w[10], w[11], selector);
      w[32] = hc_byte_perm (w[ 9], w[10], selector);
      w[31] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[30] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[29] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[28] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[27] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[26] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[25] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[24] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[23] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[22] = hc_byte_perm (    0, w[ 0], selector);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_byte_perm (w[39], w[40], selector);
      w[62] = hc_byte_perm (w[38], w[39], selector);
      w[61] = hc_byte_perm (w[37], w[38], selector);
      w[60] = hc_byte_perm (w[36], w[37], selector);
      w[59] = hc_byte_perm (w[35], w[36], selector);
      w[58] = hc_byte_perm (w[34], w[35], selector);
      w[57] = hc_byte_perm (w[33], w[34], selector);
      w[56] = hc_byte_perm (w[32], w[33], selector);
      w[55] = hc_byte_perm (w[31], w[32], selector);
      w[54] = hc_byte_perm (w[30], w[31], selector);
      w[53] = hc_byte_perm (w[29], w[30], selector);
      w[52] = hc_byte_perm (w[28], w[29], selector);
      w[51] = hc_byte_perm (w[27], w[28], selector);
      w[50] = hc_byte_perm (w[26], w[27], selector);
      w[49] = hc_byte_perm (w[25], w[26], selector);
      w[48] = hc_byte_perm (w[24], w[25], selector);
      w[47] = hc_byte_perm (w[23], w[24], selector);
      w[46] = hc_byte_perm (w[22], w[23], selector);
      w[45] = hc_byte_perm (w[21], w[22], selector);
      w[44] = hc_byte_perm (w[20], w[21], selector);
      w[43] = hc_byte_perm (w[19], w[20], selector);
      w[42] = hc_byte_perm (w[18], w[19], selector);
      w[41] = hc_byte_perm (w[17], w[18], selector);
      w[40] = hc_byte_perm (w[16], w[17], selector);
      w[39] = hc_byte_perm (w[15], w[16], selector);
      w[38] = hc_byte_perm (w[14], w[15], selector);
      w[37] = hc_byte_perm (w[13], w[14], selector);
      w[36] = hc_byte_perm (w[12], w[13], selector);
      w[35] = hc_byte_perm (w[11], w[12], selector);
      w[34] = hc_byte_perm (w[10], w[11], selector);
      w[33] = hc_byte_perm (w[ 9], w[10], selector);
      w[32] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[31] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[30] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[29] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[28] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[27] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[26] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[25] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[24] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[23] = hc_byte_perm (    0, w[ 0], selector);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_byte_perm (w[38], w[39], selector);
      w[62] = hc_byte_perm (w[37], w[38], selector);
      w[61] = hc_byte_perm (w[36], w[37], selector);
      w[60] = hc_byte_perm (w[35], w[36], selector);
      w[59] = hc_byte_perm (w[34], w[35], selector);
      w[58] = hc_byte_perm (w[33], w[34], selector);
      w[57] = hc_byte_perm (w[32], w[33], selector);
      w[56] = hc_byte_perm (w[31], w[32], selector);
      w[55] = hc_byte_perm (w[30], w[31], selector);
      w[54] = hc_byte_perm (w[29], w[30], selector);
      w[53] = hc_byte_perm (w[28], w[29], selector);
      w[52] = hc_byte_perm (w[27], w[28], selector);
      w[51] = hc_byte_perm (w[26], w[27], selector);
      w[50] = hc_byte_perm (w[25], w[26], selector);
      w[49] = hc_byte_perm (w[24], w[25], selector);
      w[48] = hc_byte_perm (w[23], w[24], selector);
      w[47] = hc_byte_perm (w[22], w[23], selector);
      w[46] = hc_byte_perm (w[21], w[22], selector);
      w[45] = hc_byte_perm (w[20], w[21], selector);
      w[44] = hc_byte_perm (w[19], w[20], selector);
      w[43] = hc_byte_perm (w[18], w[19], selector);
      w[42] = hc_byte_perm (w[17], w[18], selector);
      w[41] = hc_byte_perm (w[16], w[17], selector);
      w[40] = hc_byte_perm (w[15], w[16], selector);
      w[39] = hc_byte_perm (w[14], w[15], selector);
      w[38] = hc_byte_perm (w[13], w[14], selector);
      w[37] = hc_byte_perm (w[12], w[13], selector);
      w[36] = hc_byte_perm (w[11], w[12], selector);
      w[35] = hc_byte_perm (w[10], w[11], selector);
      w[34] = hc_byte_perm (w[ 9], w[10], selector);
      w[33] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[32] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[31] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[30] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[29] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[28] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[27] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[26] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[25] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[24] = hc_byte_perm (    0, w[ 0], selector);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_byte_perm (w[37], w[38], selector);
      w[62] = hc_byte_perm (w[36], w[37], selector);
      w[61] = hc_byte_perm (w[35], w[36], selector);
      w[60] = hc_byte_perm (w[34], w[35], selector);
      w[59] = hc_byte_perm (w[33], w[34], selector);
      w[58] = hc_byte_perm (w[32], w[33], selector);
      w[57] = hc_byte_perm (w[31], w[32], selector);
      w[56] = hc_byte_perm (w[30], w[31], selector);
      w[55] = hc_byte_perm (w[29], w[30], selector);
      w[54] = hc_byte_perm (w[28], w[29], selector);
      w[53] = hc_byte_perm (w[27], w[28], selector);
      w[52] = hc_byte_perm (w[26], w[27], selector);
      w[51] = hc_byte_perm (w[25], w[26], selector);
      w[50] = hc_byte_perm (w[24], w[25], selector);
      w[49] = hc_byte_perm (w[23], w[24], selector);
      w[48] = hc_byte_perm (w[22], w[23], selector);
      w[47] = hc_byte_perm (w[21], w[22], selector);
      w[46] = hc_byte_perm (w[20], w[21], selector);
      w[45] = hc_byte_perm (w[19], w[20], selector);
      w[44] = hc_byte_perm (w[18], w[19], selector);
      w[43] = hc_byte_perm (w[17], w[18], selector);
      w[42] = hc_byte_perm (w[16], w[17], selector);
      w[41] = hc_byte_perm (w[15], w[16], selector);
      w[40] = hc_byte_perm (w[14], w[15], selector);
      w[39] = hc_byte_perm (w[13], w[14], selector);
      w[38] = hc_byte_perm (w[12], w[13], selector);
      w[37] = hc_byte_perm (w[11], w[12], selector);
      w[36] = hc_byte_perm (w[10], w[11], selector);
      w[35] = hc_byte_perm (w[ 9], w[10], selector);
      w[34] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[33] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[32] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[31] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[30] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[29] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[28] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[27] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[26] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[25] = hc_byte_perm (    0, w[ 0], selector);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_byte_perm (w[36], w[37], selector);
      w[62] = hc_byte_perm (w[35], w[36], selector);
      w[61] = hc_byte_perm (w[34], w[35], selector);
      w[60] = hc_byte_perm (w[33], w[34], selector);
      w[59] = hc_byte_perm (w[32], w[33], selector);
      w[58] = hc_byte_perm (w[31], w[32], selector);
      w[57] = hc_byte_perm (w[30], w[31], selector);
      w[56] = hc_byte_perm (w[29], w[30], selector);
      w[55] = hc_byte_perm (w[28], w[29], selector);
      w[54] = hc_byte_perm (w[27], w[28], selector);
      w[53] = hc_byte_perm (w[26], w[27], selector);
      w[52] = hc_byte_perm (w[25], w[26], selector);
      w[51] = hc_byte_perm (w[24], w[25], selector);
      w[50] = hc_byte_perm (w[23], w[24], selector);
      w[49] = hc_byte_perm (w[22], w[23], selector);
      w[48] = hc_byte_perm (w[21], w[22], selector);
      w[47] = hc_byte_perm (w[20], w[21], selector);
      w[46] = hc_byte_perm (w[19], w[20], selector);
      w[45] = hc_byte_perm (w[18], w[19], selector);
      w[44] = hc_byte_perm (w[17], w[18], selector);
      w[43] = hc_byte_perm (w[16], w[17], selector);
      w[42] = hc_byte_perm (w[15], w[16], selector);
      w[41] = hc_byte_perm (w[14], w[15], selector);
      w[40] = hc_byte_perm (w[13], w[14], selector);
      w[39] = hc_byte_perm (w[12], w[13], selector);
      w[38] = hc_byte_perm (w[11], w[12], selector);
      w[37] = hc_byte_perm (w[10], w[11], selector);
      w[36] = hc_byte_perm (w[ 9], w[10], selector);
      w[35] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[34] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[33] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[32] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[31] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[30] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[29] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[28] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[27] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[26] = hc_byte_perm (    0, w[ 0], selector);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_byte_perm (w[35], w[36], selector);
      w[62] = hc_byte_perm (w[34], w[35], selector);
      w[61] = hc_byte_perm (w[33], w[34], selector);
      w[60] = hc_byte_perm (w[32], w[33], selector);
      w[59] = hc_byte_perm (w[31], w[32], selector);
      w[58] = hc_byte_perm (w[30], w[31], selector);
      w[57] = hc_byte_perm (w[29], w[30], selector);
      w[56] = hc_byte_perm (w[28], w[29], selector);
      w[55] = hc_byte_perm (w[27], w[28], selector);
      w[54] = hc_byte_perm (w[26], w[27], selector);
      w[53] = hc_byte_perm (w[25], w[26], selector);
      w[52] = hc_byte_perm (w[24], w[25], selector);
      w[51] = hc_byte_perm (w[23], w[24], selector);
      w[50] = hc_byte_perm (w[22], w[23], selector);
      w[49] = hc_byte_perm (w[21], w[22], selector);
      w[48] = hc_byte_perm (w[20], w[21], selector);
      w[47] = hc_byte_perm (w[19], w[20], selector);
      w[46] = hc_byte_perm (w[18], w[19], selector);
      w[45] = hc_byte_perm (w[17], w[18], selector);
      w[44] = hc_byte_perm (w[16], w[17], selector);
      w[43] = hc_byte_perm (w[15], w[16], selector);
      w[42] = hc_byte_perm (w[14], w[15], selector);
      w[41] = hc_byte_perm (w[13], w[14], selector);
      w[40] = hc_byte_perm (w[12], w[13], selector);
      w[39] = hc_byte_perm (w[11], w[12], selector);
      w[38] = hc_byte_perm (w[10], w[11], selector);
      w[37] = hc_byte_perm (w[ 9], w[10], selector);
      w[36] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[35] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[34] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[33] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[32] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[31] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[30] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[29] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[28] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[27] = hc_byte_perm (    0, w[ 0], selector);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_byte_perm (w[34], w[35], selector);
      w[62] = hc_byte_perm (w[33], w[34], selector);
      w[61] = hc_byte_perm (w[32], w[33], selector);
      w[60] = hc_byte_perm (w[31], w[32], selector);
      w[59] = hc_byte_perm (w[30], w[31], selector);
      w[58] = hc_byte_perm (w[29], w[30], selector);
      w[57] = hc_byte_perm (w[28], w[29], selector);
      w[56] = hc_byte_perm (w[27], w[28], selector);
      w[55] = hc_byte_perm (w[26], w[27], selector);
      w[54] = hc_byte_perm (w[25], w[26], selector);
      w[53] = hc_byte_perm (w[24], w[25], selector);
      w[52] = hc_byte_perm (w[23], w[24], selector);
      w[51] = hc_byte_perm (w[22], w[23], selector);
      w[50] = hc_byte_perm (w[21], w[22], selector);
      w[49] = hc_byte_perm (w[20], w[21], selector);
      w[48] = hc_byte_perm (w[19], w[20], selector);
      w[47] = hc_byte_perm (w[18], w[19], selector);
      w[46] = hc_byte_perm (w[17], w[18], selector);
      w[45] = hc_byte_perm (w[16], w[17], selector);
      w[44] = hc_byte_perm (w[15], w[16], selector);
      w[43] = hc_byte_perm (w[14], w[15], selector);
      w[42] = hc_byte_perm (w[13], w[14], selector);
      w[41] = hc_byte_perm (w[12], w[13], selector);
      w[40] = hc_byte_perm (w[11], w[12], selector);
      w[39] = hc_byte_perm (w[10], w[11], selector);
      w[38] = hc_byte_perm (w[ 9], w[10], selector);
      w[37] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[36] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[35] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[34] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[33] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[32] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[31] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[30] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[29] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[28] = hc_byte_perm (    0, w[ 0], selector);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_byte_perm (w[33], w[34], selector);
      w[62] = hc_byte_perm (w[32], w[33], selector);
      w[61] = hc_byte_perm (w[31], w[32], selector);
      w[60] = hc_byte_perm (w[30], w[31], selector);
      w[59] = hc_byte_perm (w[29], w[30], selector);
      w[58] = hc_byte_perm (w[28], w[29], selector);
      w[57] = hc_byte_perm (w[27], w[28], selector);
      w[56] = hc_byte_perm (w[26], w[27], selector);
      w[55] = hc_byte_perm (w[25], w[26], selector);
      w[54] = hc_byte_perm (w[24], w[25], selector);
      w[53] = hc_byte_perm (w[23], w[24], selector);
      w[52] = hc_byte_perm (w[22], w[23], selector);
      w[51] = hc_byte_perm (w[21], w[22], selector);
      w[50] = hc_byte_perm (w[20], w[21], selector);
      w[49] = hc_byte_perm (w[19], w[20], selector);
      w[48] = hc_byte_perm (w[18], w[19], selector);
      w[47] = hc_byte_perm (w[17], w[18], selector);
      w[46] = hc_byte_perm (w[16], w[17], selector);
      w[45] = hc_byte_perm (w[15], w[16], selector);
      w[44] = hc_byte_perm (w[14], w[15], selector);
      w[43] = hc_byte_perm (w[13], w[14], selector);
      w[42] = hc_byte_perm (w[12], w[13], selector);
      w[41] = hc_byte_perm (w[11], w[12], selector);
      w[40] = hc_byte_perm (w[10], w[11], selector);
      w[39] = hc_byte_perm (w[ 9], w[10], selector);
      w[38] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[37] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[36] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[35] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[34] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[33] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[32] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[31] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[30] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[29] = hc_byte_perm (    0, w[ 0], selector);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_byte_perm (w[32], w[33], selector);
      w[62] = hc_byte_perm (w[31], w[32], selector);
      w[61] = hc_byte_perm (w[30], w[31], selector);
      w[60] = hc_byte_perm (w[29], w[30], selector);
      w[59] = hc_byte_perm (w[28], w[29], selector);
      w[58] = hc_byte_perm (w[27], w[28], selector);
      w[57] = hc_byte_perm (w[26], w[27], selector);
      w[56] = hc_byte_perm (w[25], w[26], selector);
      w[55] = hc_byte_perm (w[24], w[25], selector);
      w[54] = hc_byte_perm (w[23], w[24], selector);
      w[53] = hc_byte_perm (w[22], w[23], selector);
      w[52] = hc_byte_perm (w[21], w[22], selector);
      w[51] = hc_byte_perm (w[20], w[21], selector);
      w[50] = hc_byte_perm (w[19], w[20], selector);
      w[49] = hc_byte_perm (w[18], w[19], selector);
      w[48] = hc_byte_perm (w[17], w[18], selector);
      w[47] = hc_byte_perm (w[16], w[17], selector);
      w[46] = hc_byte_perm (w[15], w[16], selector);
      w[45] = hc_byte_perm (w[14], w[15], selector);
      w[44] = hc_byte_perm (w[13], w[14], selector);
      w[43] = hc_byte_perm (w[12], w[13], selector);
      w[42] = hc_byte_perm (w[11], w[12], selector);
      w[41] = hc_byte_perm (w[10], w[11], selector);
      w[40] = hc_byte_perm (w[ 9], w[10], selector);
      w[39] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[38] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[37] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[36] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[35] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[34] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[33] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[32] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[31] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[30] = hc_byte_perm (    0, w[ 0], selector);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_byte_perm (w[31], w[32], selector);
      w[62] = hc_byte_perm (w[30], w[31], selector);
      w[61] = hc_byte_perm (w[29], w[30], selector);
      w[60] = hc_byte_perm (w[28], w[29], selector);
      w[59] = hc_byte_perm (w[27], w[28], selector);
      w[58] = hc_byte_perm (w[26], w[27], selector);
      w[57] = hc_byte_perm (w[25], w[26], selector);
      w[56] = hc_byte_perm (w[24], w[25], selector);
      w[55] = hc_byte_perm (w[23], w[24], selector);
      w[54] = hc_byte_perm (w[22], w[23], selector);
      w[53] = hc_byte_perm (w[21], w[22], selector);
      w[52] = hc_byte_perm (w[20], w[21], selector);
      w[51] = hc_byte_perm (w[19], w[20], selector);
      w[50] = hc_byte_perm (w[18], w[19], selector);
      w[49] = hc_byte_perm (w[17], w[18], selector);
      w[48] = hc_byte_perm (w[16], w[17], selector);
      w[47] = hc_byte_perm (w[15], w[16], selector);
      w[46] = hc_byte_perm (w[14], w[15], selector);
      w[45] = hc_byte_perm (w[13], w[14], selector);
      w[44] = hc_byte_perm (w[12], w[13], selector);
      w[43] = hc_byte_perm (w[11], w[12], selector);
      w[42] = hc_byte_perm (w[10], w[11], selector);
      w[41] = hc_byte_perm (w[ 9], w[10], selector);
      w[40] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[39] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[38] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[37] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[36] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[35] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[34] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[33] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[32] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[31] = hc_byte_perm (    0, w[ 0], selector);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_byte_perm (w[30], w[31], selector);
      w[62] = hc_byte_perm (w[29], w[30], selector);
      w[61] = hc_byte_perm (w[28], w[29], selector);
      w[60] = hc_byte_perm (w[27], w[28], selector);
      w[59] = hc_byte_perm (w[26], w[27], selector);
      w[58] = hc_byte_perm (w[25], w[26], selector);
      w[57] = hc_byte_perm (w[24], w[25], selector);
      w[56] = hc_byte_perm (w[23], w[24], selector);
      w[55] = hc_byte_perm (w[22], w[23], selector);
      w[54] = hc_byte_perm (w[21], w[22], selector);
      w[53] = hc_byte_perm (w[20], w[21], selector);
      w[52] = hc_byte_perm (w[19], w[20], selector);
      w[51] = hc_byte_perm (w[18], w[19], selector);
      w[50] = hc_byte_perm (w[17], w[18], selector);
      w[49] = hc_byte_perm (w[16], w[17], selector);
      w[48] = hc_byte_perm (w[15], w[16], selector);
      w[47] = hc_byte_perm (w[14], w[15], selector);
      w[46] = hc_byte_perm (w[13], w[14], selector);
      w[45] = hc_byte_perm (w[12], w[13], selector);
      w[44] = hc_byte_perm (w[11], w[12], selector);
      w[43] = hc_byte_perm (w[10], w[11], selector);
      w[42] = hc_byte_perm (w[ 9], w[10], selector);
      w[41] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[40] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[39] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[38] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[37] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[36] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[35] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[34] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[33] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[32] = hc_byte_perm (    0, w[ 0], selector);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_byte_perm (w[29], w[30], selector);
      w[62] = hc_byte_perm (w[28], w[29], selector);
      w[61] = hc_byte_perm (w[27], w[28], selector);
      w[60] = hc_byte_perm (w[26], w[27], selector);
      w[59] = hc_byte_perm (w[25], w[26], selector);
      w[58] = hc_byte_perm (w[24], w[25], selector);
      w[57] = hc_byte_perm (w[23], w[24], selector);
      w[56] = hc_byte_perm (w[22], w[23], selector);
      w[55] = hc_byte_perm (w[21], w[22], selector);
      w[54] = hc_byte_perm (w[20], w[21], selector);
      w[53] = hc_byte_perm (w[19], w[20], selector);
      w[52] = hc_byte_perm (w[18], w[19], selector);
      w[51] = hc_byte_perm (w[17], w[18], selector);
      w[50] = hc_byte_perm (w[16], w[17], selector);
      w[49] = hc_byte_perm (w[15], w[16], selector);
      w[48] = hc_byte_perm (w[14], w[15], selector);
      w[47] = hc_byte_perm (w[13], w[14], selector);
      w[46] = hc_byte_perm (w[12], w[13], selector);
      w[45] = hc_byte_perm (w[11], w[12], selector);
      w[44] = hc_byte_perm (w[10], w[11], selector);
      w[43] = hc_byte_perm (w[ 9], w[10], selector);
      w[42] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[41] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[40] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[39] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[38] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[37] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[36] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[35] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[34] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[33] = hc_byte_perm (    0, w[ 0], selector);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_byte_perm (w[28], w[29], selector);
      w[62] = hc_byte_perm (w[27], w[28], selector);
      w[61] = hc_byte_perm (w[26], w[27], selector);
      w[60] = hc_byte_perm (w[25], w[26], selector);
      w[59] = hc_byte_perm (w[24], w[25], selector);
      w[58] = hc_byte_perm (w[23], w[24], selector);
      w[57] = hc_byte_perm (w[22], w[23], selector);
      w[56] = hc_byte_perm (w[21], w[22], selector);
      w[55] = hc_byte_perm (w[20], w[21], selector);
      w[54] = hc_byte_perm (w[19], w[20], selector);
      w[53] = hc_byte_perm (w[18], w[19], selector);
      w[52] = hc_byte_perm (w[17], w[18], selector);
      w[51] = hc_byte_perm (w[16], w[17], selector);
      w[50] = hc_byte_perm (w[15], w[16], selector);
      w[49] = hc_byte_perm (w[14], w[15], selector);
      w[48] = hc_byte_perm (w[13], w[14], selector);
      w[47] = hc_byte_perm (w[12], w[13], selector);
      w[46] = hc_byte_perm (w[11], w[12], selector);
      w[45] = hc_byte_perm (w[10], w[11], selector);
      w[44] = hc_byte_perm (w[ 9], w[10], selector);
      w[43] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[42] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[41] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[40] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[39] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[38] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[37] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[36] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[35] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[34] = hc_byte_perm (    0, w[ 0], selector);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_byte_perm (w[27], w[28], selector);
      w[62] = hc_byte_perm (w[26], w[27], selector);
      w[61] = hc_byte_perm (w[25], w[26], selector);
      w[60] = hc_byte_perm (w[24], w[25], selector);
      w[59] = hc_byte_perm (w[23], w[24], selector);
      w[58] = hc_byte_perm (w[22], w[23], selector);
      w[57] = hc_byte_perm (w[21], w[22], selector);
      w[56] = hc_byte_perm (w[20], w[21], selector);
      w[55] = hc_byte_perm (w[19], w[20], selector);
      w[54] = hc_byte_perm (w[18], w[19], selector);
      w[53] = hc_byte_perm (w[17], w[18], selector);
      w[52] = hc_byte_perm (w[16], w[17], selector);
      w[51] = hc_byte_perm (w[15], w[16], selector);
      w[50] = hc_byte_perm (w[14], w[15], selector);
      w[49] = hc_byte_perm (w[13], w[14], selector);
      w[48] = hc_byte_perm (w[12], w[13], selector);
      w[47] = hc_byte_perm (w[11], w[12], selector);
      w[46] = hc_byte_perm (w[10], w[11], selector);
      w[45] = hc_byte_perm (w[ 9], w[10], selector);
      w[44] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[43] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[42] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[41] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[40] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[39] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[38] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[37] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[36] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[35] = hc_byte_perm (    0, w[ 0], selector);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_byte_perm (w[26], w[27], selector);
      w[62] = hc_byte_perm (w[25], w[26], selector);
      w[61] = hc_byte_perm (w[24], w[25], selector);
      w[60] = hc_byte_perm (w[23], w[24], selector);
      w[59] = hc_byte_perm (w[22], w[23], selector);
      w[58] = hc_byte_perm (w[21], w[22], selector);
      w[57] = hc_byte_perm (w[20], w[21], selector);
      w[56] = hc_byte_perm (w[19], w[20], selector);
      w[55] = hc_byte_perm (w[18], w[19], selector);
      w[54] = hc_byte_perm (w[17], w[18], selector);
      w[53] = hc_byte_perm (w[16], w[17], selector);
      w[52] = hc_byte_perm (w[15], w[16], selector);
      w[51] = hc_byte_perm (w[14], w[15], selector);
      w[50] = hc_byte_perm (w[13], w[14], selector);
      w[49] = hc_byte_perm (w[12], w[13], selector);
      w[48] = hc_byte_perm (w[11], w[12], selector);
      w[47] = hc_byte_perm (w[10], w[11], selector);
      w[46] = hc_byte_perm (w[ 9], w[10], selector);
      w[45] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[44] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[43] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[42] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[41] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[40] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[39] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[38] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[37] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[36] = hc_byte_perm (    0, w[ 0], selector);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_byte_perm (w[25], w[26], selector);
      w[62] = hc_byte_perm (w[24], w[25], selector);
      w[61] = hc_byte_perm (w[23], w[24], selector);
      w[60] = hc_byte_perm (w[22], w[23], selector);
      w[59] = hc_byte_perm (w[21], w[22], selector);
      w[58] = hc_byte_perm (w[20], w[21], selector);
      w[57] = hc_byte_perm (w[19], w[20], selector);
      w[56] = hc_byte_perm (w[18], w[19], selector);
      w[55] = hc_byte_perm (w[17], w[18], selector);
      w[54] = hc_byte_perm (w[16], w[17], selector);
      w[53] = hc_byte_perm (w[15], w[16], selector);
      w[52] = hc_byte_perm (w[14], w[15], selector);
      w[51] = hc_byte_perm (w[13], w[14], selector);
      w[50] = hc_byte_perm (w[12], w[13], selector);
      w[49] = hc_byte_perm (w[11], w[12], selector);
      w[48] = hc_byte_perm (w[10], w[11], selector);
      w[47] = hc_byte_perm (w[ 9], w[10], selector);
      w[46] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[45] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[44] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[43] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[42] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[41] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[40] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[39] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[38] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[37] = hc_byte_perm (    0, w[ 0], selector);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_byte_perm (w[24], w[25], selector);
      w[62] = hc_byte_perm (w[23], w[24], selector);
      w[61] = hc_byte_perm (w[22], w[23], selector);
      w[60] = hc_byte_perm (w[21], w[22], selector);
      w[59] = hc_byte_perm (w[20], w[21], selector);
      w[58] = hc_byte_perm (w[19], w[20], selector);
      w[57] = hc_byte_perm (w[18], w[19], selector);
      w[56] = hc_byte_perm (w[17], w[18], selector);
      w[55] = hc_byte_perm (w[16], w[17], selector);
      w[54] = hc_byte_perm (w[15], w[16], selector);
      w[53] = hc_byte_perm (w[14], w[15], selector);
      w[52] = hc_byte_perm (w[13], w[14], selector);
      w[51] = hc_byte_perm (w[12], w[13], selector);
      w[50] = hc_byte_perm (w[11], w[12], selector);
      w[49] = hc_byte_perm (w[10], w[11], selector);
      w[48] = hc_byte_perm (w[ 9], w[10], selector);
      w[47] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[46] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[45] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[44] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[43] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[42] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[41] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[40] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[39] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[38] = hc_byte_perm (    0, w[ 0], selector);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_byte_perm (w[23], w[24], selector);
      w[62] = hc_byte_perm (w[22], w[23], selector);
      w[61] = hc_byte_perm (w[21], w[22], selector);
      w[60] = hc_byte_perm (w[20], w[21], selector);
      w[59] = hc_byte_perm (w[19], w[20], selector);
      w[58] = hc_byte_perm (w[18], w[19], selector);
      w[57] = hc_byte_perm (w[17], w[18], selector);
      w[56] = hc_byte_perm (w[16], w[17], selector);
      w[55] = hc_byte_perm (w[15], w[16], selector);
      w[54] = hc_byte_perm (w[14], w[15], selector);
      w[53] = hc_byte_perm (w[13], w[14], selector);
      w[52] = hc_byte_perm (w[12], w[13], selector);
      w[51] = hc_byte_perm (w[11], w[12], selector);
      w[50] = hc_byte_perm (w[10], w[11], selector);
      w[49] = hc_byte_perm (w[ 9], w[10], selector);
      w[48] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[47] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[46] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[45] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[44] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[43] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[42] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[41] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[40] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[39] = hc_byte_perm (    0, w[ 0], selector);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_byte_perm (w[22], w[23], selector);
      w[62] = hc_byte_perm (w[21], w[22], selector);
      w[61] = hc_byte_perm (w[20], w[21], selector);
      w[60] = hc_byte_perm (w[19], w[20], selector);
      w[59] = hc_byte_perm (w[18], w[19], selector);
      w[58] = hc_byte_perm (w[17], w[18], selector);
      w[57] = hc_byte_perm (w[16], w[17], selector);
      w[56] = hc_byte_perm (w[15], w[16], selector);
      w[55] = hc_byte_perm (w[14], w[15], selector);
      w[54] = hc_byte_perm (w[13], w[14], selector);
      w[53] = hc_byte_perm (w[12], w[13], selector);
      w[52] = hc_byte_perm (w[11], w[12], selector);
      w[51] = hc_byte_perm (w[10], w[11], selector);
      w[50] = hc_byte_perm (w[ 9], w[10], selector);
      w[49] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[48] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[47] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[46] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[45] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[44] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[43] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[42] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[41] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[40] = hc_byte_perm (    0, w[ 0], selector);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_byte_perm (w[21], w[22], selector);
      w[62] = hc_byte_perm (w[20], w[21], selector);
      w[61] = hc_byte_perm (w[19], w[20], selector);
      w[60] = hc_byte_perm (w[18], w[19], selector);
      w[59] = hc_byte_perm (w[17], w[18], selector);
      w[58] = hc_byte_perm (w[16], w[17], selector);
      w[57] = hc_byte_perm (w[15], w[16], selector);
      w[56] = hc_byte_perm (w[14], w[15], selector);
      w[55] = hc_byte_perm (w[13], w[14], selector);
      w[54] = hc_byte_perm (w[12], w[13], selector);
      w[53] = hc_byte_perm (w[11], w[12], selector);
      w[52] = hc_byte_perm (w[10], w[11], selector);
      w[51] = hc_byte_perm (w[ 9], w[10], selector);
      w[50] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[49] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[48] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[47] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[46] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[45] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[44] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[43] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[42] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[41] = hc_byte_perm (    0, w[ 0], selector);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_byte_perm (w[20], w[21], selector);
      w[62] = hc_byte_perm (w[19], w[20], selector);
      w[61] = hc_byte_perm (w[18], w[19], selector);
      w[60] = hc_byte_perm (w[17], w[18], selector);
      w[59] = hc_byte_perm (w[16], w[17], selector);
      w[58] = hc_byte_perm (w[15], w[16], selector);
      w[57] = hc_byte_perm (w[14], w[15], selector);
      w[56] = hc_byte_perm (w[13], w[14], selector);
      w[55] = hc_byte_perm (w[12], w[13], selector);
      w[54] = hc_byte_perm (w[11], w[12], selector);
      w[53] = hc_byte_perm (w[10], w[11], selector);
      w[52] = hc_byte_perm (w[ 9], w[10], selector);
      w[51] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[50] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[49] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[48] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[47] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[46] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[45] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[44] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[43] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[42] = hc_byte_perm (    0, w[ 0], selector);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_byte_perm (w[19], w[20], selector);
      w[62] = hc_byte_perm (w[18], w[19], selector);
      w[61] = hc_byte_perm (w[17], w[18], selector);
      w[60] = hc_byte_perm (w[16], w[17], selector);
      w[59] = hc_byte_perm (w[15], w[16], selector);
      w[58] = hc_byte_perm (w[14], w[15], selector);
      w[57] = hc_byte_perm (w[13], w[14], selector);
      w[56] = hc_byte_perm (w[12], w[13], selector);
      w[55] = hc_byte_perm (w[11], w[12], selector);
      w[54] = hc_byte_perm (w[10], w[11], selector);
      w[53] = hc_byte_perm (w[ 9], w[10], selector);
      w[52] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[51] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[50] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[49] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[48] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[47] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[46] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[45] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[44] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[43] = hc_byte_perm (    0, w[ 0], selector);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_byte_perm (w[18], w[19], selector);
      w[62] = hc_byte_perm (w[17], w[18], selector);
      w[61] = hc_byte_perm (w[16], w[17], selector);
      w[60] = hc_byte_perm (w[15], w[16], selector);
      w[59] = hc_byte_perm (w[14], w[15], selector);
      w[58] = hc_byte_perm (w[13], w[14], selector);
      w[57] = hc_byte_perm (w[12], w[13], selector);
      w[56] = hc_byte_perm (w[11], w[12], selector);
      w[55] = hc_byte_perm (w[10], w[11], selector);
      w[54] = hc_byte_perm (w[ 9], w[10], selector);
      w[53] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[52] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[51] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[50] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[49] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[48] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[47] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[46] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[45] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[44] = hc_byte_perm (    0, w[ 0], selector);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_byte_perm (w[17], w[18], selector);
      w[62] = hc_byte_perm (w[16], w[17], selector);
      w[61] = hc_byte_perm (w[15], w[16], selector);
      w[60] = hc_byte_perm (w[14], w[15], selector);
      w[59] = hc_byte_perm (w[13], w[14], selector);
      w[58] = hc_byte_perm (w[12], w[13], selector);
      w[57] = hc_byte_perm (w[11], w[12], selector);
      w[56] = hc_byte_perm (w[10], w[11], selector);
      w[55] = hc_byte_perm (w[ 9], w[10], selector);
      w[54] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[53] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[52] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[51] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[50] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[49] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[48] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[47] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[46] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[45] = hc_byte_perm (    0, w[ 0], selector);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_byte_perm (w[16], w[17], selector);
      w[62] = hc_byte_perm (w[15], w[16], selector);
      w[61] = hc_byte_perm (w[14], w[15], selector);
      w[60] = hc_byte_perm (w[13], w[14], selector);
      w[59] = hc_byte_perm (w[12], w[13], selector);
      w[58] = hc_byte_perm (w[11], w[12], selector);
      w[57] = hc_byte_perm (w[10], w[11], selector);
      w[56] = hc_byte_perm (w[ 9], w[10], selector);
      w[55] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[54] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[53] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[52] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[51] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[50] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[49] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[48] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[47] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[46] = hc_byte_perm (    0, w[ 0], selector);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_byte_perm (w[15], w[16], selector);
      w[62] = hc_byte_perm (w[14], w[15], selector);
      w[61] = hc_byte_perm (w[13], w[14], selector);
      w[60] = hc_byte_perm (w[12], w[13], selector);
      w[59] = hc_byte_perm (w[11], w[12], selector);
      w[58] = hc_byte_perm (w[10], w[11], selector);
      w[57] = hc_byte_perm (w[ 9], w[10], selector);
      w[56] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[55] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[54] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[53] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[52] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[51] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[50] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[49] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[48] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[47] = hc_byte_perm (    0, w[ 0], selector);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_byte_perm (w[14], w[15], selector);
      w[62] = hc_byte_perm (w[13], w[14], selector);
      w[61] = hc_byte_perm (w[12], w[13], selector);
      w[60] = hc_byte_perm (w[11], w[12], selector);
      w[59] = hc_byte_perm (w[10], w[11], selector);
      w[58] = hc_byte_perm (w[ 9], w[10], selector);
      w[57] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[56] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[55] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[54] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[53] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[52] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[51] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[50] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[49] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[48] = hc_byte_perm (    0, w[ 0], selector);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_byte_perm (w[13], w[14], selector);
      w[62] = hc_byte_perm (w[12], w[13], selector);
      w[61] = hc_byte_perm (w[11], w[12], selector);
      w[60] = hc_byte_perm (w[10], w[11], selector);
      w[59] = hc_byte_perm (w[ 9], w[10], selector);
      w[58] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[57] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[56] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[55] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[54] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[53] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[52] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[51] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[50] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[49] = hc_byte_perm (    0, w[ 0], selector);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_byte_perm (w[12], w[13], selector);
      w[62] = hc_byte_perm (w[11], w[12], selector);
      w[61] = hc_byte_perm (w[10], w[11], selector);
      w[60] = hc_byte_perm (w[ 9], w[10], selector);
      w[59] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[58] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[57] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[56] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[55] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[54] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[53] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[52] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[51] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[50] = hc_byte_perm (    0, w[ 0], selector);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_byte_perm (w[11], w[12], selector);
      w[62] = hc_byte_perm (w[10], w[11], selector);
      w[61] = hc_byte_perm (w[ 9], w[10], selector);
      w[60] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[59] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[58] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[57] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[56] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[55] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[54] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[53] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[52] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[51] = hc_byte_perm (    0, w[ 0], selector);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_byte_perm (w[10], w[11], selector);
      w[62] = hc_byte_perm (w[ 9], w[10], selector);
      w[61] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[60] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[59] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[58] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[57] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[56] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[55] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[54] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[53] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[52] = hc_byte_perm (    0, w[ 0], selector);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_byte_perm (w[ 9], w[10], selector);
      w[62] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[61] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[60] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[59] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[58] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[57] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[56] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[55] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[54] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[53] = hc_byte_perm (    0, w[ 0], selector);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[62] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[61] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[60] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[59] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[58] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[57] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[56] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[55] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[54] = hc_byte_perm (    0, w[ 0], selector);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[62] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[61] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[60] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[59] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[58] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[57] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[56] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[55] = hc_byte_perm (    0, w[ 0], selector);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[62] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[61] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[60] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[59] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[58] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[57] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[56] = hc_byte_perm (    0, w[ 0], selector);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[62] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[61] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[60] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[59] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[58] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[57] = hc_byte_perm (    0, w[ 0], selector);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[62] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[61] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[60] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[59] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[58] = hc_byte_perm (    0, w[ 0], selector);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[62] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[61] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[60] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[59] = hc_byte_perm (    0, w[ 0], selector);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[62] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[61] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[60] = hc_byte_perm (    0, w[ 0], selector);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[62] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[61] = hc_byte_perm (    0, w[ 0], selector);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[62] = hc_byte_perm (    0, w[ 0], selector);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_byte_perm (    0, w[ 0], selector);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w[63] = hc_bytealign (w[62], w[63], offset);
      w[62] = hc_bytealign (w[61], w[62], offset);
      w[61] = hc_bytealign (w[60], w[61], offset);
      w[60] = hc_bytealign (w[59], w[60], offset);
      w[59] = hc_bytealign (w[58], w[59], offset);
      w[58] = hc_bytealign (w[57], w[58], offset);
      w[57] = hc_bytealign (w[56], w[57], offset);
      w[56] = hc_bytealign (w[55], w[56], offset);
      w[55] = hc_bytealign (w[54], w[55], offset);
      w[54] = hc_bytealign (w[53], w[54], offset);
      w[53] = hc_bytealign (w[52], w[53], offset);
      w[52] = hc_bytealign (w[51], w[52], offset);
      w[51] = hc_bytealign (w[50], w[51], offset);
      w[50] = hc_bytealign (w[49], w[50], offset);
      w[49] = hc_bytealign (w[48], w[49], offset);
      w[48] = hc_bytealign (w[47], w[48], offset);
      w[47] = hc_bytealign (w[46], w[47], offset);
      w[46] = hc_bytealign (w[45], w[46], offset);
      w[45] = hc_bytealign (w[44], w[45], offset);
      w[44] = hc_bytealign (w[43], w[44], offset);
      w[43] = hc_bytealign (w[42], w[43], offset);
      w[42] = hc_bytealign (w[41], w[42], offset);
      w[41] = hc_bytealign (w[40], w[41], offset);
      w[40] = hc_bytealign (w[39], w[40], offset);
      w[39] = hc_bytealign (w[38], w[39], offset);
      w[38] = hc_bytealign (w[37], w[38], offset);
      w[37] = hc_bytealign (w[36], w[37], offset);
      w[36] = hc_bytealign (w[35], w[36], offset);
      w[35] = hc_bytealign (w[34], w[35], offset);
      w[34] = hc_bytealign (w[33], w[34], offset);
      w[33] = hc_bytealign (w[32], w[33], offset);
      w[32] = hc_bytealign (w[31], w[32], offset);
      w[31] = hc_bytealign (w[30], w[31], offset);
      w[30] = hc_bytealign (w[29], w[30], offset);
      w[29] = hc_bytealign (w[28], w[29], offset);
      w[28] = hc_bytealign (w[27], w[28], offset);
      w[27] = hc_bytealign (w[26], w[27], offset);
      w[26] = hc_bytealign (w[25], w[26], offset);
      w[25] = hc_bytealign (w[24], w[25], offset);
      w[24] = hc_bytealign (w[23], w[24], offset);
      w[23] = hc_bytealign (w[22], w[23], offset);
      w[22] = hc_bytealign (w[21], w[22], offset);
      w[21] = hc_bytealign (w[20], w[21], offset);
      w[20] = hc_bytealign (w[19], w[20], offset);
      w[19] = hc_bytealign (w[18], w[19], offset);
      w[18] = hc_bytealign (w[17], w[18], offset);
      w[17] = hc_bytealign (w[16], w[17], offset);
      w[16] = hc_bytealign (w[15], w[16], offset);
      w[15] = hc_bytealign (w[14], w[15], offset);
      w[14] = hc_bytealign (w[13], w[14], offset);
      w[13] = hc_bytealign (w[12], w[13], offset);
      w[12] = hc_bytealign (w[11], w[12], offset);
      w[11] = hc_bytealign (w[10], w[11], offset);
      w[10] = hc_bytealign (w[ 9], w[10], offset);
      w[ 9] = hc_bytealign (w[ 8], w[ 9], offset);
      w[ 8] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 7] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 6] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 5] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 4] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 3] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 2] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 1] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 0] = hc_bytealign (    0, w[ 0], offset);

      break;

    case  1:
      w[63] = hc_bytealign (w[61], w[62], offset);
      w[62] = hc_bytealign (w[60], w[61], offset);
      w[61] = hc_bytealign (w[59], w[60], offset);
      w[60] = hc_bytealign (w[58], w[59], offset);
      w[59] = hc_bytealign (w[57], w[58], offset);
      w[58] = hc_bytealign (w[56], w[57], offset);
      w[57] = hc_bytealign (w[55], w[56], offset);
      w[56] = hc_bytealign (w[54], w[55], offset);
      w[55] = hc_bytealign (w[53], w[54], offset);
      w[54] = hc_bytealign (w[52], w[53], offset);
      w[53] = hc_bytealign (w[51], w[52], offset);
      w[52] = hc_bytealign (w[50], w[51], offset);
      w[51] = hc_bytealign (w[49], w[50], offset);
      w[50] = hc_bytealign (w[48], w[49], offset);
      w[49] = hc_bytealign (w[47], w[48], offset);
      w[48] = hc_bytealign (w[46], w[47], offset);
      w[47] = hc_bytealign (w[45], w[46], offset);
      w[46] = hc_bytealign (w[44], w[45], offset);
      w[45] = hc_bytealign (w[43], w[44], offset);
      w[44] = hc_bytealign (w[42], w[43], offset);
      w[43] = hc_bytealign (w[41], w[42], offset);
      w[42] = hc_bytealign (w[40], w[41], offset);
      w[41] = hc_bytealign (w[39], w[40], offset);
      w[40] = hc_bytealign (w[38], w[39], offset);
      w[39] = hc_bytealign (w[37], w[38], offset);
      w[38] = hc_bytealign (w[36], w[37], offset);
      w[37] = hc_bytealign (w[35], w[36], offset);
      w[36] = hc_bytealign (w[34], w[35], offset);
      w[35] = hc_bytealign (w[33], w[34], offset);
      w[34] = hc_bytealign (w[32], w[33], offset);
      w[33] = hc_bytealign (w[31], w[32], offset);
      w[32] = hc_bytealign (w[30], w[31], offset);
      w[31] = hc_bytealign (w[29], w[30], offset);
      w[30] = hc_bytealign (w[28], w[29], offset);
      w[29] = hc_bytealign (w[27], w[28], offset);
      w[28] = hc_bytealign (w[26], w[27], offset);
      w[27] = hc_bytealign (w[25], w[26], offset);
      w[26] = hc_bytealign (w[24], w[25], offset);
      w[25] = hc_bytealign (w[23], w[24], offset);
      w[24] = hc_bytealign (w[22], w[23], offset);
      w[23] = hc_bytealign (w[21], w[22], offset);
      w[22] = hc_bytealign (w[20], w[21], offset);
      w[21] = hc_bytealign (w[19], w[20], offset);
      w[20] = hc_bytealign (w[18], w[19], offset);
      w[19] = hc_bytealign (w[17], w[18], offset);
      w[18] = hc_bytealign (w[16], w[17], offset);
      w[17] = hc_bytealign (w[15], w[16], offset);
      w[16] = hc_bytealign (w[14], w[15], offset);
      w[15] = hc_bytealign (w[13], w[14], offset);
      w[14] = hc_bytealign (w[12], w[13], offset);
      w[13] = hc_bytealign (w[11], w[12], offset);
      w[12] = hc_bytealign (w[10], w[11], offset);
      w[11] = hc_bytealign (w[ 9], w[10], offset);
      w[10] = hc_bytealign (w[ 8], w[ 9], offset);
      w[ 9] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 8] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 7] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 6] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 5] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 4] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 3] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 2] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 1] = hc_bytealign (    0, w[ 0], offset);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_bytealign (w[60], w[61], offset);
      w[62] = hc_bytealign (w[59], w[60], offset);
      w[61] = hc_bytealign (w[58], w[59], offset);
      w[60] = hc_bytealign (w[57], w[58], offset);
      w[59] = hc_bytealign (w[56], w[57], offset);
      w[58] = hc_bytealign (w[55], w[56], offset);
      w[57] = hc_bytealign (w[54], w[55], offset);
      w[56] = hc_bytealign (w[53], w[54], offset);
      w[55] = hc_bytealign (w[52], w[53], offset);
      w[54] = hc_bytealign (w[51], w[52], offset);
      w[53] = hc_bytealign (w[50], w[51], offset);
      w[52] = hc_bytealign (w[49], w[50], offset);
      w[51] = hc_bytealign (w[48], w[49], offset);
      w[50] = hc_bytealign (w[47], w[48], offset);
      w[49] = hc_bytealign (w[46], w[47], offset);
      w[48] = hc_bytealign (w[45], w[46], offset);
      w[47] = hc_bytealign (w[44], w[45], offset);
      w[46] = hc_bytealign (w[43], w[44], offset);
      w[45] = hc_bytealign (w[42], w[43], offset);
      w[44] = hc_bytealign (w[41], w[42], offset);
      w[43] = hc_bytealign (w[40], w[41], offset);
      w[42] = hc_bytealign (w[39], w[40], offset);
      w[41] = hc_bytealign (w[38], w[39], offset);
      w[40] = hc_bytealign (w[37], w[38], offset);
      w[39] = hc_bytealign (w[36], w[37], offset);
      w[38] = hc_bytealign (w[35], w[36], offset);
      w[37] = hc_bytealign (w[34], w[35], offset);
      w[36] = hc_bytealign (w[33], w[34], offset);
      w[35] = hc_bytealign (w[32], w[33], offset);
      w[34] = hc_bytealign (w[31], w[32], offset);
      w[33] = hc_bytealign (w[30], w[31], offset);
      w[32] = hc_bytealign (w[29], w[30], offset);
      w[31] = hc_bytealign (w[28], w[29], offset);
      w[30] = hc_bytealign (w[27], w[28], offset);
      w[29] = hc_bytealign (w[26], w[27], offset);
      w[28] = hc_bytealign (w[25], w[26], offset);
      w[27] = hc_bytealign (w[24], w[25], offset);
      w[26] = hc_bytealign (w[23], w[24], offset);
      w[25] = hc_bytealign (w[22], w[23], offset);
      w[24] = hc_bytealign (w[21], w[22], offset);
      w[23] = hc_bytealign (w[20], w[21], offset);
      w[22] = hc_bytealign (w[19], w[20], offset);
      w[21] = hc_bytealign (w[18], w[19], offset);
      w[20] = hc_bytealign (w[17], w[18], offset);
      w[19] = hc_bytealign (w[16], w[17], offset);
      w[18] = hc_bytealign (w[15], w[16], offset);
      w[17] = hc_bytealign (w[14], w[15], offset);
      w[16] = hc_bytealign (w[13], w[14], offset);
      w[15] = hc_bytealign (w[12], w[13], offset);
      w[14] = hc_bytealign (w[11], w[12], offset);
      w[13] = hc_bytealign (w[10], w[11], offset);
      w[12] = hc_bytealign (w[ 9], w[10], offset);
      w[11] = hc_bytealign (w[ 8], w[ 9], offset);
      w[10] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 9] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 8] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 7] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 6] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 5] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 4] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 3] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 2] = hc_bytealign (    0, w[ 0], offset);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_bytealign (w[59], w[60], offset);
      w[62] = hc_bytealign (w[58], w[59], offset);
      w[61] = hc_bytealign (w[57], w[58], offset);
      w[60] = hc_bytealign (w[56], w[57], offset);
      w[59] = hc_bytealign (w[55], w[56], offset);
      w[58] = hc_bytealign (w[54], w[55], offset);
      w[57] = hc_bytealign (w[53], w[54], offset);
      w[56] = hc_bytealign (w[52], w[53], offset);
      w[55] = hc_bytealign (w[51], w[52], offset);
      w[54] = hc_bytealign (w[50], w[51], offset);
      w[53] = hc_bytealign (w[49], w[50], offset);
      w[52] = hc_bytealign (w[48], w[49], offset);
      w[51] = hc_bytealign (w[47], w[48], offset);
      w[50] = hc_bytealign (w[46], w[47], offset);
      w[49] = hc_bytealign (w[45], w[46], offset);
      w[48] = hc_bytealign (w[44], w[45], offset);
      w[47] = hc_bytealign (w[43], w[44], offset);
      w[46] = hc_bytealign (w[42], w[43], offset);
      w[45] = hc_bytealign (w[41], w[42], offset);
      w[44] = hc_bytealign (w[40], w[41], offset);
      w[43] = hc_bytealign (w[39], w[40], offset);
      w[42] = hc_bytealign (w[38], w[39], offset);
      w[41] = hc_bytealign (w[37], w[38], offset);
      w[40] = hc_bytealign (w[36], w[37], offset);
      w[39] = hc_bytealign (w[35], w[36], offset);
      w[38] = hc_bytealign (w[34], w[35], offset);
      w[37] = hc_bytealign (w[33], w[34], offset);
      w[36] = hc_bytealign (w[32], w[33], offset);
      w[35] = hc_bytealign (w[31], w[32], offset);
      w[34] = hc_bytealign (w[30], w[31], offset);
      w[33] = hc_bytealign (w[29], w[30], offset);
      w[32] = hc_bytealign (w[28], w[29], offset);
      w[31] = hc_bytealign (w[27], w[28], offset);
      w[30] = hc_bytealign (w[26], w[27], offset);
      w[29] = hc_bytealign (w[25], w[26], offset);
      w[28] = hc_bytealign (w[24], w[25], offset);
      w[27] = hc_bytealign (w[23], w[24], offset);
      w[26] = hc_bytealign (w[22], w[23], offset);
      w[25] = hc_bytealign (w[21], w[22], offset);
      w[24] = hc_bytealign (w[20], w[21], offset);
      w[23] = hc_bytealign (w[19], w[20], offset);
      w[22] = hc_bytealign (w[18], w[19], offset);
      w[21] = hc_bytealign (w[17], w[18], offset);
      w[20] = hc_bytealign (w[16], w[17], offset);
      w[19] = hc_bytealign (w[15], w[16], offset);
      w[18] = hc_bytealign (w[14], w[15], offset);
      w[17] = hc_bytealign (w[13], w[14], offset);
      w[16] = hc_bytealign (w[12], w[13], offset);
      w[15] = hc_bytealign (w[11], w[12], offset);
      w[14] = hc_bytealign (w[10], w[11], offset);
      w[13] = hc_bytealign (w[ 9], w[10], offset);
      w[12] = hc_bytealign (w[ 8], w[ 9], offset);
      w[11] = hc_bytealign (w[ 7], w[ 8], offset);
      w[10] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 9] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 8] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 7] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 6] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 5] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 4] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 3] = hc_bytealign (    0, w[ 0], offset);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_bytealign (w[58], w[59], offset);
      w[62] = hc_bytealign (w[57], w[58], offset);
      w[61] = hc_bytealign (w[56], w[57], offset);
      w[60] = hc_bytealign (w[55], w[56], offset);
      w[59] = hc_bytealign (w[54], w[55], offset);
      w[58] = hc_bytealign (w[53], w[54], offset);
      w[57] = hc_bytealign (w[52], w[53], offset);
      w[56] = hc_bytealign (w[51], w[52], offset);
      w[55] = hc_bytealign (w[50], w[51], offset);
      w[54] = hc_bytealign (w[49], w[50], offset);
      w[53] = hc_bytealign (w[48], w[49], offset);
      w[52] = hc_bytealign (w[47], w[48], offset);
      w[51] = hc_bytealign (w[46], w[47], offset);
      w[50] = hc_bytealign (w[45], w[46], offset);
      w[49] = hc_bytealign (w[44], w[45], offset);
      w[48] = hc_bytealign (w[43], w[44], offset);
      w[47] = hc_bytealign (w[42], w[43], offset);
      w[46] = hc_bytealign (w[41], w[42], offset);
      w[45] = hc_bytealign (w[40], w[41], offset);
      w[44] = hc_bytealign (w[39], w[40], offset);
      w[43] = hc_bytealign (w[38], w[39], offset);
      w[42] = hc_bytealign (w[37], w[38], offset);
      w[41] = hc_bytealign (w[36], w[37], offset);
      w[40] = hc_bytealign (w[35], w[36], offset);
      w[39] = hc_bytealign (w[34], w[35], offset);
      w[38] = hc_bytealign (w[33], w[34], offset);
      w[37] = hc_bytealign (w[32], w[33], offset);
      w[36] = hc_bytealign (w[31], w[32], offset);
      w[35] = hc_bytealign (w[30], w[31], offset);
      w[34] = hc_bytealign (w[29], w[30], offset);
      w[33] = hc_bytealign (w[28], w[29], offset);
      w[32] = hc_bytealign (w[27], w[28], offset);
      w[31] = hc_bytealign (w[26], w[27], offset);
      w[30] = hc_bytealign (w[25], w[26], offset);
      w[29] = hc_bytealign (w[24], w[25], offset);
      w[28] = hc_bytealign (w[23], w[24], offset);
      w[27] = hc_bytealign (w[22], w[23], offset);
      w[26] = hc_bytealign (w[21], w[22], offset);
      w[25] = hc_bytealign (w[20], w[21], offset);
      w[24] = hc_bytealign (w[19], w[20], offset);
      w[23] = hc_bytealign (w[18], w[19], offset);
      w[22] = hc_bytealign (w[17], w[18], offset);
      w[21] = hc_bytealign (w[16], w[17], offset);
      w[20] = hc_bytealign (w[15], w[16], offset);
      w[19] = hc_bytealign (w[14], w[15], offset);
      w[18] = hc_bytealign (w[13], w[14], offset);
      w[17] = hc_bytealign (w[12], w[13], offset);
      w[16] = hc_bytealign (w[11], w[12], offset);
      w[15] = hc_bytealign (w[10], w[11], offset);
      w[14] = hc_bytealign (w[ 9], w[10], offset);
      w[13] = hc_bytealign (w[ 8], w[ 9], offset);
      w[12] = hc_bytealign (w[ 7], w[ 8], offset);
      w[11] = hc_bytealign (w[ 6], w[ 7], offset);
      w[10] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 9] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 8] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 7] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 6] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 5] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 4] = hc_bytealign (    0, w[ 0], offset);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_bytealign (w[57], w[58], offset);
      w[62] = hc_bytealign (w[56], w[57], offset);
      w[61] = hc_bytealign (w[55], w[56], offset);
      w[60] = hc_bytealign (w[54], w[55], offset);
      w[59] = hc_bytealign (w[53], w[54], offset);
      w[58] = hc_bytealign (w[52], w[53], offset);
      w[57] = hc_bytealign (w[51], w[52], offset);
      w[56] = hc_bytealign (w[50], w[51], offset);
      w[55] = hc_bytealign (w[49], w[50], offset);
      w[54] = hc_bytealign (w[48], w[49], offset);
      w[53] = hc_bytealign (w[47], w[48], offset);
      w[52] = hc_bytealign (w[46], w[47], offset);
      w[51] = hc_bytealign (w[45], w[46], offset);
      w[50] = hc_bytealign (w[44], w[45], offset);
      w[49] = hc_bytealign (w[43], w[44], offset);
      w[48] = hc_bytealign (w[42], w[43], offset);
      w[47] = hc_bytealign (w[41], w[42], offset);
      w[46] = hc_bytealign (w[40], w[41], offset);
      w[45] = hc_bytealign (w[39], w[40], offset);
      w[44] = hc_bytealign (w[38], w[39], offset);
      w[43] = hc_bytealign (w[37], w[38], offset);
      w[42] = hc_bytealign (w[36], w[37], offset);
      w[41] = hc_bytealign (w[35], w[36], offset);
      w[40] = hc_bytealign (w[34], w[35], offset);
      w[39] = hc_bytealign (w[33], w[34], offset);
      w[38] = hc_bytealign (w[32], w[33], offset);
      w[37] = hc_bytealign (w[31], w[32], offset);
      w[36] = hc_bytealign (w[30], w[31], offset);
      w[35] = hc_bytealign (w[29], w[30], offset);
      w[34] = hc_bytealign (w[28], w[29], offset);
      w[33] = hc_bytealign (w[27], w[28], offset);
      w[32] = hc_bytealign (w[26], w[27], offset);
      w[31] = hc_bytealign (w[25], w[26], offset);
      w[30] = hc_bytealign (w[24], w[25], offset);
      w[29] = hc_bytealign (w[23], w[24], offset);
      w[28] = hc_bytealign (w[22], w[23], offset);
      w[27] = hc_bytealign (w[21], w[22], offset);
      w[26] = hc_bytealign (w[20], w[21], offset);
      w[25] = hc_bytealign (w[19], w[20], offset);
      w[24] = hc_bytealign (w[18], w[19], offset);
      w[23] = hc_bytealign (w[17], w[18], offset);
      w[22] = hc_bytealign (w[16], w[17], offset);
      w[21] = hc_bytealign (w[15], w[16], offset);
      w[20] = hc_bytealign (w[14], w[15], offset);
      w[19] = hc_bytealign (w[13], w[14], offset);
      w[18] = hc_bytealign (w[12], w[13], offset);
      w[17] = hc_bytealign (w[11], w[12], offset);
      w[16] = hc_bytealign (w[10], w[11], offset);
      w[15] = hc_bytealign (w[ 9], w[10], offset);
      w[14] = hc_bytealign (w[ 8], w[ 9], offset);
      w[13] = hc_bytealign (w[ 7], w[ 8], offset);
      w[12] = hc_bytealign (w[ 6], w[ 7], offset);
      w[11] = hc_bytealign (w[ 5], w[ 6], offset);
      w[10] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 9] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 8] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 7] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 6] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 5] = hc_bytealign (    0, w[ 0], offset);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_bytealign (w[56], w[57], offset);
      w[62] = hc_bytealign (w[55], w[56], offset);
      w[61] = hc_bytealign (w[54], w[55], offset);
      w[60] = hc_bytealign (w[53], w[54], offset);
      w[59] = hc_bytealign (w[52], w[53], offset);
      w[58] = hc_bytealign (w[51], w[52], offset);
      w[57] = hc_bytealign (w[50], w[51], offset);
      w[56] = hc_bytealign (w[49], w[50], offset);
      w[55] = hc_bytealign (w[48], w[49], offset);
      w[54] = hc_bytealign (w[47], w[48], offset);
      w[53] = hc_bytealign (w[46], w[47], offset);
      w[52] = hc_bytealign (w[45], w[46], offset);
      w[51] = hc_bytealign (w[44], w[45], offset);
      w[50] = hc_bytealign (w[43], w[44], offset);
      w[49] = hc_bytealign (w[42], w[43], offset);
      w[48] = hc_bytealign (w[41], w[42], offset);
      w[47] = hc_bytealign (w[40], w[41], offset);
      w[46] = hc_bytealign (w[39], w[40], offset);
      w[45] = hc_bytealign (w[38], w[39], offset);
      w[44] = hc_bytealign (w[37], w[38], offset);
      w[43] = hc_bytealign (w[36], w[37], offset);
      w[42] = hc_bytealign (w[35], w[36], offset);
      w[41] = hc_bytealign (w[34], w[35], offset);
      w[40] = hc_bytealign (w[33], w[34], offset);
      w[39] = hc_bytealign (w[32], w[33], offset);
      w[38] = hc_bytealign (w[31], w[32], offset);
      w[37] = hc_bytealign (w[30], w[31], offset);
      w[36] = hc_bytealign (w[29], w[30], offset);
      w[35] = hc_bytealign (w[28], w[29], offset);
      w[34] = hc_bytealign (w[27], w[28], offset);
      w[33] = hc_bytealign (w[26], w[27], offset);
      w[32] = hc_bytealign (w[25], w[26], offset);
      w[31] = hc_bytealign (w[24], w[25], offset);
      w[30] = hc_bytealign (w[23], w[24], offset);
      w[29] = hc_bytealign (w[22], w[23], offset);
      w[28] = hc_bytealign (w[21], w[22], offset);
      w[27] = hc_bytealign (w[20], w[21], offset);
      w[26] = hc_bytealign (w[19], w[20], offset);
      w[25] = hc_bytealign (w[18], w[19], offset);
      w[24] = hc_bytealign (w[17], w[18], offset);
      w[23] = hc_bytealign (w[16], w[17], offset);
      w[22] = hc_bytealign (w[15], w[16], offset);
      w[21] = hc_bytealign (w[14], w[15], offset);
      w[20] = hc_bytealign (w[13], w[14], offset);
      w[19] = hc_bytealign (w[12], w[13], offset);
      w[18] = hc_bytealign (w[11], w[12], offset);
      w[17] = hc_bytealign (w[10], w[11], offset);
      w[16] = hc_bytealign (w[ 9], w[10], offset);
      w[15] = hc_bytealign (w[ 8], w[ 9], offset);
      w[14] = hc_bytealign (w[ 7], w[ 8], offset);
      w[13] = hc_bytealign (w[ 6], w[ 7], offset);
      w[12] = hc_bytealign (w[ 5], w[ 6], offset);
      w[11] = hc_bytealign (w[ 4], w[ 5], offset);
      w[10] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 9] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 8] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 7] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 6] = hc_bytealign (    0, w[ 0], offset);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_bytealign (w[55], w[56], offset);
      w[62] = hc_bytealign (w[54], w[55], offset);
      w[61] = hc_bytealign (w[53], w[54], offset);
      w[60] = hc_bytealign (w[52], w[53], offset);
      w[59] = hc_bytealign (w[51], w[52], offset);
      w[58] = hc_bytealign (w[50], w[51], offset);
      w[57] = hc_bytealign (w[49], w[50], offset);
      w[56] = hc_bytealign (w[48], w[49], offset);
      w[55] = hc_bytealign (w[47], w[48], offset);
      w[54] = hc_bytealign (w[46], w[47], offset);
      w[53] = hc_bytealign (w[45], w[46], offset);
      w[52] = hc_bytealign (w[44], w[45], offset);
      w[51] = hc_bytealign (w[43], w[44], offset);
      w[50] = hc_bytealign (w[42], w[43], offset);
      w[49] = hc_bytealign (w[41], w[42], offset);
      w[48] = hc_bytealign (w[40], w[41], offset);
      w[47] = hc_bytealign (w[39], w[40], offset);
      w[46] = hc_bytealign (w[38], w[39], offset);
      w[45] = hc_bytealign (w[37], w[38], offset);
      w[44] = hc_bytealign (w[36], w[37], offset);
      w[43] = hc_bytealign (w[35], w[36], offset);
      w[42] = hc_bytealign (w[34], w[35], offset);
      w[41] = hc_bytealign (w[33], w[34], offset);
      w[40] = hc_bytealign (w[32], w[33], offset);
      w[39] = hc_bytealign (w[31], w[32], offset);
      w[38] = hc_bytealign (w[30], w[31], offset);
      w[37] = hc_bytealign (w[29], w[30], offset);
      w[36] = hc_bytealign (w[28], w[29], offset);
      w[35] = hc_bytealign (w[27], w[28], offset);
      w[34] = hc_bytealign (w[26], w[27], offset);
      w[33] = hc_bytealign (w[25], w[26], offset);
      w[32] = hc_bytealign (w[24], w[25], offset);
      w[31] = hc_bytealign (w[23], w[24], offset);
      w[30] = hc_bytealign (w[22], w[23], offset);
      w[29] = hc_bytealign (w[21], w[22], offset);
      w[28] = hc_bytealign (w[20], w[21], offset);
      w[27] = hc_bytealign (w[19], w[20], offset);
      w[26] = hc_bytealign (w[18], w[19], offset);
      w[25] = hc_bytealign (w[17], w[18], offset);
      w[24] = hc_bytealign (w[16], w[17], offset);
      w[23] = hc_bytealign (w[15], w[16], offset);
      w[22] = hc_bytealign (w[14], w[15], offset);
      w[21] = hc_bytealign (w[13], w[14], offset);
      w[20] = hc_bytealign (w[12], w[13], offset);
      w[19] = hc_bytealign (w[11], w[12], offset);
      w[18] = hc_bytealign (w[10], w[11], offset);
      w[17] = hc_bytealign (w[ 9], w[10], offset);
      w[16] = hc_bytealign (w[ 8], w[ 9], offset);
      w[15] = hc_bytealign (w[ 7], w[ 8], offset);
      w[14] = hc_bytealign (w[ 6], w[ 7], offset);
      w[13] = hc_bytealign (w[ 5], w[ 6], offset);
      w[12] = hc_bytealign (w[ 4], w[ 5], offset);
      w[11] = hc_bytealign (w[ 3], w[ 4], offset);
      w[10] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 9] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 8] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 7] = hc_bytealign (    0, w[ 0], offset);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_bytealign (w[54], w[55], offset);
      w[62] = hc_bytealign (w[53], w[54], offset);
      w[61] = hc_bytealign (w[52], w[53], offset);
      w[60] = hc_bytealign (w[51], w[52], offset);
      w[59] = hc_bytealign (w[50], w[51], offset);
      w[58] = hc_bytealign (w[49], w[50], offset);
      w[57] = hc_bytealign (w[48], w[49], offset);
      w[56] = hc_bytealign (w[47], w[48], offset);
      w[55] = hc_bytealign (w[46], w[47], offset);
      w[54] = hc_bytealign (w[45], w[46], offset);
      w[53] = hc_bytealign (w[44], w[45], offset);
      w[52] = hc_bytealign (w[43], w[44], offset);
      w[51] = hc_bytealign (w[42], w[43], offset);
      w[50] = hc_bytealign (w[41], w[42], offset);
      w[49] = hc_bytealign (w[40], w[41], offset);
      w[48] = hc_bytealign (w[39], w[40], offset);
      w[47] = hc_bytealign (w[38], w[39], offset);
      w[46] = hc_bytealign (w[37], w[38], offset);
      w[45] = hc_bytealign (w[36], w[37], offset);
      w[44] = hc_bytealign (w[35], w[36], offset);
      w[43] = hc_bytealign (w[34], w[35], offset);
      w[42] = hc_bytealign (w[33], w[34], offset);
      w[41] = hc_bytealign (w[32], w[33], offset);
      w[40] = hc_bytealign (w[31], w[32], offset);
      w[39] = hc_bytealign (w[30], w[31], offset);
      w[38] = hc_bytealign (w[29], w[30], offset);
      w[37] = hc_bytealign (w[28], w[29], offset);
      w[36] = hc_bytealign (w[27], w[28], offset);
      w[35] = hc_bytealign (w[26], w[27], offset);
      w[34] = hc_bytealign (w[25], w[26], offset);
      w[33] = hc_bytealign (w[24], w[25], offset);
      w[32] = hc_bytealign (w[23], w[24], offset);
      w[31] = hc_bytealign (w[22], w[23], offset);
      w[30] = hc_bytealign (w[21], w[22], offset);
      w[29] = hc_bytealign (w[20], w[21], offset);
      w[28] = hc_bytealign (w[19], w[20], offset);
      w[27] = hc_bytealign (w[18], w[19], offset);
      w[26] = hc_bytealign (w[17], w[18], offset);
      w[25] = hc_bytealign (w[16], w[17], offset);
      w[24] = hc_bytealign (w[15], w[16], offset);
      w[23] = hc_bytealign (w[14], w[15], offset);
      w[22] = hc_bytealign (w[13], w[14], offset);
      w[21] = hc_bytealign (w[12], w[13], offset);
      w[20] = hc_bytealign (w[11], w[12], offset);
      w[19] = hc_bytealign (w[10], w[11], offset);
      w[18] = hc_bytealign (w[ 9], w[10], offset);
      w[17] = hc_bytealign (w[ 8], w[ 9], offset);
      w[16] = hc_bytealign (w[ 7], w[ 8], offset);
      w[15] = hc_bytealign (w[ 6], w[ 7], offset);
      w[14] = hc_bytealign (w[ 5], w[ 6], offset);
      w[13] = hc_bytealign (w[ 4], w[ 5], offset);
      w[12] = hc_bytealign (w[ 3], w[ 4], offset);
      w[11] = hc_bytealign (w[ 2], w[ 3], offset);
      w[10] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 9] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 8] = hc_bytealign (    0, w[ 0], offset);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_bytealign (w[53], w[54], offset);
      w[62] = hc_bytealign (w[52], w[53], offset);
      w[61] = hc_bytealign (w[51], w[52], offset);
      w[60] = hc_bytealign (w[50], w[51], offset);
      w[59] = hc_bytealign (w[49], w[50], offset);
      w[58] = hc_bytealign (w[48], w[49], offset);
      w[57] = hc_bytealign (w[47], w[48], offset);
      w[56] = hc_bytealign (w[46], w[47], offset);
      w[55] = hc_bytealign (w[45], w[46], offset);
      w[54] = hc_bytealign (w[44], w[45], offset);
      w[53] = hc_bytealign (w[43], w[44], offset);
      w[52] = hc_bytealign (w[42], w[43], offset);
      w[51] = hc_bytealign (w[41], w[42], offset);
      w[50] = hc_bytealign (w[40], w[41], offset);
      w[49] = hc_bytealign (w[39], w[40], offset);
      w[48] = hc_bytealign (w[38], w[39], offset);
      w[47] = hc_bytealign (w[37], w[38], offset);
      w[46] = hc_bytealign (w[36], w[37], offset);
      w[45] = hc_bytealign (w[35], w[36], offset);
      w[44] = hc_bytealign (w[34], w[35], offset);
      w[43] = hc_bytealign (w[33], w[34], offset);
      w[42] = hc_bytealign (w[32], w[33], offset);
      w[41] = hc_bytealign (w[31], w[32], offset);
      w[40] = hc_bytealign (w[30], w[31], offset);
      w[39] = hc_bytealign (w[29], w[30], offset);
      w[38] = hc_bytealign (w[28], w[29], offset);
      w[37] = hc_bytealign (w[27], w[28], offset);
      w[36] = hc_bytealign (w[26], w[27], offset);
      w[35] = hc_bytealign (w[25], w[26], offset);
      w[34] = hc_bytealign (w[24], w[25], offset);
      w[33] = hc_bytealign (w[23], w[24], offset);
      w[32] = hc_bytealign (w[22], w[23], offset);
      w[31] = hc_bytealign (w[21], w[22], offset);
      w[30] = hc_bytealign (w[20], w[21], offset);
      w[29] = hc_bytealign (w[19], w[20], offset);
      w[28] = hc_bytealign (w[18], w[19], offset);
      w[27] = hc_bytealign (w[17], w[18], offset);
      w[26] = hc_bytealign (w[16], w[17], offset);
      w[25] = hc_bytealign (w[15], w[16], offset);
      w[24] = hc_bytealign (w[14], w[15], offset);
      w[23] = hc_bytealign (w[13], w[14], offset);
      w[22] = hc_bytealign (w[12], w[13], offset);
      w[21] = hc_bytealign (w[11], w[12], offset);
      w[20] = hc_bytealign (w[10], w[11], offset);
      w[19] = hc_bytealign (w[ 9], w[10], offset);
      w[18] = hc_bytealign (w[ 8], w[ 9], offset);
      w[17] = hc_bytealign (w[ 7], w[ 8], offset);
      w[16] = hc_bytealign (w[ 6], w[ 7], offset);
      w[15] = hc_bytealign (w[ 5], w[ 6], offset);
      w[14] = hc_bytealign (w[ 4], w[ 5], offset);
      w[13] = hc_bytealign (w[ 3], w[ 4], offset);
      w[12] = hc_bytealign (w[ 2], w[ 3], offset);
      w[11] = hc_bytealign (w[ 1], w[ 2], offset);
      w[10] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 9] = hc_bytealign (    0, w[ 0], offset);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_bytealign (w[52], w[53], offset);
      w[62] = hc_bytealign (w[51], w[52], offset);
      w[61] = hc_bytealign (w[50], w[51], offset);
      w[60] = hc_bytealign (w[49], w[50], offset);
      w[59] = hc_bytealign (w[48], w[49], offset);
      w[58] = hc_bytealign (w[47], w[48], offset);
      w[57] = hc_bytealign (w[46], w[47], offset);
      w[56] = hc_bytealign (w[45], w[46], offset);
      w[55] = hc_bytealign (w[44], w[45], offset);
      w[54] = hc_bytealign (w[43], w[44], offset);
      w[53] = hc_bytealign (w[42], w[43], offset);
      w[52] = hc_bytealign (w[41], w[42], offset);
      w[51] = hc_bytealign (w[40], w[41], offset);
      w[50] = hc_bytealign (w[39], w[40], offset);
      w[49] = hc_bytealign (w[38], w[39], offset);
      w[48] = hc_bytealign (w[37], w[38], offset);
      w[47] = hc_bytealign (w[36], w[37], offset);
      w[46] = hc_bytealign (w[35], w[36], offset);
      w[45] = hc_bytealign (w[34], w[35], offset);
      w[44] = hc_bytealign (w[33], w[34], offset);
      w[43] = hc_bytealign (w[32], w[33], offset);
      w[42] = hc_bytealign (w[31], w[32], offset);
      w[41] = hc_bytealign (w[30], w[31], offset);
      w[40] = hc_bytealign (w[29], w[30], offset);
      w[39] = hc_bytealign (w[28], w[29], offset);
      w[38] = hc_bytealign (w[27], w[28], offset);
      w[37] = hc_bytealign (w[26], w[27], offset);
      w[36] = hc_bytealign (w[25], w[26], offset);
      w[35] = hc_bytealign (w[24], w[25], offset);
      w[34] = hc_bytealign (w[23], w[24], offset);
      w[33] = hc_bytealign (w[22], w[23], offset);
      w[32] = hc_bytealign (w[21], w[22], offset);
      w[31] = hc_bytealign (w[20], w[21], offset);
      w[30] = hc_bytealign (w[19], w[20], offset);
      w[29] = hc_bytealign (w[18], w[19], offset);
      w[28] = hc_bytealign (w[17], w[18], offset);
      w[27] = hc_bytealign (w[16], w[17], offset);
      w[26] = hc_bytealign (w[15], w[16], offset);
      w[25] = hc_bytealign (w[14], w[15], offset);
      w[24] = hc_bytealign (w[13], w[14], offset);
      w[23] = hc_bytealign (w[12], w[13], offset);
      w[22] = hc_bytealign (w[11], w[12], offset);
      w[21] = hc_bytealign (w[10], w[11], offset);
      w[20] = hc_bytealign (w[ 9], w[10], offset);
      w[19] = hc_bytealign (w[ 8], w[ 9], offset);
      w[18] = hc_bytealign (w[ 7], w[ 8], offset);
      w[17] = hc_bytealign (w[ 6], w[ 7], offset);
      w[16] = hc_bytealign (w[ 5], w[ 6], offset);
      w[15] = hc_bytealign (w[ 4], w[ 5], offset);
      w[14] = hc_bytealign (w[ 3], w[ 4], offset);
      w[13] = hc_bytealign (w[ 2], w[ 3], offset);
      w[12] = hc_bytealign (w[ 1], w[ 2], offset);
      w[11] = hc_bytealign (w[ 0], w[ 1], offset);
      w[10] = hc_bytealign (    0, w[ 0], offset);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_bytealign (w[51], w[52], offset);
      w[62] = hc_bytealign (w[50], w[51], offset);
      w[61] = hc_bytealign (w[49], w[50], offset);
      w[60] = hc_bytealign (w[48], w[49], offset);
      w[59] = hc_bytealign (w[47], w[48], offset);
      w[58] = hc_bytealign (w[46], w[47], offset);
      w[57] = hc_bytealign (w[45], w[46], offset);
      w[56] = hc_bytealign (w[44], w[45], offset);
      w[55] = hc_bytealign (w[43], w[44], offset);
      w[54] = hc_bytealign (w[42], w[43], offset);
      w[53] = hc_bytealign (w[41], w[42], offset);
      w[52] = hc_bytealign (w[40], w[41], offset);
      w[51] = hc_bytealign (w[39], w[40], offset);
      w[50] = hc_bytealign (w[38], w[39], offset);
      w[49] = hc_bytealign (w[37], w[38], offset);
      w[48] = hc_bytealign (w[36], w[37], offset);
      w[47] = hc_bytealign (w[35], w[36], offset);
      w[46] = hc_bytealign (w[34], w[35], offset);
      w[45] = hc_bytealign (w[33], w[34], offset);
      w[44] = hc_bytealign (w[32], w[33], offset);
      w[43] = hc_bytealign (w[31], w[32], offset);
      w[42] = hc_bytealign (w[30], w[31], offset);
      w[41] = hc_bytealign (w[29], w[30], offset);
      w[40] = hc_bytealign (w[28], w[29], offset);
      w[39] = hc_bytealign (w[27], w[28], offset);
      w[38] = hc_bytealign (w[26], w[27], offset);
      w[37] = hc_bytealign (w[25], w[26], offset);
      w[36] = hc_bytealign (w[24], w[25], offset);
      w[35] = hc_bytealign (w[23], w[24], offset);
      w[34] = hc_bytealign (w[22], w[23], offset);
      w[33] = hc_bytealign (w[21], w[22], offset);
      w[32] = hc_bytealign (w[20], w[21], offset);
      w[31] = hc_bytealign (w[19], w[20], offset);
      w[30] = hc_bytealign (w[18], w[19], offset);
      w[29] = hc_bytealign (w[17], w[18], offset);
      w[28] = hc_bytealign (w[16], w[17], offset);
      w[27] = hc_bytealign (w[15], w[16], offset);
      w[26] = hc_bytealign (w[14], w[15], offset);
      w[25] = hc_bytealign (w[13], w[14], offset);
      w[24] = hc_bytealign (w[12], w[13], offset);
      w[23] = hc_bytealign (w[11], w[12], offset);
      w[22] = hc_bytealign (w[10], w[11], offset);
      w[21] = hc_bytealign (w[ 9], w[10], offset);
      w[20] = hc_bytealign (w[ 8], w[ 9], offset);
      w[19] = hc_bytealign (w[ 7], w[ 8], offset);
      w[18] = hc_bytealign (w[ 6], w[ 7], offset);
      w[17] = hc_bytealign (w[ 5], w[ 6], offset);
      w[16] = hc_bytealign (w[ 4], w[ 5], offset);
      w[15] = hc_bytealign (w[ 3], w[ 4], offset);
      w[14] = hc_bytealign (w[ 2], w[ 3], offset);
      w[13] = hc_bytealign (w[ 1], w[ 2], offset);
      w[12] = hc_bytealign (w[ 0], w[ 1], offset);
      w[11] = hc_bytealign (    0, w[ 0], offset);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_bytealign (w[50], w[51], offset);
      w[62] = hc_bytealign (w[49], w[50], offset);
      w[61] = hc_bytealign (w[48], w[49], offset);
      w[60] = hc_bytealign (w[47], w[48], offset);
      w[59] = hc_bytealign (w[46], w[47], offset);
      w[58] = hc_bytealign (w[45], w[46], offset);
      w[57] = hc_bytealign (w[44], w[45], offset);
      w[56] = hc_bytealign (w[43], w[44], offset);
      w[55] = hc_bytealign (w[42], w[43], offset);
      w[54] = hc_bytealign (w[41], w[42], offset);
      w[53] = hc_bytealign (w[40], w[41], offset);
      w[52] = hc_bytealign (w[39], w[40], offset);
      w[51] = hc_bytealign (w[38], w[39], offset);
      w[50] = hc_bytealign (w[37], w[38], offset);
      w[49] = hc_bytealign (w[36], w[37], offset);
      w[48] = hc_bytealign (w[35], w[36], offset);
      w[47] = hc_bytealign (w[34], w[35], offset);
      w[46] = hc_bytealign (w[33], w[34], offset);
      w[45] = hc_bytealign (w[32], w[33], offset);
      w[44] = hc_bytealign (w[31], w[32], offset);
      w[43] = hc_bytealign (w[30], w[31], offset);
      w[42] = hc_bytealign (w[29], w[30], offset);
      w[41] = hc_bytealign (w[28], w[29], offset);
      w[40] = hc_bytealign (w[27], w[28], offset);
      w[39] = hc_bytealign (w[26], w[27], offset);
      w[38] = hc_bytealign (w[25], w[26], offset);
      w[37] = hc_bytealign (w[24], w[25], offset);
      w[36] = hc_bytealign (w[23], w[24], offset);
      w[35] = hc_bytealign (w[22], w[23], offset);
      w[34] = hc_bytealign (w[21], w[22], offset);
      w[33] = hc_bytealign (w[20], w[21], offset);
      w[32] = hc_bytealign (w[19], w[20], offset);
      w[31] = hc_bytealign (w[18], w[19], offset);
      w[30] = hc_bytealign (w[17], w[18], offset);
      w[29] = hc_bytealign (w[16], w[17], offset);
      w[28] = hc_bytealign (w[15], w[16], offset);
      w[27] = hc_bytealign (w[14], w[15], offset);
      w[26] = hc_bytealign (w[13], w[14], offset);
      w[25] = hc_bytealign (w[12], w[13], offset);
      w[24] = hc_bytealign (w[11], w[12], offset);
      w[23] = hc_bytealign (w[10], w[11], offset);
      w[22] = hc_bytealign (w[ 9], w[10], offset);
      w[21] = hc_bytealign (w[ 8], w[ 9], offset);
      w[20] = hc_bytealign (w[ 7], w[ 8], offset);
      w[19] = hc_bytealign (w[ 6], w[ 7], offset);
      w[18] = hc_bytealign (w[ 5], w[ 6], offset);
      w[17] = hc_bytealign (w[ 4], w[ 5], offset);
      w[16] = hc_bytealign (w[ 3], w[ 4], offset);
      w[15] = hc_bytealign (w[ 2], w[ 3], offset);
      w[14] = hc_bytealign (w[ 1], w[ 2], offset);
      w[13] = hc_bytealign (w[ 0], w[ 1], offset);
      w[12] = hc_bytealign (    0, w[ 0], offset);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_bytealign (w[49], w[50], offset);
      w[62] = hc_bytealign (w[48], w[49], offset);
      w[61] = hc_bytealign (w[47], w[48], offset);
      w[60] = hc_bytealign (w[46], w[47], offset);
      w[59] = hc_bytealign (w[45], w[46], offset);
      w[58] = hc_bytealign (w[44], w[45], offset);
      w[57] = hc_bytealign (w[43], w[44], offset);
      w[56] = hc_bytealign (w[42], w[43], offset);
      w[55] = hc_bytealign (w[41], w[42], offset);
      w[54] = hc_bytealign (w[40], w[41], offset);
      w[53] = hc_bytealign (w[39], w[40], offset);
      w[52] = hc_bytealign (w[38], w[39], offset);
      w[51] = hc_bytealign (w[37], w[38], offset);
      w[50] = hc_bytealign (w[36], w[37], offset);
      w[49] = hc_bytealign (w[35], w[36], offset);
      w[48] = hc_bytealign (w[34], w[35], offset);
      w[47] = hc_bytealign (w[33], w[34], offset);
      w[46] = hc_bytealign (w[32], w[33], offset);
      w[45] = hc_bytealign (w[31], w[32], offset);
      w[44] = hc_bytealign (w[30], w[31], offset);
      w[43] = hc_bytealign (w[29], w[30], offset);
      w[42] = hc_bytealign (w[28], w[29], offset);
      w[41] = hc_bytealign (w[27], w[28], offset);
      w[40] = hc_bytealign (w[26], w[27], offset);
      w[39] = hc_bytealign (w[25], w[26], offset);
      w[38] = hc_bytealign (w[24], w[25], offset);
      w[37] = hc_bytealign (w[23], w[24], offset);
      w[36] = hc_bytealign (w[22], w[23], offset);
      w[35] = hc_bytealign (w[21], w[22], offset);
      w[34] = hc_bytealign (w[20], w[21], offset);
      w[33] = hc_bytealign (w[19], w[20], offset);
      w[32] = hc_bytealign (w[18], w[19], offset);
      w[31] = hc_bytealign (w[17], w[18], offset);
      w[30] = hc_bytealign (w[16], w[17], offset);
      w[29] = hc_bytealign (w[15], w[16], offset);
      w[28] = hc_bytealign (w[14], w[15], offset);
      w[27] = hc_bytealign (w[13], w[14], offset);
      w[26] = hc_bytealign (w[12], w[13], offset);
      w[25] = hc_bytealign (w[11], w[12], offset);
      w[24] = hc_bytealign (w[10], w[11], offset);
      w[23] = hc_bytealign (w[ 9], w[10], offset);
      w[22] = hc_bytealign (w[ 8], w[ 9], offset);
      w[21] = hc_bytealign (w[ 7], w[ 8], offset);
      w[20] = hc_bytealign (w[ 6], w[ 7], offset);
      w[19] = hc_bytealign (w[ 5], w[ 6], offset);
      w[18] = hc_bytealign (w[ 4], w[ 5], offset);
      w[17] = hc_bytealign (w[ 3], w[ 4], offset);
      w[16] = hc_bytealign (w[ 2], w[ 3], offset);
      w[15] = hc_bytealign (w[ 1], w[ 2], offset);
      w[14] = hc_bytealign (w[ 0], w[ 1], offset);
      w[13] = hc_bytealign (    0, w[ 0], offset);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_bytealign (w[48], w[49], offset);
      w[62] = hc_bytealign (w[47], w[48], offset);
      w[61] = hc_bytealign (w[46], w[47], offset);
      w[60] = hc_bytealign (w[45], w[46], offset);
      w[59] = hc_bytealign (w[44], w[45], offset);
      w[58] = hc_bytealign (w[43], w[44], offset);
      w[57] = hc_bytealign (w[42], w[43], offset);
      w[56] = hc_bytealign (w[41], w[42], offset);
      w[55] = hc_bytealign (w[40], w[41], offset);
      w[54] = hc_bytealign (w[39], w[40], offset);
      w[53] = hc_bytealign (w[38], w[39], offset);
      w[52] = hc_bytealign (w[37], w[38], offset);
      w[51] = hc_bytealign (w[36], w[37], offset);
      w[50] = hc_bytealign (w[35], w[36], offset);
      w[49] = hc_bytealign (w[34], w[35], offset);
      w[48] = hc_bytealign (w[33], w[34], offset);
      w[47] = hc_bytealign (w[32], w[33], offset);
      w[46] = hc_bytealign (w[31], w[32], offset);
      w[45] = hc_bytealign (w[30], w[31], offset);
      w[44] = hc_bytealign (w[29], w[30], offset);
      w[43] = hc_bytealign (w[28], w[29], offset);
      w[42] = hc_bytealign (w[27], w[28], offset);
      w[41] = hc_bytealign (w[26], w[27], offset);
      w[40] = hc_bytealign (w[25], w[26], offset);
      w[39] = hc_bytealign (w[24], w[25], offset);
      w[38] = hc_bytealign (w[23], w[24], offset);
      w[37] = hc_bytealign (w[22], w[23], offset);
      w[36] = hc_bytealign (w[21], w[22], offset);
      w[35] = hc_bytealign (w[20], w[21], offset);
      w[34] = hc_bytealign (w[19], w[20], offset);
      w[33] = hc_bytealign (w[18], w[19], offset);
      w[32] = hc_bytealign (w[17], w[18], offset);
      w[31] = hc_bytealign (w[16], w[17], offset);
      w[30] = hc_bytealign (w[15], w[16], offset);
      w[29] = hc_bytealign (w[14], w[15], offset);
      w[28] = hc_bytealign (w[13], w[14], offset);
      w[27] = hc_bytealign (w[12], w[13], offset);
      w[26] = hc_bytealign (w[11], w[12], offset);
      w[25] = hc_bytealign (w[10], w[11], offset);
      w[24] = hc_bytealign (w[ 9], w[10], offset);
      w[23] = hc_bytealign (w[ 8], w[ 9], offset);
      w[22] = hc_bytealign (w[ 7], w[ 8], offset);
      w[21] = hc_bytealign (w[ 6], w[ 7], offset);
      w[20] = hc_bytealign (w[ 5], w[ 6], offset);
      w[19] = hc_bytealign (w[ 4], w[ 5], offset);
      w[18] = hc_bytealign (w[ 3], w[ 4], offset);
      w[17] = hc_bytealign (w[ 2], w[ 3], offset);
      w[16] = hc_bytealign (w[ 1], w[ 2], offset);
      w[15] = hc_bytealign (w[ 0], w[ 1], offset);
      w[14] = hc_bytealign (    0, w[ 0], offset);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_bytealign (w[47], w[48], offset);
      w[62] = hc_bytealign (w[46], w[47], offset);
      w[61] = hc_bytealign (w[45], w[46], offset);
      w[60] = hc_bytealign (w[44], w[45], offset);
      w[59] = hc_bytealign (w[43], w[44], offset);
      w[58] = hc_bytealign (w[42], w[43], offset);
      w[57] = hc_bytealign (w[41], w[42], offset);
      w[56] = hc_bytealign (w[40], w[41], offset);
      w[55] = hc_bytealign (w[39], w[40], offset);
      w[54] = hc_bytealign (w[38], w[39], offset);
      w[53] = hc_bytealign (w[37], w[38], offset);
      w[52] = hc_bytealign (w[36], w[37], offset);
      w[51] = hc_bytealign (w[35], w[36], offset);
      w[50] = hc_bytealign (w[34], w[35], offset);
      w[49] = hc_bytealign (w[33], w[34], offset);
      w[48] = hc_bytealign (w[32], w[33], offset);
      w[47] = hc_bytealign (w[31], w[32], offset);
      w[46] = hc_bytealign (w[30], w[31], offset);
      w[45] = hc_bytealign (w[29], w[30], offset);
      w[44] = hc_bytealign (w[28], w[29], offset);
      w[43] = hc_bytealign (w[27], w[28], offset);
      w[42] = hc_bytealign (w[26], w[27], offset);
      w[41] = hc_bytealign (w[25], w[26], offset);
      w[40] = hc_bytealign (w[24], w[25], offset);
      w[39] = hc_bytealign (w[23], w[24], offset);
      w[38] = hc_bytealign (w[22], w[23], offset);
      w[37] = hc_bytealign (w[21], w[22], offset);
      w[36] = hc_bytealign (w[20], w[21], offset);
      w[35] = hc_bytealign (w[19], w[20], offset);
      w[34] = hc_bytealign (w[18], w[19], offset);
      w[33] = hc_bytealign (w[17], w[18], offset);
      w[32] = hc_bytealign (w[16], w[17], offset);
      w[31] = hc_bytealign (w[15], w[16], offset);
      w[30] = hc_bytealign (w[14], w[15], offset);
      w[29] = hc_bytealign (w[13], w[14], offset);
      w[28] = hc_bytealign (w[12], w[13], offset);
      w[27] = hc_bytealign (w[11], w[12], offset);
      w[26] = hc_bytealign (w[10], w[11], offset);
      w[25] = hc_bytealign (w[ 9], w[10], offset);
      w[24] = hc_bytealign (w[ 8], w[ 9], offset);
      w[23] = hc_bytealign (w[ 7], w[ 8], offset);
      w[22] = hc_bytealign (w[ 6], w[ 7], offset);
      w[21] = hc_bytealign (w[ 5], w[ 6], offset);
      w[20] = hc_bytealign (w[ 4], w[ 5], offset);
      w[19] = hc_bytealign (w[ 3], w[ 4], offset);
      w[18] = hc_bytealign (w[ 2], w[ 3], offset);
      w[17] = hc_bytealign (w[ 1], w[ 2], offset);
      w[16] = hc_bytealign (w[ 0], w[ 1], offset);
      w[15] = hc_bytealign (    0, w[ 0], offset);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_bytealign (w[46], w[47], offset);
      w[62] = hc_bytealign (w[45], w[46], offset);
      w[61] = hc_bytealign (w[44], w[45], offset);
      w[60] = hc_bytealign (w[43], w[44], offset);
      w[59] = hc_bytealign (w[42], w[43], offset);
      w[58] = hc_bytealign (w[41], w[42], offset);
      w[57] = hc_bytealign (w[40], w[41], offset);
      w[56] = hc_bytealign (w[39], w[40], offset);
      w[55] = hc_bytealign (w[38], w[39], offset);
      w[54] = hc_bytealign (w[37], w[38], offset);
      w[53] = hc_bytealign (w[36], w[37], offset);
      w[52] = hc_bytealign (w[35], w[36], offset);
      w[51] = hc_bytealign (w[34], w[35], offset);
      w[50] = hc_bytealign (w[33], w[34], offset);
      w[49] = hc_bytealign (w[32], w[33], offset);
      w[48] = hc_bytealign (w[31], w[32], offset);
      w[47] = hc_bytealign (w[30], w[31], offset);
      w[46] = hc_bytealign (w[29], w[30], offset);
      w[45] = hc_bytealign (w[28], w[29], offset);
      w[44] = hc_bytealign (w[27], w[28], offset);
      w[43] = hc_bytealign (w[26], w[27], offset);
      w[42] = hc_bytealign (w[25], w[26], offset);
      w[41] = hc_bytealign (w[24], w[25], offset);
      w[40] = hc_bytealign (w[23], w[24], offset);
      w[39] = hc_bytealign (w[22], w[23], offset);
      w[38] = hc_bytealign (w[21], w[22], offset);
      w[37] = hc_bytealign (w[20], w[21], offset);
      w[36] = hc_bytealign (w[19], w[20], offset);
      w[35] = hc_bytealign (w[18], w[19], offset);
      w[34] = hc_bytealign (w[17], w[18], offset);
      w[33] = hc_bytealign (w[16], w[17], offset);
      w[32] = hc_bytealign (w[15], w[16], offset);
      w[31] = hc_bytealign (w[14], w[15], offset);
      w[30] = hc_bytealign (w[13], w[14], offset);
      w[29] = hc_bytealign (w[12], w[13], offset);
      w[28] = hc_bytealign (w[11], w[12], offset);
      w[27] = hc_bytealign (w[10], w[11], offset);
      w[26] = hc_bytealign (w[ 9], w[10], offset);
      w[25] = hc_bytealign (w[ 8], w[ 9], offset);
      w[24] = hc_bytealign (w[ 7], w[ 8], offset);
      w[23] = hc_bytealign (w[ 6], w[ 7], offset);
      w[22] = hc_bytealign (w[ 5], w[ 6], offset);
      w[21] = hc_bytealign (w[ 4], w[ 5], offset);
      w[20] = hc_bytealign (w[ 3], w[ 4], offset);
      w[19] = hc_bytealign (w[ 2], w[ 3], offset);
      w[18] = hc_bytealign (w[ 1], w[ 2], offset);
      w[17] = hc_bytealign (w[ 0], w[ 1], offset);
      w[16] = hc_bytealign (    0, w[ 0], offset);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_bytealign (w[45], w[46], offset);
      w[62] = hc_bytealign (w[44], w[45], offset);
      w[61] = hc_bytealign (w[43], w[44], offset);
      w[60] = hc_bytealign (w[42], w[43], offset);
      w[59] = hc_bytealign (w[41], w[42], offset);
      w[58] = hc_bytealign (w[40], w[41], offset);
      w[57] = hc_bytealign (w[39], w[40], offset);
      w[56] = hc_bytealign (w[38], w[39], offset);
      w[55] = hc_bytealign (w[37], w[38], offset);
      w[54] = hc_bytealign (w[36], w[37], offset);
      w[53] = hc_bytealign (w[35], w[36], offset);
      w[52] = hc_bytealign (w[34], w[35], offset);
      w[51] = hc_bytealign (w[33], w[34], offset);
      w[50] = hc_bytealign (w[32], w[33], offset);
      w[49] = hc_bytealign (w[31], w[32], offset);
      w[48] = hc_bytealign (w[30], w[31], offset);
      w[47] = hc_bytealign (w[29], w[30], offset);
      w[46] = hc_bytealign (w[28], w[29], offset);
      w[45] = hc_bytealign (w[27], w[28], offset);
      w[44] = hc_bytealign (w[26], w[27], offset);
      w[43] = hc_bytealign (w[25], w[26], offset);
      w[42] = hc_bytealign (w[24], w[25], offset);
      w[41] = hc_bytealign (w[23], w[24], offset);
      w[40] = hc_bytealign (w[22], w[23], offset);
      w[39] = hc_bytealign (w[21], w[22], offset);
      w[38] = hc_bytealign (w[20], w[21], offset);
      w[37] = hc_bytealign (w[19], w[20], offset);
      w[36] = hc_bytealign (w[18], w[19], offset);
      w[35] = hc_bytealign (w[17], w[18], offset);
      w[34] = hc_bytealign (w[16], w[17], offset);
      w[33] = hc_bytealign (w[15], w[16], offset);
      w[32] = hc_bytealign (w[14], w[15], offset);
      w[31] = hc_bytealign (w[13], w[14], offset);
      w[30] = hc_bytealign (w[12], w[13], offset);
      w[29] = hc_bytealign (w[11], w[12], offset);
      w[28] = hc_bytealign (w[10], w[11], offset);
      w[27] = hc_bytealign (w[ 9], w[10], offset);
      w[26] = hc_bytealign (w[ 8], w[ 9], offset);
      w[25] = hc_bytealign (w[ 7], w[ 8], offset);
      w[24] = hc_bytealign (w[ 6], w[ 7], offset);
      w[23] = hc_bytealign (w[ 5], w[ 6], offset);
      w[22] = hc_bytealign (w[ 4], w[ 5], offset);
      w[21] = hc_bytealign (w[ 3], w[ 4], offset);
      w[20] = hc_bytealign (w[ 2], w[ 3], offset);
      w[19] = hc_bytealign (w[ 1], w[ 2], offset);
      w[18] = hc_bytealign (w[ 0], w[ 1], offset);
      w[17] = hc_bytealign (    0, w[ 0], offset);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_bytealign (w[44], w[45], offset);
      w[62] = hc_bytealign (w[43], w[44], offset);
      w[61] = hc_bytealign (w[42], w[43], offset);
      w[60] = hc_bytealign (w[41], w[42], offset);
      w[59] = hc_bytealign (w[40], w[41], offset);
      w[58] = hc_bytealign (w[39], w[40], offset);
      w[57] = hc_bytealign (w[38], w[39], offset);
      w[56] = hc_bytealign (w[37], w[38], offset);
      w[55] = hc_bytealign (w[36], w[37], offset);
      w[54] = hc_bytealign (w[35], w[36], offset);
      w[53] = hc_bytealign (w[34], w[35], offset);
      w[52] = hc_bytealign (w[33], w[34], offset);
      w[51] = hc_bytealign (w[32], w[33], offset);
      w[50] = hc_bytealign (w[31], w[32], offset);
      w[49] = hc_bytealign (w[30], w[31], offset);
      w[48] = hc_bytealign (w[29], w[30], offset);
      w[47] = hc_bytealign (w[28], w[29], offset);
      w[46] = hc_bytealign (w[27], w[28], offset);
      w[45] = hc_bytealign (w[26], w[27], offset);
      w[44] = hc_bytealign (w[25], w[26], offset);
      w[43] = hc_bytealign (w[24], w[25], offset);
      w[42] = hc_bytealign (w[23], w[24], offset);
      w[41] = hc_bytealign (w[22], w[23], offset);
      w[40] = hc_bytealign (w[21], w[22], offset);
      w[39] = hc_bytealign (w[20], w[21], offset);
      w[38] = hc_bytealign (w[19], w[20], offset);
      w[37] = hc_bytealign (w[18], w[19], offset);
      w[36] = hc_bytealign (w[17], w[18], offset);
      w[35] = hc_bytealign (w[16], w[17], offset);
      w[34] = hc_bytealign (w[15], w[16], offset);
      w[33] = hc_bytealign (w[14], w[15], offset);
      w[32] = hc_bytealign (w[13], w[14], offset);
      w[31] = hc_bytealign (w[12], w[13], offset);
      w[30] = hc_bytealign (w[11], w[12], offset);
      w[29] = hc_bytealign (w[10], w[11], offset);
      w[28] = hc_bytealign (w[ 9], w[10], offset);
      w[27] = hc_bytealign (w[ 8], w[ 9], offset);
      w[26] = hc_bytealign (w[ 7], w[ 8], offset);
      w[25] = hc_bytealign (w[ 6], w[ 7], offset);
      w[24] = hc_bytealign (w[ 5], w[ 6], offset);
      w[23] = hc_bytealign (w[ 4], w[ 5], offset);
      w[22] = hc_bytealign (w[ 3], w[ 4], offset);
      w[21] = hc_bytealign (w[ 2], w[ 3], offset);
      w[20] = hc_bytealign (w[ 1], w[ 2], offset);
      w[19] = hc_bytealign (w[ 0], w[ 1], offset);
      w[18] = hc_bytealign (    0, w[ 0], offset);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_bytealign (w[43], w[44], offset);
      w[62] = hc_bytealign (w[42], w[43], offset);
      w[61] = hc_bytealign (w[41], w[42], offset);
      w[60] = hc_bytealign (w[40], w[41], offset);
      w[59] = hc_bytealign (w[39], w[40], offset);
      w[58] = hc_bytealign (w[38], w[39], offset);
      w[57] = hc_bytealign (w[37], w[38], offset);
      w[56] = hc_bytealign (w[36], w[37], offset);
      w[55] = hc_bytealign (w[35], w[36], offset);
      w[54] = hc_bytealign (w[34], w[35], offset);
      w[53] = hc_bytealign (w[33], w[34], offset);
      w[52] = hc_bytealign (w[32], w[33], offset);
      w[51] = hc_bytealign (w[31], w[32], offset);
      w[50] = hc_bytealign (w[30], w[31], offset);
      w[49] = hc_bytealign (w[29], w[30], offset);
      w[48] = hc_bytealign (w[28], w[29], offset);
      w[47] = hc_bytealign (w[27], w[28], offset);
      w[46] = hc_bytealign (w[26], w[27], offset);
      w[45] = hc_bytealign (w[25], w[26], offset);
      w[44] = hc_bytealign (w[24], w[25], offset);
      w[43] = hc_bytealign (w[23], w[24], offset);
      w[42] = hc_bytealign (w[22], w[23], offset);
      w[41] = hc_bytealign (w[21], w[22], offset);
      w[40] = hc_bytealign (w[20], w[21], offset);
      w[39] = hc_bytealign (w[19], w[20], offset);
      w[38] = hc_bytealign (w[18], w[19], offset);
      w[37] = hc_bytealign (w[17], w[18], offset);
      w[36] = hc_bytealign (w[16], w[17], offset);
      w[35] = hc_bytealign (w[15], w[16], offset);
      w[34] = hc_bytealign (w[14], w[15], offset);
      w[33] = hc_bytealign (w[13], w[14], offset);
      w[32] = hc_bytealign (w[12], w[13], offset);
      w[31] = hc_bytealign (w[11], w[12], offset);
      w[30] = hc_bytealign (w[10], w[11], offset);
      w[29] = hc_bytealign (w[ 9], w[10], offset);
      w[28] = hc_bytealign (w[ 8], w[ 9], offset);
      w[27] = hc_bytealign (w[ 7], w[ 8], offset);
      w[26] = hc_bytealign (w[ 6], w[ 7], offset);
      w[25] = hc_bytealign (w[ 5], w[ 6], offset);
      w[24] = hc_bytealign (w[ 4], w[ 5], offset);
      w[23] = hc_bytealign (w[ 3], w[ 4], offset);
      w[22] = hc_bytealign (w[ 2], w[ 3], offset);
      w[21] = hc_bytealign (w[ 1], w[ 2], offset);
      w[20] = hc_bytealign (w[ 0], w[ 1], offset);
      w[19] = hc_bytealign (    0, w[ 0], offset);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_bytealign (w[42], w[43], offset);
      w[62] = hc_bytealign (w[41], w[42], offset);
      w[61] = hc_bytealign (w[40], w[41], offset);
      w[60] = hc_bytealign (w[39], w[40], offset);
      w[59] = hc_bytealign (w[38], w[39], offset);
      w[58] = hc_bytealign (w[37], w[38], offset);
      w[57] = hc_bytealign (w[36], w[37], offset);
      w[56] = hc_bytealign (w[35], w[36], offset);
      w[55] = hc_bytealign (w[34], w[35], offset);
      w[54] = hc_bytealign (w[33], w[34], offset);
      w[53] = hc_bytealign (w[32], w[33], offset);
      w[52] = hc_bytealign (w[31], w[32], offset);
      w[51] = hc_bytealign (w[30], w[31], offset);
      w[50] = hc_bytealign (w[29], w[30], offset);
      w[49] = hc_bytealign (w[28], w[29], offset);
      w[48] = hc_bytealign (w[27], w[28], offset);
      w[47] = hc_bytealign (w[26], w[27], offset);
      w[46] = hc_bytealign (w[25], w[26], offset);
      w[45] = hc_bytealign (w[24], w[25], offset);
      w[44] = hc_bytealign (w[23], w[24], offset);
      w[43] = hc_bytealign (w[22], w[23], offset);
      w[42] = hc_bytealign (w[21], w[22], offset);
      w[41] = hc_bytealign (w[20], w[21], offset);
      w[40] = hc_bytealign (w[19], w[20], offset);
      w[39] = hc_bytealign (w[18], w[19], offset);
      w[38] = hc_bytealign (w[17], w[18], offset);
      w[37] = hc_bytealign (w[16], w[17], offset);
      w[36] = hc_bytealign (w[15], w[16], offset);
      w[35] = hc_bytealign (w[14], w[15], offset);
      w[34] = hc_bytealign (w[13], w[14], offset);
      w[33] = hc_bytealign (w[12], w[13], offset);
      w[32] = hc_bytealign (w[11], w[12], offset);
      w[31] = hc_bytealign (w[10], w[11], offset);
      w[30] = hc_bytealign (w[ 9], w[10], offset);
      w[29] = hc_bytealign (w[ 8], w[ 9], offset);
      w[28] = hc_bytealign (w[ 7], w[ 8], offset);
      w[27] = hc_bytealign (w[ 6], w[ 7], offset);
      w[26] = hc_bytealign (w[ 5], w[ 6], offset);
      w[25] = hc_bytealign (w[ 4], w[ 5], offset);
      w[24] = hc_bytealign (w[ 3], w[ 4], offset);
      w[23] = hc_bytealign (w[ 2], w[ 3], offset);
      w[22] = hc_bytealign (w[ 1], w[ 2], offset);
      w[21] = hc_bytealign (w[ 0], w[ 1], offset);
      w[20] = hc_bytealign (    0, w[ 0], offset);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_bytealign (w[41], w[42], offset);
      w[62] = hc_bytealign (w[40], w[41], offset);
      w[61] = hc_bytealign (w[39], w[40], offset);
      w[60] = hc_bytealign (w[38], w[39], offset);
      w[59] = hc_bytealign (w[37], w[38], offset);
      w[58] = hc_bytealign (w[36], w[37], offset);
      w[57] = hc_bytealign (w[35], w[36], offset);
      w[56] = hc_bytealign (w[34], w[35], offset);
      w[55] = hc_bytealign (w[33], w[34], offset);
      w[54] = hc_bytealign (w[32], w[33], offset);
      w[53] = hc_bytealign (w[31], w[32], offset);
      w[52] = hc_bytealign (w[30], w[31], offset);
      w[51] = hc_bytealign (w[29], w[30], offset);
      w[50] = hc_bytealign (w[28], w[29], offset);
      w[49] = hc_bytealign (w[27], w[28], offset);
      w[48] = hc_bytealign (w[26], w[27], offset);
      w[47] = hc_bytealign (w[25], w[26], offset);
      w[46] = hc_bytealign (w[24], w[25], offset);
      w[45] = hc_bytealign (w[23], w[24], offset);
      w[44] = hc_bytealign (w[22], w[23], offset);
      w[43] = hc_bytealign (w[21], w[22], offset);
      w[42] = hc_bytealign (w[20], w[21], offset);
      w[41] = hc_bytealign (w[19], w[20], offset);
      w[40] = hc_bytealign (w[18], w[19], offset);
      w[39] = hc_bytealign (w[17], w[18], offset);
      w[38] = hc_bytealign (w[16], w[17], offset);
      w[37] = hc_bytealign (w[15], w[16], offset);
      w[36] = hc_bytealign (w[14], w[15], offset);
      w[35] = hc_bytealign (w[13], w[14], offset);
      w[34] = hc_bytealign (w[12], w[13], offset);
      w[33] = hc_bytealign (w[11], w[12], offset);
      w[32] = hc_bytealign (w[10], w[11], offset);
      w[31] = hc_bytealign (w[ 9], w[10], offset);
      w[30] = hc_bytealign (w[ 8], w[ 9], offset);
      w[29] = hc_bytealign (w[ 7], w[ 8], offset);
      w[28] = hc_bytealign (w[ 6], w[ 7], offset);
      w[27] = hc_bytealign (w[ 5], w[ 6], offset);
      w[26] = hc_bytealign (w[ 4], w[ 5], offset);
      w[25] = hc_bytealign (w[ 3], w[ 4], offset);
      w[24] = hc_bytealign (w[ 2], w[ 3], offset);
      w[23] = hc_bytealign (w[ 1], w[ 2], offset);
      w[22] = hc_bytealign (w[ 0], w[ 1], offset);
      w[21] = hc_bytealign (    0, w[ 0], offset);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_bytealign (w[40], w[41], offset);
      w[62] = hc_bytealign (w[39], w[40], offset);
      w[61] = hc_bytealign (w[38], w[39], offset);
      w[60] = hc_bytealign (w[37], w[38], offset);
      w[59] = hc_bytealign (w[36], w[37], offset);
      w[58] = hc_bytealign (w[35], w[36], offset);
      w[57] = hc_bytealign (w[34], w[35], offset);
      w[56] = hc_bytealign (w[33], w[34], offset);
      w[55] = hc_bytealign (w[32], w[33], offset);
      w[54] = hc_bytealign (w[31], w[32], offset);
      w[53] = hc_bytealign (w[30], w[31], offset);
      w[52] = hc_bytealign (w[29], w[30], offset);
      w[51] = hc_bytealign (w[28], w[29], offset);
      w[50] = hc_bytealign (w[27], w[28], offset);
      w[49] = hc_bytealign (w[26], w[27], offset);
      w[48] = hc_bytealign (w[25], w[26], offset);
      w[47] = hc_bytealign (w[24], w[25], offset);
      w[46] = hc_bytealign (w[23], w[24], offset);
      w[45] = hc_bytealign (w[22], w[23], offset);
      w[44] = hc_bytealign (w[21], w[22], offset);
      w[43] = hc_bytealign (w[20], w[21], offset);
      w[42] = hc_bytealign (w[19], w[20], offset);
      w[41] = hc_bytealign (w[18], w[19], offset);
      w[40] = hc_bytealign (w[17], w[18], offset);
      w[39] = hc_bytealign (w[16], w[17], offset);
      w[38] = hc_bytealign (w[15], w[16], offset);
      w[37] = hc_bytealign (w[14], w[15], offset);
      w[36] = hc_bytealign (w[13], w[14], offset);
      w[35] = hc_bytealign (w[12], w[13], offset);
      w[34] = hc_bytealign (w[11], w[12], offset);
      w[33] = hc_bytealign (w[10], w[11], offset);
      w[32] = hc_bytealign (w[ 9], w[10], offset);
      w[31] = hc_bytealign (w[ 8], w[ 9], offset);
      w[30] = hc_bytealign (w[ 7], w[ 8], offset);
      w[29] = hc_bytealign (w[ 6], w[ 7], offset);
      w[28] = hc_bytealign (w[ 5], w[ 6], offset);
      w[27] = hc_bytealign (w[ 4], w[ 5], offset);
      w[26] = hc_bytealign (w[ 3], w[ 4], offset);
      w[25] = hc_bytealign (w[ 2], w[ 3], offset);
      w[24] = hc_bytealign (w[ 1], w[ 2], offset);
      w[23] = hc_bytealign (w[ 0], w[ 1], offset);
      w[22] = hc_bytealign (    0, w[ 0], offset);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_bytealign (w[39], w[40], offset);
      w[62] = hc_bytealign (w[38], w[39], offset);
      w[61] = hc_bytealign (w[37], w[38], offset);
      w[60] = hc_bytealign (w[36], w[37], offset);
      w[59] = hc_bytealign (w[35], w[36], offset);
      w[58] = hc_bytealign (w[34], w[35], offset);
      w[57] = hc_bytealign (w[33], w[34], offset);
      w[56] = hc_bytealign (w[32], w[33], offset);
      w[55] = hc_bytealign (w[31], w[32], offset);
      w[54] = hc_bytealign (w[30], w[31], offset);
      w[53] = hc_bytealign (w[29], w[30], offset);
      w[52] = hc_bytealign (w[28], w[29], offset);
      w[51] = hc_bytealign (w[27], w[28], offset);
      w[50] = hc_bytealign (w[26], w[27], offset);
      w[49] = hc_bytealign (w[25], w[26], offset);
      w[48] = hc_bytealign (w[24], w[25], offset);
      w[47] = hc_bytealign (w[23], w[24], offset);
      w[46] = hc_bytealign (w[22], w[23], offset);
      w[45] = hc_bytealign (w[21], w[22], offset);
      w[44] = hc_bytealign (w[20], w[21], offset);
      w[43] = hc_bytealign (w[19], w[20], offset);
      w[42] = hc_bytealign (w[18], w[19], offset);
      w[41] = hc_bytealign (w[17], w[18], offset);
      w[40] = hc_bytealign (w[16], w[17], offset);
      w[39] = hc_bytealign (w[15], w[16], offset);
      w[38] = hc_bytealign (w[14], w[15], offset);
      w[37] = hc_bytealign (w[13], w[14], offset);
      w[36] = hc_bytealign (w[12], w[13], offset);
      w[35] = hc_bytealign (w[11], w[12], offset);
      w[34] = hc_bytealign (w[10], w[11], offset);
      w[33] = hc_bytealign (w[ 9], w[10], offset);
      w[32] = hc_bytealign (w[ 8], w[ 9], offset);
      w[31] = hc_bytealign (w[ 7], w[ 8], offset);
      w[30] = hc_bytealign (w[ 6], w[ 7], offset);
      w[29] = hc_bytealign (w[ 5], w[ 6], offset);
      w[28] = hc_bytealign (w[ 4], w[ 5], offset);
      w[27] = hc_bytealign (w[ 3], w[ 4], offset);
      w[26] = hc_bytealign (w[ 2], w[ 3], offset);
      w[25] = hc_bytealign (w[ 1], w[ 2], offset);
      w[24] = hc_bytealign (w[ 0], w[ 1], offset);
      w[23] = hc_bytealign (    0, w[ 0], offset);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_bytealign (w[38], w[39], offset);
      w[62] = hc_bytealign (w[37], w[38], offset);
      w[61] = hc_bytealign (w[36], w[37], offset);
      w[60] = hc_bytealign (w[35], w[36], offset);
      w[59] = hc_bytealign (w[34], w[35], offset);
      w[58] = hc_bytealign (w[33], w[34], offset);
      w[57] = hc_bytealign (w[32], w[33], offset);
      w[56] = hc_bytealign (w[31], w[32], offset);
      w[55] = hc_bytealign (w[30], w[31], offset);
      w[54] = hc_bytealign (w[29], w[30], offset);
      w[53] = hc_bytealign (w[28], w[29], offset);
      w[52] = hc_bytealign (w[27], w[28], offset);
      w[51] = hc_bytealign (w[26], w[27], offset);
      w[50] = hc_bytealign (w[25], w[26], offset);
      w[49] = hc_bytealign (w[24], w[25], offset);
      w[48] = hc_bytealign (w[23], w[24], offset);
      w[47] = hc_bytealign (w[22], w[23], offset);
      w[46] = hc_bytealign (w[21], w[22], offset);
      w[45] = hc_bytealign (w[20], w[21], offset);
      w[44] = hc_bytealign (w[19], w[20], offset);
      w[43] = hc_bytealign (w[18], w[19], offset);
      w[42] = hc_bytealign (w[17], w[18], offset);
      w[41] = hc_bytealign (w[16], w[17], offset);
      w[40] = hc_bytealign (w[15], w[16], offset);
      w[39] = hc_bytealign (w[14], w[15], offset);
      w[38] = hc_bytealign (w[13], w[14], offset);
      w[37] = hc_bytealign (w[12], w[13], offset);
      w[36] = hc_bytealign (w[11], w[12], offset);
      w[35] = hc_bytealign (w[10], w[11], offset);
      w[34] = hc_bytealign (w[ 9], w[10], offset);
      w[33] = hc_bytealign (w[ 8], w[ 9], offset);
      w[32] = hc_bytealign (w[ 7], w[ 8], offset);
      w[31] = hc_bytealign (w[ 6], w[ 7], offset);
      w[30] = hc_bytealign (w[ 5], w[ 6], offset);
      w[29] = hc_bytealign (w[ 4], w[ 5], offset);
      w[28] = hc_bytealign (w[ 3], w[ 4], offset);
      w[27] = hc_bytealign (w[ 2], w[ 3], offset);
      w[26] = hc_bytealign (w[ 1], w[ 2], offset);
      w[25] = hc_bytealign (w[ 0], w[ 1], offset);
      w[24] = hc_bytealign (    0, w[ 0], offset);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_bytealign (w[37], w[38], offset);
      w[62] = hc_bytealign (w[36], w[37], offset);
      w[61] = hc_bytealign (w[35], w[36], offset);
      w[60] = hc_bytealign (w[34], w[35], offset);
      w[59] = hc_bytealign (w[33], w[34], offset);
      w[58] = hc_bytealign (w[32], w[33], offset);
      w[57] = hc_bytealign (w[31], w[32], offset);
      w[56] = hc_bytealign (w[30], w[31], offset);
      w[55] = hc_bytealign (w[29], w[30], offset);
      w[54] = hc_bytealign (w[28], w[29], offset);
      w[53] = hc_bytealign (w[27], w[28], offset);
      w[52] = hc_bytealign (w[26], w[27], offset);
      w[51] = hc_bytealign (w[25], w[26], offset);
      w[50] = hc_bytealign (w[24], w[25], offset);
      w[49] = hc_bytealign (w[23], w[24], offset);
      w[48] = hc_bytealign (w[22], w[23], offset);
      w[47] = hc_bytealign (w[21], w[22], offset);
      w[46] = hc_bytealign (w[20], w[21], offset);
      w[45] = hc_bytealign (w[19], w[20], offset);
      w[44] = hc_bytealign (w[18], w[19], offset);
      w[43] = hc_bytealign (w[17], w[18], offset);
      w[42] = hc_bytealign (w[16], w[17], offset);
      w[41] = hc_bytealign (w[15], w[16], offset);
      w[40] = hc_bytealign (w[14], w[15], offset);
      w[39] = hc_bytealign (w[13], w[14], offset);
      w[38] = hc_bytealign (w[12], w[13], offset);
      w[37] = hc_bytealign (w[11], w[12], offset);
      w[36] = hc_bytealign (w[10], w[11], offset);
      w[35] = hc_bytealign (w[ 9], w[10], offset);
      w[34] = hc_bytealign (w[ 8], w[ 9], offset);
      w[33] = hc_bytealign (w[ 7], w[ 8], offset);
      w[32] = hc_bytealign (w[ 6], w[ 7], offset);
      w[31] = hc_bytealign (w[ 5], w[ 6], offset);
      w[30] = hc_bytealign (w[ 4], w[ 5], offset);
      w[29] = hc_bytealign (w[ 3], w[ 4], offset);
      w[28] = hc_bytealign (w[ 2], w[ 3], offset);
      w[27] = hc_bytealign (w[ 1], w[ 2], offset);
      w[26] = hc_bytealign (w[ 0], w[ 1], offset);
      w[25] = hc_bytealign (    0, w[ 0], offset);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_bytealign (w[36], w[37], offset);
      w[62] = hc_bytealign (w[35], w[36], offset);
      w[61] = hc_bytealign (w[34], w[35], offset);
      w[60] = hc_bytealign (w[33], w[34], offset);
      w[59] = hc_bytealign (w[32], w[33], offset);
      w[58] = hc_bytealign (w[31], w[32], offset);
      w[57] = hc_bytealign (w[30], w[31], offset);
      w[56] = hc_bytealign (w[29], w[30], offset);
      w[55] = hc_bytealign (w[28], w[29], offset);
      w[54] = hc_bytealign (w[27], w[28], offset);
      w[53] = hc_bytealign (w[26], w[27], offset);
      w[52] = hc_bytealign (w[25], w[26], offset);
      w[51] = hc_bytealign (w[24], w[25], offset);
      w[50] = hc_bytealign (w[23], w[24], offset);
      w[49] = hc_bytealign (w[22], w[23], offset);
      w[48] = hc_bytealign (w[21], w[22], offset);
      w[47] = hc_bytealign (w[20], w[21], offset);
      w[46] = hc_bytealign (w[19], w[20], offset);
      w[45] = hc_bytealign (w[18], w[19], offset);
      w[44] = hc_bytealign (w[17], w[18], offset);
      w[43] = hc_bytealign (w[16], w[17], offset);
      w[42] = hc_bytealign (w[15], w[16], offset);
      w[41] = hc_bytealign (w[14], w[15], offset);
      w[40] = hc_bytealign (w[13], w[14], offset);
      w[39] = hc_bytealign (w[12], w[13], offset);
      w[38] = hc_bytealign (w[11], w[12], offset);
      w[37] = hc_bytealign (w[10], w[11], offset);
      w[36] = hc_bytealign (w[ 9], w[10], offset);
      w[35] = hc_bytealign (w[ 8], w[ 9], offset);
      w[34] = hc_bytealign (w[ 7], w[ 8], offset);
      w[33] = hc_bytealign (w[ 6], w[ 7], offset);
      w[32] = hc_bytealign (w[ 5], w[ 6], offset);
      w[31] = hc_bytealign (w[ 4], w[ 5], offset);
      w[30] = hc_bytealign (w[ 3], w[ 4], offset);
      w[29] = hc_bytealign (w[ 2], w[ 3], offset);
      w[28] = hc_bytealign (w[ 1], w[ 2], offset);
      w[27] = hc_bytealign (w[ 0], w[ 1], offset);
      w[26] = hc_bytealign (    0, w[ 0], offset);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_bytealign (w[35], w[36], offset);
      w[62] = hc_bytealign (w[34], w[35], offset);
      w[61] = hc_bytealign (w[33], w[34], offset);
      w[60] = hc_bytealign (w[32], w[33], offset);
      w[59] = hc_bytealign (w[31], w[32], offset);
      w[58] = hc_bytealign (w[30], w[31], offset);
      w[57] = hc_bytealign (w[29], w[30], offset);
      w[56] = hc_bytealign (w[28], w[29], offset);
      w[55] = hc_bytealign (w[27], w[28], offset);
      w[54] = hc_bytealign (w[26], w[27], offset);
      w[53] = hc_bytealign (w[25], w[26], offset);
      w[52] = hc_bytealign (w[24], w[25], offset);
      w[51] = hc_bytealign (w[23], w[24], offset);
      w[50] = hc_bytealign (w[22], w[23], offset);
      w[49] = hc_bytealign (w[21], w[22], offset);
      w[48] = hc_bytealign (w[20], w[21], offset);
      w[47] = hc_bytealign (w[19], w[20], offset);
      w[46] = hc_bytealign (w[18], w[19], offset);
      w[45] = hc_bytealign (w[17], w[18], offset);
      w[44] = hc_bytealign (w[16], w[17], offset);
      w[43] = hc_bytealign (w[15], w[16], offset);
      w[42] = hc_bytealign (w[14], w[15], offset);
      w[41] = hc_bytealign (w[13], w[14], offset);
      w[40] = hc_bytealign (w[12], w[13], offset);
      w[39] = hc_bytealign (w[11], w[12], offset);
      w[38] = hc_bytealign (w[10], w[11], offset);
      w[37] = hc_bytealign (w[ 9], w[10], offset);
      w[36] = hc_bytealign (w[ 8], w[ 9], offset);
      w[35] = hc_bytealign (w[ 7], w[ 8], offset);
      w[34] = hc_bytealign (w[ 6], w[ 7], offset);
      w[33] = hc_bytealign (w[ 5], w[ 6], offset);
      w[32] = hc_bytealign (w[ 4], w[ 5], offset);
      w[31] = hc_bytealign (w[ 3], w[ 4], offset);
      w[30] = hc_bytealign (w[ 2], w[ 3], offset);
      w[29] = hc_bytealign (w[ 1], w[ 2], offset);
      w[28] = hc_bytealign (w[ 0], w[ 1], offset);
      w[27] = hc_bytealign (    0, w[ 0], offset);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_bytealign (w[34], w[35], offset);
      w[62] = hc_bytealign (w[33], w[34], offset);
      w[61] = hc_bytealign (w[32], w[33], offset);
      w[60] = hc_bytealign (w[31], w[32], offset);
      w[59] = hc_bytealign (w[30], w[31], offset);
      w[58] = hc_bytealign (w[29], w[30], offset);
      w[57] = hc_bytealign (w[28], w[29], offset);
      w[56] = hc_bytealign (w[27], w[28], offset);
      w[55] = hc_bytealign (w[26], w[27], offset);
      w[54] = hc_bytealign (w[25], w[26], offset);
      w[53] = hc_bytealign (w[24], w[25], offset);
      w[52] = hc_bytealign (w[23], w[24], offset);
      w[51] = hc_bytealign (w[22], w[23], offset);
      w[50] = hc_bytealign (w[21], w[22], offset);
      w[49] = hc_bytealign (w[20], w[21], offset);
      w[48] = hc_bytealign (w[19], w[20], offset);
      w[47] = hc_bytealign (w[18], w[19], offset);
      w[46] = hc_bytealign (w[17], w[18], offset);
      w[45] = hc_bytealign (w[16], w[17], offset);
      w[44] = hc_bytealign (w[15], w[16], offset);
      w[43] = hc_bytealign (w[14], w[15], offset);
      w[42] = hc_bytealign (w[13], w[14], offset);
      w[41] = hc_bytealign (w[12], w[13], offset);
      w[40] = hc_bytealign (w[11], w[12], offset);
      w[39] = hc_bytealign (w[10], w[11], offset);
      w[38] = hc_bytealign (w[ 9], w[10], offset);
      w[37] = hc_bytealign (w[ 8], w[ 9], offset);
      w[36] = hc_bytealign (w[ 7], w[ 8], offset);
      w[35] = hc_bytealign (w[ 6], w[ 7], offset);
      w[34] = hc_bytealign (w[ 5], w[ 6], offset);
      w[33] = hc_bytealign (w[ 4], w[ 5], offset);
      w[32] = hc_bytealign (w[ 3], w[ 4], offset);
      w[31] = hc_bytealign (w[ 2], w[ 3], offset);
      w[30] = hc_bytealign (w[ 1], w[ 2], offset);
      w[29] = hc_bytealign (w[ 0], w[ 1], offset);
      w[28] = hc_bytealign (    0, w[ 0], offset);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_bytealign (w[33], w[34], offset);
      w[62] = hc_bytealign (w[32], w[33], offset);
      w[61] = hc_bytealign (w[31], w[32], offset);
      w[60] = hc_bytealign (w[30], w[31], offset);
      w[59] = hc_bytealign (w[29], w[30], offset);
      w[58] = hc_bytealign (w[28], w[29], offset);
      w[57] = hc_bytealign (w[27], w[28], offset);
      w[56] = hc_bytealign (w[26], w[27], offset);
      w[55] = hc_bytealign (w[25], w[26], offset);
      w[54] = hc_bytealign (w[24], w[25], offset);
      w[53] = hc_bytealign (w[23], w[24], offset);
      w[52] = hc_bytealign (w[22], w[23], offset);
      w[51] = hc_bytealign (w[21], w[22], offset);
      w[50] = hc_bytealign (w[20], w[21], offset);
      w[49] = hc_bytealign (w[19], w[20], offset);
      w[48] = hc_bytealign (w[18], w[19], offset);
      w[47] = hc_bytealign (w[17], w[18], offset);
      w[46] = hc_bytealign (w[16], w[17], offset);
      w[45] = hc_bytealign (w[15], w[16], offset);
      w[44] = hc_bytealign (w[14], w[15], offset);
      w[43] = hc_bytealign (w[13], w[14], offset);
      w[42] = hc_bytealign (w[12], w[13], offset);
      w[41] = hc_bytealign (w[11], w[12], offset);
      w[40] = hc_bytealign (w[10], w[11], offset);
      w[39] = hc_bytealign (w[ 9], w[10], offset);
      w[38] = hc_bytealign (w[ 8], w[ 9], offset);
      w[37] = hc_bytealign (w[ 7], w[ 8], offset);
      w[36] = hc_bytealign (w[ 6], w[ 7], offset);
      w[35] = hc_bytealign (w[ 5], w[ 6], offset);
      w[34] = hc_bytealign (w[ 4], w[ 5], offset);
      w[33] = hc_bytealign (w[ 3], w[ 4], offset);
      w[32] = hc_bytealign (w[ 2], w[ 3], offset);
      w[31] = hc_bytealign (w[ 1], w[ 2], offset);
      w[30] = hc_bytealign (w[ 0], w[ 1], offset);
      w[29] = hc_bytealign (    0, w[ 0], offset);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_bytealign (w[32], w[33], offset);
      w[62] = hc_bytealign (w[31], w[32], offset);
      w[61] = hc_bytealign (w[30], w[31], offset);
      w[60] = hc_bytealign (w[29], w[30], offset);
      w[59] = hc_bytealign (w[28], w[29], offset);
      w[58] = hc_bytealign (w[27], w[28], offset);
      w[57] = hc_bytealign (w[26], w[27], offset);
      w[56] = hc_bytealign (w[25], w[26], offset);
      w[55] = hc_bytealign (w[24], w[25], offset);
      w[54] = hc_bytealign (w[23], w[24], offset);
      w[53] = hc_bytealign (w[22], w[23], offset);
      w[52] = hc_bytealign (w[21], w[22], offset);
      w[51] = hc_bytealign (w[20], w[21], offset);
      w[50] = hc_bytealign (w[19], w[20], offset);
      w[49] = hc_bytealign (w[18], w[19], offset);
      w[48] = hc_bytealign (w[17], w[18], offset);
      w[47] = hc_bytealign (w[16], w[17], offset);
      w[46] = hc_bytealign (w[15], w[16], offset);
      w[45] = hc_bytealign (w[14], w[15], offset);
      w[44] = hc_bytealign (w[13], w[14], offset);
      w[43] = hc_bytealign (w[12], w[13], offset);
      w[42] = hc_bytealign (w[11], w[12], offset);
      w[41] = hc_bytealign (w[10], w[11], offset);
      w[40] = hc_bytealign (w[ 9], w[10], offset);
      w[39] = hc_bytealign (w[ 8], w[ 9], offset);
      w[38] = hc_bytealign (w[ 7], w[ 8], offset);
      w[37] = hc_bytealign (w[ 6], w[ 7], offset);
      w[36] = hc_bytealign (w[ 5], w[ 6], offset);
      w[35] = hc_bytealign (w[ 4], w[ 5], offset);
      w[34] = hc_bytealign (w[ 3], w[ 4], offset);
      w[33] = hc_bytealign (w[ 2], w[ 3], offset);
      w[32] = hc_bytealign (w[ 1], w[ 2], offset);
      w[31] = hc_bytealign (w[ 0], w[ 1], offset);
      w[30] = hc_bytealign (    0, w[ 0], offset);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_bytealign (w[31], w[32], offset);
      w[62] = hc_bytealign (w[30], w[31], offset);
      w[61] = hc_bytealign (w[29], w[30], offset);
      w[60] = hc_bytealign (w[28], w[29], offset);
      w[59] = hc_bytealign (w[27], w[28], offset);
      w[58] = hc_bytealign (w[26], w[27], offset);
      w[57] = hc_bytealign (w[25], w[26], offset);
      w[56] = hc_bytealign (w[24], w[25], offset);
      w[55] = hc_bytealign (w[23], w[24], offset);
      w[54] = hc_bytealign (w[22], w[23], offset);
      w[53] = hc_bytealign (w[21], w[22], offset);
      w[52] = hc_bytealign (w[20], w[21], offset);
      w[51] = hc_bytealign (w[19], w[20], offset);
      w[50] = hc_bytealign (w[18], w[19], offset);
      w[49] = hc_bytealign (w[17], w[18], offset);
      w[48] = hc_bytealign (w[16], w[17], offset);
      w[47] = hc_bytealign (w[15], w[16], offset);
      w[46] = hc_bytealign (w[14], w[15], offset);
      w[45] = hc_bytealign (w[13], w[14], offset);
      w[44] = hc_bytealign (w[12], w[13], offset);
      w[43] = hc_bytealign (w[11], w[12], offset);
      w[42] = hc_bytealign (w[10], w[11], offset);
      w[41] = hc_bytealign (w[ 9], w[10], offset);
      w[40] = hc_bytealign (w[ 8], w[ 9], offset);
      w[39] = hc_bytealign (w[ 7], w[ 8], offset);
      w[38] = hc_bytealign (w[ 6], w[ 7], offset);
      w[37] = hc_bytealign (w[ 5], w[ 6], offset);
      w[36] = hc_bytealign (w[ 4], w[ 5], offset);
      w[35] = hc_bytealign (w[ 3], w[ 4], offset);
      w[34] = hc_bytealign (w[ 2], w[ 3], offset);
      w[33] = hc_bytealign (w[ 1], w[ 2], offset);
      w[32] = hc_bytealign (w[ 0], w[ 1], offset);
      w[31] = hc_bytealign (    0, w[ 0], offset);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_bytealign (w[30], w[31], offset);
      w[62] = hc_bytealign (w[29], w[30], offset);
      w[61] = hc_bytealign (w[28], w[29], offset);
      w[60] = hc_bytealign (w[27], w[28], offset);
      w[59] = hc_bytealign (w[26], w[27], offset);
      w[58] = hc_bytealign (w[25], w[26], offset);
      w[57] = hc_bytealign (w[24], w[25], offset);
      w[56] = hc_bytealign (w[23], w[24], offset);
      w[55] = hc_bytealign (w[22], w[23], offset);
      w[54] = hc_bytealign (w[21], w[22], offset);
      w[53] = hc_bytealign (w[20], w[21], offset);
      w[52] = hc_bytealign (w[19], w[20], offset);
      w[51] = hc_bytealign (w[18], w[19], offset);
      w[50] = hc_bytealign (w[17], w[18], offset);
      w[49] = hc_bytealign (w[16], w[17], offset);
      w[48] = hc_bytealign (w[15], w[16], offset);
      w[47] = hc_bytealign (w[14], w[15], offset);
      w[46] = hc_bytealign (w[13], w[14], offset);
      w[45] = hc_bytealign (w[12], w[13], offset);
      w[44] = hc_bytealign (w[11], w[12], offset);
      w[43] = hc_bytealign (w[10], w[11], offset);
      w[42] = hc_bytealign (w[ 9], w[10], offset);
      w[41] = hc_bytealign (w[ 8], w[ 9], offset);
      w[40] = hc_bytealign (w[ 7], w[ 8], offset);
      w[39] = hc_bytealign (w[ 6], w[ 7], offset);
      w[38] = hc_bytealign (w[ 5], w[ 6], offset);
      w[37] = hc_bytealign (w[ 4], w[ 5], offset);
      w[36] = hc_bytealign (w[ 3], w[ 4], offset);
      w[35] = hc_bytealign (w[ 2], w[ 3], offset);
      w[34] = hc_bytealign (w[ 1], w[ 2], offset);
      w[33] = hc_bytealign (w[ 0], w[ 1], offset);
      w[32] = hc_bytealign (    0, w[ 0], offset);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_bytealign (w[29], w[30], offset);
      w[62] = hc_bytealign (w[28], w[29], offset);
      w[61] = hc_bytealign (w[27], w[28], offset);
      w[60] = hc_bytealign (w[26], w[27], offset);
      w[59] = hc_bytealign (w[25], w[26], offset);
      w[58] = hc_bytealign (w[24], w[25], offset);
      w[57] = hc_bytealign (w[23], w[24], offset);
      w[56] = hc_bytealign (w[22], w[23], offset);
      w[55] = hc_bytealign (w[21], w[22], offset);
      w[54] = hc_bytealign (w[20], w[21], offset);
      w[53] = hc_bytealign (w[19], w[20], offset);
      w[52] = hc_bytealign (w[18], w[19], offset);
      w[51] = hc_bytealign (w[17], w[18], offset);
      w[50] = hc_bytealign (w[16], w[17], offset);
      w[49] = hc_bytealign (w[15], w[16], offset);
      w[48] = hc_bytealign (w[14], w[15], offset);
      w[47] = hc_bytealign (w[13], w[14], offset);
      w[46] = hc_bytealign (w[12], w[13], offset);
      w[45] = hc_bytealign (w[11], w[12], offset);
      w[44] = hc_bytealign (w[10], w[11], offset);
      w[43] = hc_bytealign (w[ 9], w[10], offset);
      w[42] = hc_bytealign (w[ 8], w[ 9], offset);
      w[41] = hc_bytealign (w[ 7], w[ 8], offset);
      w[40] = hc_bytealign (w[ 6], w[ 7], offset);
      w[39] = hc_bytealign (w[ 5], w[ 6], offset);
      w[38] = hc_bytealign (w[ 4], w[ 5], offset);
      w[37] = hc_bytealign (w[ 3], w[ 4], offset);
      w[36] = hc_bytealign (w[ 2], w[ 3], offset);
      w[35] = hc_bytealign (w[ 1], w[ 2], offset);
      w[34] = hc_bytealign (w[ 0], w[ 1], offset);
      w[33] = hc_bytealign (    0, w[ 0], offset);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_bytealign (w[28], w[29], offset);
      w[62] = hc_bytealign (w[27], w[28], offset);
      w[61] = hc_bytealign (w[26], w[27], offset);
      w[60] = hc_bytealign (w[25], w[26], offset);
      w[59] = hc_bytealign (w[24], w[25], offset);
      w[58] = hc_bytealign (w[23], w[24], offset);
      w[57] = hc_bytealign (w[22], w[23], offset);
      w[56] = hc_bytealign (w[21], w[22], offset);
      w[55] = hc_bytealign (w[20], w[21], offset);
      w[54] = hc_bytealign (w[19], w[20], offset);
      w[53] = hc_bytealign (w[18], w[19], offset);
      w[52] = hc_bytealign (w[17], w[18], offset);
      w[51] = hc_bytealign (w[16], w[17], offset);
      w[50] = hc_bytealign (w[15], w[16], offset);
      w[49] = hc_bytealign (w[14], w[15], offset);
      w[48] = hc_bytealign (w[13], w[14], offset);
      w[47] = hc_bytealign (w[12], w[13], offset);
      w[46] = hc_bytealign (w[11], w[12], offset);
      w[45] = hc_bytealign (w[10], w[11], offset);
      w[44] = hc_bytealign (w[ 9], w[10], offset);
      w[43] = hc_bytealign (w[ 8], w[ 9], offset);
      w[42] = hc_bytealign (w[ 7], w[ 8], offset);
      w[41] = hc_bytealign (w[ 6], w[ 7], offset);
      w[40] = hc_bytealign (w[ 5], w[ 6], offset);
      w[39] = hc_bytealign (w[ 4], w[ 5], offset);
      w[38] = hc_bytealign (w[ 3], w[ 4], offset);
      w[37] = hc_bytealign (w[ 2], w[ 3], offset);
      w[36] = hc_bytealign (w[ 1], w[ 2], offset);
      w[35] = hc_bytealign (w[ 0], w[ 1], offset);
      w[34] = hc_bytealign (    0, w[ 0], offset);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_bytealign (w[27], w[28], offset);
      w[62] = hc_bytealign (w[26], w[27], offset);
      w[61] = hc_bytealign (w[25], w[26], offset);
      w[60] = hc_bytealign (w[24], w[25], offset);
      w[59] = hc_bytealign (w[23], w[24], offset);
      w[58] = hc_bytealign (w[22], w[23], offset);
      w[57] = hc_bytealign (w[21], w[22], offset);
      w[56] = hc_bytealign (w[20], w[21], offset);
      w[55] = hc_bytealign (w[19], w[20], offset);
      w[54] = hc_bytealign (w[18], w[19], offset);
      w[53] = hc_bytealign (w[17], w[18], offset);
      w[52] = hc_bytealign (w[16], w[17], offset);
      w[51] = hc_bytealign (w[15], w[16], offset);
      w[50] = hc_bytealign (w[14], w[15], offset);
      w[49] = hc_bytealign (w[13], w[14], offset);
      w[48] = hc_bytealign (w[12], w[13], offset);
      w[47] = hc_bytealign (w[11], w[12], offset);
      w[46] = hc_bytealign (w[10], w[11], offset);
      w[45] = hc_bytealign (w[ 9], w[10], offset);
      w[44] = hc_bytealign (w[ 8], w[ 9], offset);
      w[43] = hc_bytealign (w[ 7], w[ 8], offset);
      w[42] = hc_bytealign (w[ 6], w[ 7], offset);
      w[41] = hc_bytealign (w[ 5], w[ 6], offset);
      w[40] = hc_bytealign (w[ 4], w[ 5], offset);
      w[39] = hc_bytealign (w[ 3], w[ 4], offset);
      w[38] = hc_bytealign (w[ 2], w[ 3], offset);
      w[37] = hc_bytealign (w[ 1], w[ 2], offset);
      w[36] = hc_bytealign (w[ 0], w[ 1], offset);
      w[35] = hc_bytealign (    0, w[ 0], offset);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_bytealign (w[26], w[27], offset);
      w[62] = hc_bytealign (w[25], w[26], offset);
      w[61] = hc_bytealign (w[24], w[25], offset);
      w[60] = hc_bytealign (w[23], w[24], offset);
      w[59] = hc_bytealign (w[22], w[23], offset);
      w[58] = hc_bytealign (w[21], w[22], offset);
      w[57] = hc_bytealign (w[20], w[21], offset);
      w[56] = hc_bytealign (w[19], w[20], offset);
      w[55] = hc_bytealign (w[18], w[19], offset);
      w[54] = hc_bytealign (w[17], w[18], offset);
      w[53] = hc_bytealign (w[16], w[17], offset);
      w[52] = hc_bytealign (w[15], w[16], offset);
      w[51] = hc_bytealign (w[14], w[15], offset);
      w[50] = hc_bytealign (w[13], w[14], offset);
      w[49] = hc_bytealign (w[12], w[13], offset);
      w[48] = hc_bytealign (w[11], w[12], offset);
      w[47] = hc_bytealign (w[10], w[11], offset);
      w[46] = hc_bytealign (w[ 9], w[10], offset);
      w[45] = hc_bytealign (w[ 8], w[ 9], offset);
      w[44] = hc_bytealign (w[ 7], w[ 8], offset);
      w[43] = hc_bytealign (w[ 6], w[ 7], offset);
      w[42] = hc_bytealign (w[ 5], w[ 6], offset);
      w[41] = hc_bytealign (w[ 4], w[ 5], offset);
      w[40] = hc_bytealign (w[ 3], w[ 4], offset);
      w[39] = hc_bytealign (w[ 2], w[ 3], offset);
      w[38] = hc_bytealign (w[ 1], w[ 2], offset);
      w[37] = hc_bytealign (w[ 0], w[ 1], offset);
      w[36] = hc_bytealign (    0, w[ 0], offset);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_bytealign (w[25], w[26], offset);
      w[62] = hc_bytealign (w[24], w[25], offset);
      w[61] = hc_bytealign (w[23], w[24], offset);
      w[60] = hc_bytealign (w[22], w[23], offset);
      w[59] = hc_bytealign (w[21], w[22], offset);
      w[58] = hc_bytealign (w[20], w[21], offset);
      w[57] = hc_bytealign (w[19], w[20], offset);
      w[56] = hc_bytealign (w[18], w[19], offset);
      w[55] = hc_bytealign (w[17], w[18], offset);
      w[54] = hc_bytealign (w[16], w[17], offset);
      w[53] = hc_bytealign (w[15], w[16], offset);
      w[52] = hc_bytealign (w[14], w[15], offset);
      w[51] = hc_bytealign (w[13], w[14], offset);
      w[50] = hc_bytealign (w[12], w[13], offset);
      w[49] = hc_bytealign (w[11], w[12], offset);
      w[48] = hc_bytealign (w[10], w[11], offset);
      w[47] = hc_bytealign (w[ 9], w[10], offset);
      w[46] = hc_bytealign (w[ 8], w[ 9], offset);
      w[45] = hc_bytealign (w[ 7], w[ 8], offset);
      w[44] = hc_bytealign (w[ 6], w[ 7], offset);
      w[43] = hc_bytealign (w[ 5], w[ 6], offset);
      w[42] = hc_bytealign (w[ 4], w[ 5], offset);
      w[41] = hc_bytealign (w[ 3], w[ 4], offset);
      w[40] = hc_bytealign (w[ 2], w[ 3], offset);
      w[39] = hc_bytealign (w[ 1], w[ 2], offset);
      w[38] = hc_bytealign (w[ 0], w[ 1], offset);
      w[37] = hc_bytealign (    0, w[ 0], offset);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_bytealign (w[24], w[25], offset);
      w[62] = hc_bytealign (w[23], w[24], offset);
      w[61] = hc_bytealign (w[22], w[23], offset);
      w[60] = hc_bytealign (w[21], w[22], offset);
      w[59] = hc_bytealign (w[20], w[21], offset);
      w[58] = hc_bytealign (w[19], w[20], offset);
      w[57] = hc_bytealign (w[18], w[19], offset);
      w[56] = hc_bytealign (w[17], w[18], offset);
      w[55] = hc_bytealign (w[16], w[17], offset);
      w[54] = hc_bytealign (w[15], w[16], offset);
      w[53] = hc_bytealign (w[14], w[15], offset);
      w[52] = hc_bytealign (w[13], w[14], offset);
      w[51] = hc_bytealign (w[12], w[13], offset);
      w[50] = hc_bytealign (w[11], w[12], offset);
      w[49] = hc_bytealign (w[10], w[11], offset);
      w[48] = hc_bytealign (w[ 9], w[10], offset);
      w[47] = hc_bytealign (w[ 8], w[ 9], offset);
      w[46] = hc_bytealign (w[ 7], w[ 8], offset);
      w[45] = hc_bytealign (w[ 6], w[ 7], offset);
      w[44] = hc_bytealign (w[ 5], w[ 6], offset);
      w[43] = hc_bytealign (w[ 4], w[ 5], offset);
      w[42] = hc_bytealign (w[ 3], w[ 4], offset);
      w[41] = hc_bytealign (w[ 2], w[ 3], offset);
      w[40] = hc_bytealign (w[ 1], w[ 2], offset);
      w[39] = hc_bytealign (w[ 0], w[ 1], offset);
      w[38] = hc_bytealign (    0, w[ 0], offset);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_bytealign (w[23], w[24], offset);
      w[62] = hc_bytealign (w[22], w[23], offset);
      w[61] = hc_bytealign (w[21], w[22], offset);
      w[60] = hc_bytealign (w[20], w[21], offset);
      w[59] = hc_bytealign (w[19], w[20], offset);
      w[58] = hc_bytealign (w[18], w[19], offset);
      w[57] = hc_bytealign (w[17], w[18], offset);
      w[56] = hc_bytealign (w[16], w[17], offset);
      w[55] = hc_bytealign (w[15], w[16], offset);
      w[54] = hc_bytealign (w[14], w[15], offset);
      w[53] = hc_bytealign (w[13], w[14], offset);
      w[52] = hc_bytealign (w[12], w[13], offset);
      w[51] = hc_bytealign (w[11], w[12], offset);
      w[50] = hc_bytealign (w[10], w[11], offset);
      w[49] = hc_bytealign (w[ 9], w[10], offset);
      w[48] = hc_bytealign (w[ 8], w[ 9], offset);
      w[47] = hc_bytealign (w[ 7], w[ 8], offset);
      w[46] = hc_bytealign (w[ 6], w[ 7], offset);
      w[45] = hc_bytealign (w[ 5], w[ 6], offset);
      w[44] = hc_bytealign (w[ 4], w[ 5], offset);
      w[43] = hc_bytealign (w[ 3], w[ 4], offset);
      w[42] = hc_bytealign (w[ 2], w[ 3], offset);
      w[41] = hc_bytealign (w[ 1], w[ 2], offset);
      w[40] = hc_bytealign (w[ 0], w[ 1], offset);
      w[39] = hc_bytealign (    0, w[ 0], offset);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_bytealign (w[22], w[23], offset);
      w[62] = hc_bytealign (w[21], w[22], offset);
      w[61] = hc_bytealign (w[20], w[21], offset);
      w[60] = hc_bytealign (w[19], w[20], offset);
      w[59] = hc_bytealign (w[18], w[19], offset);
      w[58] = hc_bytealign (w[17], w[18], offset);
      w[57] = hc_bytealign (w[16], w[17], offset);
      w[56] = hc_bytealign (w[15], w[16], offset);
      w[55] = hc_bytealign (w[14], w[15], offset);
      w[54] = hc_bytealign (w[13], w[14], offset);
      w[53] = hc_bytealign (w[12], w[13], offset);
      w[52] = hc_bytealign (w[11], w[12], offset);
      w[51] = hc_bytealign (w[10], w[11], offset);
      w[50] = hc_bytealign (w[ 9], w[10], offset);
      w[49] = hc_bytealign (w[ 8], w[ 9], offset);
      w[48] = hc_bytealign (w[ 7], w[ 8], offset);
      w[47] = hc_bytealign (w[ 6], w[ 7], offset);
      w[46] = hc_bytealign (w[ 5], w[ 6], offset);
      w[45] = hc_bytealign (w[ 4], w[ 5], offset);
      w[44] = hc_bytealign (w[ 3], w[ 4], offset);
      w[43] = hc_bytealign (w[ 2], w[ 3], offset);
      w[42] = hc_bytealign (w[ 1], w[ 2], offset);
      w[41] = hc_bytealign (w[ 0], w[ 1], offset);
      w[40] = hc_bytealign (    0, w[ 0], offset);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_bytealign (w[21], w[22], offset);
      w[62] = hc_bytealign (w[20], w[21], offset);
      w[61] = hc_bytealign (w[19], w[20], offset);
      w[60] = hc_bytealign (w[18], w[19], offset);
      w[59] = hc_bytealign (w[17], w[18], offset);
      w[58] = hc_bytealign (w[16], w[17], offset);
      w[57] = hc_bytealign (w[15], w[16], offset);
      w[56] = hc_bytealign (w[14], w[15], offset);
      w[55] = hc_bytealign (w[13], w[14], offset);
      w[54] = hc_bytealign (w[12], w[13], offset);
      w[53] = hc_bytealign (w[11], w[12], offset);
      w[52] = hc_bytealign (w[10], w[11], offset);
      w[51] = hc_bytealign (w[ 9], w[10], offset);
      w[50] = hc_bytealign (w[ 8], w[ 9], offset);
      w[49] = hc_bytealign (w[ 7], w[ 8], offset);
      w[48] = hc_bytealign (w[ 6], w[ 7], offset);
      w[47] = hc_bytealign (w[ 5], w[ 6], offset);
      w[46] = hc_bytealign (w[ 4], w[ 5], offset);
      w[45] = hc_bytealign (w[ 3], w[ 4], offset);
      w[44] = hc_bytealign (w[ 2], w[ 3], offset);
      w[43] = hc_bytealign (w[ 1], w[ 2], offset);
      w[42] = hc_bytealign (w[ 0], w[ 1], offset);
      w[41] = hc_bytealign (    0, w[ 0], offset);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_bytealign (w[20], w[21], offset);
      w[62] = hc_bytealign (w[19], w[20], offset);
      w[61] = hc_bytealign (w[18], w[19], offset);
      w[60] = hc_bytealign (w[17], w[18], offset);
      w[59] = hc_bytealign (w[16], w[17], offset);
      w[58] = hc_bytealign (w[15], w[16], offset);
      w[57] = hc_bytealign (w[14], w[15], offset);
      w[56] = hc_bytealign (w[13], w[14], offset);
      w[55] = hc_bytealign (w[12], w[13], offset);
      w[54] = hc_bytealign (w[11], w[12], offset);
      w[53] = hc_bytealign (w[10], w[11], offset);
      w[52] = hc_bytealign (w[ 9], w[10], offset);
      w[51] = hc_bytealign (w[ 8], w[ 9], offset);
      w[50] = hc_bytealign (w[ 7], w[ 8], offset);
      w[49] = hc_bytealign (w[ 6], w[ 7], offset);
      w[48] = hc_bytealign (w[ 5], w[ 6], offset);
      w[47] = hc_bytealign (w[ 4], w[ 5], offset);
      w[46] = hc_bytealign (w[ 3], w[ 4], offset);
      w[45] = hc_bytealign (w[ 2], w[ 3], offset);
      w[44] = hc_bytealign (w[ 1], w[ 2], offset);
      w[43] = hc_bytealign (w[ 0], w[ 1], offset);
      w[42] = hc_bytealign (    0, w[ 0], offset);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_bytealign (w[19], w[20], offset);
      w[62] = hc_bytealign (w[18], w[19], offset);
      w[61] = hc_bytealign (w[17], w[18], offset);
      w[60] = hc_bytealign (w[16], w[17], offset);
      w[59] = hc_bytealign (w[15], w[16], offset);
      w[58] = hc_bytealign (w[14], w[15], offset);
      w[57] = hc_bytealign (w[13], w[14], offset);
      w[56] = hc_bytealign (w[12], w[13], offset);
      w[55] = hc_bytealign (w[11], w[12], offset);
      w[54] = hc_bytealign (w[10], w[11], offset);
      w[53] = hc_bytealign (w[ 9], w[10], offset);
      w[52] = hc_bytealign (w[ 8], w[ 9], offset);
      w[51] = hc_bytealign (w[ 7], w[ 8], offset);
      w[50] = hc_bytealign (w[ 6], w[ 7], offset);
      w[49] = hc_bytealign (w[ 5], w[ 6], offset);
      w[48] = hc_bytealign (w[ 4], w[ 5], offset);
      w[47] = hc_bytealign (w[ 3], w[ 4], offset);
      w[46] = hc_bytealign (w[ 2], w[ 3], offset);
      w[45] = hc_bytealign (w[ 1], w[ 2], offset);
      w[44] = hc_bytealign (w[ 0], w[ 1], offset);
      w[43] = hc_bytealign (    0, w[ 0], offset);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_bytealign (w[18], w[19], offset);
      w[62] = hc_bytealign (w[17], w[18], offset);
      w[61] = hc_bytealign (w[16], w[17], offset);
      w[60] = hc_bytealign (w[15], w[16], offset);
      w[59] = hc_bytealign (w[14], w[15], offset);
      w[58] = hc_bytealign (w[13], w[14], offset);
      w[57] = hc_bytealign (w[12], w[13], offset);
      w[56] = hc_bytealign (w[11], w[12], offset);
      w[55] = hc_bytealign (w[10], w[11], offset);
      w[54] = hc_bytealign (w[ 9], w[10], offset);
      w[53] = hc_bytealign (w[ 8], w[ 9], offset);
      w[52] = hc_bytealign (w[ 7], w[ 8], offset);
      w[51] = hc_bytealign (w[ 6], w[ 7], offset);
      w[50] = hc_bytealign (w[ 5], w[ 6], offset);
      w[49] = hc_bytealign (w[ 4], w[ 5], offset);
      w[48] = hc_bytealign (w[ 3], w[ 4], offset);
      w[47] = hc_bytealign (w[ 2], w[ 3], offset);
      w[46] = hc_bytealign (w[ 1], w[ 2], offset);
      w[45] = hc_bytealign (w[ 0], w[ 1], offset);
      w[44] = hc_bytealign (    0, w[ 0], offset);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_bytealign (w[17], w[18], offset);
      w[62] = hc_bytealign (w[16], w[17], offset);
      w[61] = hc_bytealign (w[15], w[16], offset);
      w[60] = hc_bytealign (w[14], w[15], offset);
      w[59] = hc_bytealign (w[13], w[14], offset);
      w[58] = hc_bytealign (w[12], w[13], offset);
      w[57] = hc_bytealign (w[11], w[12], offset);
      w[56] = hc_bytealign (w[10], w[11], offset);
      w[55] = hc_bytealign (w[ 9], w[10], offset);
      w[54] = hc_bytealign (w[ 8], w[ 9], offset);
      w[53] = hc_bytealign (w[ 7], w[ 8], offset);
      w[52] = hc_bytealign (w[ 6], w[ 7], offset);
      w[51] = hc_bytealign (w[ 5], w[ 6], offset);
      w[50] = hc_bytealign (w[ 4], w[ 5], offset);
      w[49] = hc_bytealign (w[ 3], w[ 4], offset);
      w[48] = hc_bytealign (w[ 2], w[ 3], offset);
      w[47] = hc_bytealign (w[ 1], w[ 2], offset);
      w[46] = hc_bytealign (w[ 0], w[ 1], offset);
      w[45] = hc_bytealign (    0, w[ 0], offset);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_bytealign (w[16], w[17], offset);
      w[62] = hc_bytealign (w[15], w[16], offset);
      w[61] = hc_bytealign (w[14], w[15], offset);
      w[60] = hc_bytealign (w[13], w[14], offset);
      w[59] = hc_bytealign (w[12], w[13], offset);
      w[58] = hc_bytealign (w[11], w[12], offset);
      w[57] = hc_bytealign (w[10], w[11], offset);
      w[56] = hc_bytealign (w[ 9], w[10], offset);
      w[55] = hc_bytealign (w[ 8], w[ 9], offset);
      w[54] = hc_bytealign (w[ 7], w[ 8], offset);
      w[53] = hc_bytealign (w[ 6], w[ 7], offset);
      w[52] = hc_bytealign (w[ 5], w[ 6], offset);
      w[51] = hc_bytealign (w[ 4], w[ 5], offset);
      w[50] = hc_bytealign (w[ 3], w[ 4], offset);
      w[49] = hc_bytealign (w[ 2], w[ 3], offset);
      w[48] = hc_bytealign (w[ 1], w[ 2], offset);
      w[47] = hc_bytealign (w[ 0], w[ 1], offset);
      w[46] = hc_bytealign (    0, w[ 0], offset);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_bytealign (w[15], w[16], offset);
      w[62] = hc_bytealign (w[14], w[15], offset);
      w[61] = hc_bytealign (w[13], w[14], offset);
      w[60] = hc_bytealign (w[12], w[13], offset);
      w[59] = hc_bytealign (w[11], w[12], offset);
      w[58] = hc_bytealign (w[10], w[11], offset);
      w[57] = hc_bytealign (w[ 9], w[10], offset);
      w[56] = hc_bytealign (w[ 8], w[ 9], offset);
      w[55] = hc_bytealign (w[ 7], w[ 8], offset);
      w[54] = hc_bytealign (w[ 6], w[ 7], offset);
      w[53] = hc_bytealign (w[ 5], w[ 6], offset);
      w[52] = hc_bytealign (w[ 4], w[ 5], offset);
      w[51] = hc_bytealign (w[ 3], w[ 4], offset);
      w[50] = hc_bytealign (w[ 2], w[ 3], offset);
      w[49] = hc_bytealign (w[ 1], w[ 2], offset);
      w[48] = hc_bytealign (w[ 0], w[ 1], offset);
      w[47] = hc_bytealign (    0, w[ 0], offset);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_bytealign (w[14], w[15], offset);
      w[62] = hc_bytealign (w[13], w[14], offset);
      w[61] = hc_bytealign (w[12], w[13], offset);
      w[60] = hc_bytealign (w[11], w[12], offset);
      w[59] = hc_bytealign (w[10], w[11], offset);
      w[58] = hc_bytealign (w[ 9], w[10], offset);
      w[57] = hc_bytealign (w[ 8], w[ 9], offset);
      w[56] = hc_bytealign (w[ 7], w[ 8], offset);
      w[55] = hc_bytealign (w[ 6], w[ 7], offset);
      w[54] = hc_bytealign (w[ 5], w[ 6], offset);
      w[53] = hc_bytealign (w[ 4], w[ 5], offset);
      w[52] = hc_bytealign (w[ 3], w[ 4], offset);
      w[51] = hc_bytealign (w[ 2], w[ 3], offset);
      w[50] = hc_bytealign (w[ 1], w[ 2], offset);
      w[49] = hc_bytealign (w[ 0], w[ 1], offset);
      w[48] = hc_bytealign (    0, w[ 0], offset);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_bytealign (w[13], w[14], offset);
      w[62] = hc_bytealign (w[12], w[13], offset);
      w[61] = hc_bytealign (w[11], w[12], offset);
      w[60] = hc_bytealign (w[10], w[11], offset);
      w[59] = hc_bytealign (w[ 9], w[10], offset);
      w[58] = hc_bytealign (w[ 8], w[ 9], offset);
      w[57] = hc_bytealign (w[ 7], w[ 8], offset);
      w[56] = hc_bytealign (w[ 6], w[ 7], offset);
      w[55] = hc_bytealign (w[ 5], w[ 6], offset);
      w[54] = hc_bytealign (w[ 4], w[ 5], offset);
      w[53] = hc_bytealign (w[ 3], w[ 4], offset);
      w[52] = hc_bytealign (w[ 2], w[ 3], offset);
      w[51] = hc_bytealign (w[ 1], w[ 2], offset);
      w[50] = hc_bytealign (w[ 0], w[ 1], offset);
      w[49] = hc_bytealign (    0, w[ 0], offset);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_bytealign (w[12], w[13], offset);
      w[62] = hc_bytealign (w[11], w[12], offset);
      w[61] = hc_bytealign (w[10], w[11], offset);
      w[60] = hc_bytealign (w[ 9], w[10], offset);
      w[59] = hc_bytealign (w[ 8], w[ 9], offset);
      w[58] = hc_bytealign (w[ 7], w[ 8], offset);
      w[57] = hc_bytealign (w[ 6], w[ 7], offset);
      w[56] = hc_bytealign (w[ 5], w[ 6], offset);
      w[55] = hc_bytealign (w[ 4], w[ 5], offset);
      w[54] = hc_bytealign (w[ 3], w[ 4], offset);
      w[53] = hc_bytealign (w[ 2], w[ 3], offset);
      w[52] = hc_bytealign (w[ 1], w[ 2], offset);
      w[51] = hc_bytealign (w[ 0], w[ 1], offset);
      w[50] = hc_bytealign (    0, w[ 0], offset);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_bytealign (w[11], w[12], offset);
      w[62] = hc_bytealign (w[10], w[11], offset);
      w[61] = hc_bytealign (w[ 9], w[10], offset);
      w[60] = hc_bytealign (w[ 8], w[ 9], offset);
      w[59] = hc_bytealign (w[ 7], w[ 8], offset);
      w[58] = hc_bytealign (w[ 6], w[ 7], offset);
      w[57] = hc_bytealign (w[ 5], w[ 6], offset);
      w[56] = hc_bytealign (w[ 4], w[ 5], offset);
      w[55] = hc_bytealign (w[ 3], w[ 4], offset);
      w[54] = hc_bytealign (w[ 2], w[ 3], offset);
      w[53] = hc_bytealign (w[ 1], w[ 2], offset);
      w[52] = hc_bytealign (w[ 0], w[ 1], offset);
      w[51] = hc_bytealign (    0, w[ 0], offset);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_bytealign (w[10], w[11], offset);
      w[62] = hc_bytealign (w[ 9], w[10], offset);
      w[61] = hc_bytealign (w[ 8], w[ 9], offset);
      w[60] = hc_bytealign (w[ 7], w[ 8], offset);
      w[59] = hc_bytealign (w[ 6], w[ 7], offset);
      w[58] = hc_bytealign (w[ 5], w[ 6], offset);
      w[57] = hc_bytealign (w[ 4], w[ 5], offset);
      w[56] = hc_bytealign (w[ 3], w[ 4], offset);
      w[55] = hc_bytealign (w[ 2], w[ 3], offset);
      w[54] = hc_bytealign (w[ 1], w[ 2], offset);
      w[53] = hc_bytealign (w[ 0], w[ 1], offset);
      w[52] = hc_bytealign (    0, w[ 0], offset);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_bytealign (w[ 9], w[10], offset);
      w[62] = hc_bytealign (w[ 8], w[ 9], offset);
      w[61] = hc_bytealign (w[ 7], w[ 8], offset);
      w[60] = hc_bytealign (w[ 6], w[ 7], offset);
      w[59] = hc_bytealign (w[ 5], w[ 6], offset);
      w[58] = hc_bytealign (w[ 4], w[ 5], offset);
      w[57] = hc_bytealign (w[ 3], w[ 4], offset);
      w[56] = hc_bytealign (w[ 2], w[ 3], offset);
      w[55] = hc_bytealign (w[ 1], w[ 2], offset);
      w[54] = hc_bytealign (w[ 0], w[ 1], offset);
      w[53] = hc_bytealign (    0, w[ 0], offset);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_bytealign (w[ 8], w[ 9], offset);
      w[62] = hc_bytealign (w[ 7], w[ 8], offset);
      w[61] = hc_bytealign (w[ 6], w[ 7], offset);
      w[60] = hc_bytealign (w[ 5], w[ 6], offset);
      w[59] = hc_bytealign (w[ 4], w[ 5], offset);
      w[58] = hc_bytealign (w[ 3], w[ 4], offset);
      w[57] = hc_bytealign (w[ 2], w[ 3], offset);
      w[56] = hc_bytealign (w[ 1], w[ 2], offset);
      w[55] = hc_bytealign (w[ 0], w[ 1], offset);
      w[54] = hc_bytealign (    0, w[ 0], offset);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_bytealign (w[ 7], w[ 8], offset);
      w[62] = hc_bytealign (w[ 6], w[ 7], offset);
      w[61] = hc_bytealign (w[ 5], w[ 6], offset);
      w[60] = hc_bytealign (w[ 4], w[ 5], offset);
      w[59] = hc_bytealign (w[ 3], w[ 4], offset);
      w[58] = hc_bytealign (w[ 2], w[ 3], offset);
      w[57] = hc_bytealign (w[ 1], w[ 2], offset);
      w[56] = hc_bytealign (w[ 0], w[ 1], offset);
      w[55] = hc_bytealign (    0, w[ 0], offset);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_bytealign (w[ 6], w[ 7], offset);
      w[62] = hc_bytealign (w[ 5], w[ 6], offset);
      w[61] = hc_bytealign (w[ 4], w[ 5], offset);
      w[60] = hc_bytealign (w[ 3], w[ 4], offset);
      w[59] = hc_bytealign (w[ 2], w[ 3], offset);
      w[58] = hc_bytealign (w[ 1], w[ 2], offset);
      w[57] = hc_bytealign (w[ 0], w[ 1], offset);
      w[56] = hc_bytealign (    0, w[ 0], offset);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_bytealign (w[ 5], w[ 6], offset);
      w[62] = hc_bytealign (w[ 4], w[ 5], offset);
      w[61] = hc_bytealign (w[ 3], w[ 4], offset);
      w[60] = hc_bytealign (w[ 2], w[ 3], offset);
      w[59] = hc_bytealign (w[ 1], w[ 2], offset);
      w[58] = hc_bytealign (w[ 0], w[ 1], offset);
      w[57] = hc_bytealign (    0, w[ 0], offset);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_bytealign (w[ 4], w[ 5], offset);
      w[62] = hc_bytealign (w[ 3], w[ 4], offset);
      w[61] = hc_bytealign (w[ 2], w[ 3], offset);
      w[60] = hc_bytealign (w[ 1], w[ 2], offset);
      w[59] = hc_bytealign (w[ 0], w[ 1], offset);
      w[58] = hc_bytealign (    0, w[ 0], offset);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_bytealign (w[ 3], w[ 4], offset);
      w[62] = hc_bytealign (w[ 2], w[ 3], offset);
      w[61] = hc_bytealign (w[ 1], w[ 2], offset);
      w[60] = hc_bytealign (w[ 0], w[ 1], offset);
      w[59] = hc_bytealign (    0, w[ 0], offset);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_bytealign (w[ 2], w[ 3], offset);
      w[62] = hc_bytealign (w[ 1], w[ 2], offset);
      w[61] = hc_bytealign (w[ 0], w[ 1], offset);
      w[60] = hc_bytealign (    0, w[ 0], offset);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_bytealign (w[ 1], w[ 2], offset);
      w[62] = hc_bytealign (w[ 0], w[ 1], offset);
      w[61] = hc_bytealign (    0, w[ 0], offset);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_bytealign (w[ 0], w[ 1], offset);
      w[62] = hc_bytealign (    0, w[ 0], offset);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_bytealign (    0, w[ 0], offset);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w[63] = hc_byte_perm (w[63], w[62], selector);
      w[62] = hc_byte_perm (w[62], w[61], selector);
      w[61] = hc_byte_perm (w[61], w[60], selector);
      w[60] = hc_byte_perm (w[60], w[59], selector);
      w[59] = hc_byte_perm (w[59], w[58], selector);
      w[58] = hc_byte_perm (w[58], w[57], selector);
      w[57] = hc_byte_perm (w[57], w[56], selector);
      w[56] = hc_byte_perm (w[56], w[55], selector);
      w[55] = hc_byte_perm (w[55], w[54], selector);
      w[54] = hc_byte_perm (w[54], w[53], selector);
      w[53] = hc_byte_perm (w[53], w[52], selector);
      w[52] = hc_byte_perm (w[52], w[51], selector);
      w[51] = hc_byte_perm (w[51], w[50], selector);
      w[50] = hc_byte_perm (w[50], w[49], selector);
      w[49] = hc_byte_perm (w[49], w[48], selector);
      w[48] = hc_byte_perm (w[48], w[47], selector);
      w[47] = hc_byte_perm (w[47], w[46], selector);
      w[46] = hc_byte_perm (w[46], w[45], selector);
      w[45] = hc_byte_perm (w[45], w[44], selector);
      w[44] = hc_byte_perm (w[44], w[43], selector);
      w[43] = hc_byte_perm (w[43], w[42], selector);
      w[42] = hc_byte_perm (w[42], w[41], selector);
      w[41] = hc_byte_perm (w[41], w[40], selector);
      w[40] = hc_byte_perm (w[40], w[39], selector);
      w[39] = hc_byte_perm (w[39], w[38], selector);
      w[38] = hc_byte_perm (w[38], w[37], selector);
      w[37] = hc_byte_perm (w[37], w[36], selector);
      w[36] = hc_byte_perm (w[36], w[35], selector);
      w[35] = hc_byte_perm (w[35], w[34], selector);
      w[34] = hc_byte_perm (w[34], w[33], selector);
      w[33] = hc_byte_perm (w[33], w[32], selector);
      w[32] = hc_byte_perm (w[32], w[31], selector);
      w[31] = hc_byte_perm (w[31], w[30], selector);
      w[30] = hc_byte_perm (w[30], w[29], selector);
      w[29] = hc_byte_perm (w[29], w[28], selector);
      w[28] = hc_byte_perm (w[28], w[27], selector);
      w[27] = hc_byte_perm (w[27], w[26], selector);
      w[26] = hc_byte_perm (w[26], w[25], selector);
      w[25] = hc_byte_perm (w[25], w[24], selector);
      w[24] = hc_byte_perm (w[24], w[23], selector);
      w[23] = hc_byte_perm (w[23], w[22], selector);
      w[22] = hc_byte_perm (w[22], w[21], selector);
      w[21] = hc_byte_perm (w[21], w[20], selector);
      w[20] = hc_byte_perm (w[20], w[19], selector);
      w[19] = hc_byte_perm (w[19], w[18], selector);
      w[18] = hc_byte_perm (w[18], w[17], selector);
      w[17] = hc_byte_perm (w[17], w[16], selector);
      w[16] = hc_byte_perm (w[16], w[15], selector);
      w[15] = hc_byte_perm (w[15], w[14], selector);
      w[14] = hc_byte_perm (w[14], w[13], selector);
      w[13] = hc_byte_perm (w[13], w[12], selector);
      w[12] = hc_byte_perm (w[12], w[11], selector);
      w[11] = hc_byte_perm (w[11], w[10], selector);
      w[10] = hc_byte_perm (w[10], w[ 9], selector);
      w[ 9] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[ 8] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[ 7] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[ 6] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[ 5] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[ 4] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[ 3] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 2] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 1] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 0] = hc_byte_perm (w[ 0],     0, selector);

      break;

    case  1:
      w[63] = hc_byte_perm (w[62], w[61], selector);
      w[62] = hc_byte_perm (w[61], w[60], selector);
      w[61] = hc_byte_perm (w[60], w[59], selector);
      w[60] = hc_byte_perm (w[59], w[58], selector);
      w[59] = hc_byte_perm (w[58], w[57], selector);
      w[58] = hc_byte_perm (w[57], w[56], selector);
      w[57] = hc_byte_perm (w[56], w[55], selector);
      w[56] = hc_byte_perm (w[55], w[54], selector);
      w[55] = hc_byte_perm (w[54], w[53], selector);
      w[54] = hc_byte_perm (w[53], w[52], selector);
      w[53] = hc_byte_perm (w[52], w[51], selector);
      w[52] = hc_byte_perm (w[51], w[50], selector);
      w[51] = hc_byte_perm (w[50], w[49], selector);
      w[50] = hc_byte_perm (w[49], w[48], selector);
      w[49] = hc_byte_perm (w[48], w[47], selector);
      w[48] = hc_byte_perm (w[47], w[46], selector);
      w[47] = hc_byte_perm (w[46], w[45], selector);
      w[46] = hc_byte_perm (w[45], w[44], selector);
      w[45] = hc_byte_perm (w[44], w[43], selector);
      w[44] = hc_byte_perm (w[43], w[42], selector);
      w[43] = hc_byte_perm (w[42], w[41], selector);
      w[42] = hc_byte_perm (w[41], w[40], selector);
      w[41] = hc_byte_perm (w[40], w[39], selector);
      w[40] = hc_byte_perm (w[39], w[38], selector);
      w[39] = hc_byte_perm (w[38], w[37], selector);
      w[38] = hc_byte_perm (w[37], w[36], selector);
      w[37] = hc_byte_perm (w[36], w[35], selector);
      w[36] = hc_byte_perm (w[35], w[34], selector);
      w[35] = hc_byte_perm (w[34], w[33], selector);
      w[34] = hc_byte_perm (w[33], w[32], selector);
      w[33] = hc_byte_perm (w[32], w[31], selector);
      w[32] = hc_byte_perm (w[31], w[30], selector);
      w[31] = hc_byte_perm (w[30], w[29], selector);
      w[30] = hc_byte_perm (w[29], w[28], selector);
      w[29] = hc_byte_perm (w[28], w[27], selector);
      w[28] = hc_byte_perm (w[27], w[26], selector);
      w[27] = hc_byte_perm (w[26], w[25], selector);
      w[26] = hc_byte_perm (w[25], w[24], selector);
      w[25] = hc_byte_perm (w[24], w[23], selector);
      w[24] = hc_byte_perm (w[23], w[22], selector);
      w[23] = hc_byte_perm (w[22], w[21], selector);
      w[22] = hc_byte_perm (w[21], w[20], selector);
      w[21] = hc_byte_perm (w[20], w[19], selector);
      w[20] = hc_byte_perm (w[19], w[18], selector);
      w[19] = hc_byte_perm (w[18], w[17], selector);
      w[18] = hc_byte_perm (w[17], w[16], selector);
      w[17] = hc_byte_perm (w[16], w[15], selector);
      w[16] = hc_byte_perm (w[15], w[14], selector);
      w[15] = hc_byte_perm (w[14], w[13], selector);
      w[14] = hc_byte_perm (w[13], w[12], selector);
      w[13] = hc_byte_perm (w[12], w[11], selector);
      w[12] = hc_byte_perm (w[11], w[10], selector);
      w[11] = hc_byte_perm (w[10], w[ 9], selector);
      w[10] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[ 9] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[ 8] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[ 7] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[ 6] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[ 5] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[ 4] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 3] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 2] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 1] = hc_byte_perm (w[ 0],     0, selector);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_byte_perm (w[61], w[60], selector);
      w[62] = hc_byte_perm (w[60], w[59], selector);
      w[61] = hc_byte_perm (w[59], w[58], selector);
      w[60] = hc_byte_perm (w[58], w[57], selector);
      w[59] = hc_byte_perm (w[57], w[56], selector);
      w[58] = hc_byte_perm (w[56], w[55], selector);
      w[57] = hc_byte_perm (w[55], w[54], selector);
      w[56] = hc_byte_perm (w[54], w[53], selector);
      w[55] = hc_byte_perm (w[53], w[52], selector);
      w[54] = hc_byte_perm (w[52], w[51], selector);
      w[53] = hc_byte_perm (w[51], w[50], selector);
      w[52] = hc_byte_perm (w[50], w[49], selector);
      w[51] = hc_byte_perm (w[49], w[48], selector);
      w[50] = hc_byte_perm (w[48], w[47], selector);
      w[49] = hc_byte_perm (w[47], w[46], selector);
      w[48] = hc_byte_perm (w[46], w[45], selector);
      w[47] = hc_byte_perm (w[45], w[44], selector);
      w[46] = hc_byte_perm (w[44], w[43], selector);
      w[45] = hc_byte_perm (w[43], w[42], selector);
      w[44] = hc_byte_perm (w[42], w[41], selector);
      w[43] = hc_byte_perm (w[41], w[40], selector);
      w[42] = hc_byte_perm (w[40], w[39], selector);
      w[41] = hc_byte_perm (w[39], w[38], selector);
      w[40] = hc_byte_perm (w[38], w[37], selector);
      w[39] = hc_byte_perm (w[37], w[36], selector);
      w[38] = hc_byte_perm (w[36], w[35], selector);
      w[37] = hc_byte_perm (w[35], w[34], selector);
      w[36] = hc_byte_perm (w[34], w[33], selector);
      w[35] = hc_byte_perm (w[33], w[32], selector);
      w[34] = hc_byte_perm (w[32], w[31], selector);
      w[33] = hc_byte_perm (w[31], w[30], selector);
      w[32] = hc_byte_perm (w[30], w[29], selector);
      w[31] = hc_byte_perm (w[29], w[28], selector);
      w[30] = hc_byte_perm (w[28], w[27], selector);
      w[29] = hc_byte_perm (w[27], w[26], selector);
      w[28] = hc_byte_perm (w[26], w[25], selector);
      w[27] = hc_byte_perm (w[25], w[24], selector);
      w[26] = hc_byte_perm (w[24], w[23], selector);
      w[25] = hc_byte_perm (w[23], w[22], selector);
      w[24] = hc_byte_perm (w[22], w[21], selector);
      w[23] = hc_byte_perm (w[21], w[20], selector);
      w[22] = hc_byte_perm (w[20], w[19], selector);
      w[21] = hc_byte_perm (w[19], w[18], selector);
      w[20] = hc_byte_perm (w[18], w[17], selector);
      w[19] = hc_byte_perm (w[17], w[16], selector);
      w[18] = hc_byte_perm (w[16], w[15], selector);
      w[17] = hc_byte_perm (w[15], w[14], selector);
      w[16] = hc_byte_perm (w[14], w[13], selector);
      w[15] = hc_byte_perm (w[13], w[12], selector);
      w[14] = hc_byte_perm (w[12], w[11], selector);
      w[13] = hc_byte_perm (w[11], w[10], selector);
      w[12] = hc_byte_perm (w[10], w[ 9], selector);
      w[11] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[10] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[ 9] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[ 8] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[ 7] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[ 6] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[ 5] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 4] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 3] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 2] = hc_byte_perm (w[ 0],     0, selector);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_byte_perm (w[60], w[59], selector);
      w[62] = hc_byte_perm (w[59], w[58], selector);
      w[61] = hc_byte_perm (w[58], w[57], selector);
      w[60] = hc_byte_perm (w[57], w[56], selector);
      w[59] = hc_byte_perm (w[56], w[55], selector);
      w[58] = hc_byte_perm (w[55], w[54], selector);
      w[57] = hc_byte_perm (w[54], w[53], selector);
      w[56] = hc_byte_perm (w[53], w[52], selector);
      w[55] = hc_byte_perm (w[52], w[51], selector);
      w[54] = hc_byte_perm (w[51], w[50], selector);
      w[53] = hc_byte_perm (w[50], w[49], selector);
      w[52] = hc_byte_perm (w[49], w[48], selector);
      w[51] = hc_byte_perm (w[48], w[47], selector);
      w[50] = hc_byte_perm (w[47], w[46], selector);
      w[49] = hc_byte_perm (w[46], w[45], selector);
      w[48] = hc_byte_perm (w[45], w[44], selector);
      w[47] = hc_byte_perm (w[44], w[43], selector);
      w[46] = hc_byte_perm (w[43], w[42], selector);
      w[45] = hc_byte_perm (w[42], w[41], selector);
      w[44] = hc_byte_perm (w[41], w[40], selector);
      w[43] = hc_byte_perm (w[40], w[39], selector);
      w[42] = hc_byte_perm (w[39], w[38], selector);
      w[41] = hc_byte_perm (w[38], w[37], selector);
      w[40] = hc_byte_perm (w[37], w[36], selector);
      w[39] = hc_byte_perm (w[36], w[35], selector);
      w[38] = hc_byte_perm (w[35], w[34], selector);
      w[37] = hc_byte_perm (w[34], w[33], selector);
      w[36] = hc_byte_perm (w[33], w[32], selector);
      w[35] = hc_byte_perm (w[32], w[31], selector);
      w[34] = hc_byte_perm (w[31], w[30], selector);
      w[33] = hc_byte_perm (w[30], w[29], selector);
      w[32] = hc_byte_perm (w[29], w[28], selector);
      w[31] = hc_byte_perm (w[28], w[27], selector);
      w[30] = hc_byte_perm (w[27], w[26], selector);
      w[29] = hc_byte_perm (w[26], w[25], selector);
      w[28] = hc_byte_perm (w[25], w[24], selector);
      w[27] = hc_byte_perm (w[24], w[23], selector);
      w[26] = hc_byte_perm (w[23], w[22], selector);
      w[25] = hc_byte_perm (w[22], w[21], selector);
      w[24] = hc_byte_perm (w[21], w[20], selector);
      w[23] = hc_byte_perm (w[20], w[19], selector);
      w[22] = hc_byte_perm (w[19], w[18], selector);
      w[21] = hc_byte_perm (w[18], w[17], selector);
      w[20] = hc_byte_perm (w[17], w[16], selector);
      w[19] = hc_byte_perm (w[16], w[15], selector);
      w[18] = hc_byte_perm (w[15], w[14], selector);
      w[17] = hc_byte_perm (w[14], w[13], selector);
      w[16] = hc_byte_perm (w[13], w[12], selector);
      w[15] = hc_byte_perm (w[12], w[11], selector);
      w[14] = hc_byte_perm (w[11], w[10], selector);
      w[13] = hc_byte_perm (w[10], w[ 9], selector);
      w[12] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[11] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[10] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[ 9] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[ 8] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[ 7] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[ 6] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 5] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 4] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 3] = hc_byte_perm (w[ 0],     0, selector);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_byte_perm (w[59], w[58], selector);
      w[62] = hc_byte_perm (w[58], w[57], selector);
      w[61] = hc_byte_perm (w[57], w[56], selector);
      w[60] = hc_byte_perm (w[56], w[55], selector);
      w[59] = hc_byte_perm (w[55], w[54], selector);
      w[58] = hc_byte_perm (w[54], w[53], selector);
      w[57] = hc_byte_perm (w[53], w[52], selector);
      w[56] = hc_byte_perm (w[52], w[51], selector);
      w[55] = hc_byte_perm (w[51], w[50], selector);
      w[54] = hc_byte_perm (w[50], w[49], selector);
      w[53] = hc_byte_perm (w[49], w[48], selector);
      w[52] = hc_byte_perm (w[48], w[47], selector);
      w[51] = hc_byte_perm (w[47], w[46], selector);
      w[50] = hc_byte_perm (w[46], w[45], selector);
      w[49] = hc_byte_perm (w[45], w[44], selector);
      w[48] = hc_byte_perm (w[44], w[43], selector);
      w[47] = hc_byte_perm (w[43], w[42], selector);
      w[46] = hc_byte_perm (w[42], w[41], selector);
      w[45] = hc_byte_perm (w[41], w[40], selector);
      w[44] = hc_byte_perm (w[40], w[39], selector);
      w[43] = hc_byte_perm (w[39], w[38], selector);
      w[42] = hc_byte_perm (w[38], w[37], selector);
      w[41] = hc_byte_perm (w[37], w[36], selector);
      w[40] = hc_byte_perm (w[36], w[35], selector);
      w[39] = hc_byte_perm (w[35], w[34], selector);
      w[38] = hc_byte_perm (w[34], w[33], selector);
      w[37] = hc_byte_perm (w[33], w[32], selector);
      w[36] = hc_byte_perm (w[32], w[31], selector);
      w[35] = hc_byte_perm (w[31], w[30], selector);
      w[34] = hc_byte_perm (w[30], w[29], selector);
      w[33] = hc_byte_perm (w[29], w[28], selector);
      w[32] = hc_byte_perm (w[28], w[27], selector);
      w[31] = hc_byte_perm (w[27], w[26], selector);
      w[30] = hc_byte_perm (w[26], w[25], selector);
      w[29] = hc_byte_perm (w[25], w[24], selector);
      w[28] = hc_byte_perm (w[24], w[23], selector);
      w[27] = hc_byte_perm (w[23], w[22], selector);
      w[26] = hc_byte_perm (w[22], w[21], selector);
      w[25] = hc_byte_perm (w[21], w[20], selector);
      w[24] = hc_byte_perm (w[20], w[19], selector);
      w[23] = hc_byte_perm (w[19], w[18], selector);
      w[22] = hc_byte_perm (w[18], w[17], selector);
      w[21] = hc_byte_perm (w[17], w[16], selector);
      w[20] = hc_byte_perm (w[16], w[15], selector);
      w[19] = hc_byte_perm (w[15], w[14], selector);
      w[18] = hc_byte_perm (w[14], w[13], selector);
      w[17] = hc_byte_perm (w[13], w[12], selector);
      w[16] = hc_byte_perm (w[12], w[11], selector);
      w[15] = hc_byte_perm (w[11], w[10], selector);
      w[14] = hc_byte_perm (w[10], w[ 9], selector);
      w[13] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[12] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[11] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[10] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[ 9] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[ 8] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[ 7] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 6] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 5] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 4] = hc_byte_perm (w[ 0],     0, selector);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_byte_perm (w[58], w[57], selector);
      w[62] = hc_byte_perm (w[57], w[56], selector);
      w[61] = hc_byte_perm (w[56], w[55], selector);
      w[60] = hc_byte_perm (w[55], w[54], selector);
      w[59] = hc_byte_perm (w[54], w[53], selector);
      w[58] = hc_byte_perm (w[53], w[52], selector);
      w[57] = hc_byte_perm (w[52], w[51], selector);
      w[56] = hc_byte_perm (w[51], w[50], selector);
      w[55] = hc_byte_perm (w[50], w[49], selector);
      w[54] = hc_byte_perm (w[49], w[48], selector);
      w[53] = hc_byte_perm (w[48], w[47], selector);
      w[52] = hc_byte_perm (w[47], w[46], selector);
      w[51] = hc_byte_perm (w[46], w[45], selector);
      w[50] = hc_byte_perm (w[45], w[44], selector);
      w[49] = hc_byte_perm (w[44], w[43], selector);
      w[48] = hc_byte_perm (w[43], w[42], selector);
      w[47] = hc_byte_perm (w[42], w[41], selector);
      w[46] = hc_byte_perm (w[41], w[40], selector);
      w[45] = hc_byte_perm (w[40], w[39], selector);
      w[44] = hc_byte_perm (w[39], w[38], selector);
      w[43] = hc_byte_perm (w[38], w[37], selector);
      w[42] = hc_byte_perm (w[37], w[36], selector);
      w[41] = hc_byte_perm (w[36], w[35], selector);
      w[40] = hc_byte_perm (w[35], w[34], selector);
      w[39] = hc_byte_perm (w[34], w[33], selector);
      w[38] = hc_byte_perm (w[33], w[32], selector);
      w[37] = hc_byte_perm (w[32], w[31], selector);
      w[36] = hc_byte_perm (w[31], w[30], selector);
      w[35] = hc_byte_perm (w[30], w[29], selector);
      w[34] = hc_byte_perm (w[29], w[28], selector);
      w[33] = hc_byte_perm (w[28], w[27], selector);
      w[32] = hc_byte_perm (w[27], w[26], selector);
      w[31] = hc_byte_perm (w[26], w[25], selector);
      w[30] = hc_byte_perm (w[25], w[24], selector);
      w[29] = hc_byte_perm (w[24], w[23], selector);
      w[28] = hc_byte_perm (w[23], w[22], selector);
      w[27] = hc_byte_perm (w[22], w[21], selector);
      w[26] = hc_byte_perm (w[21], w[20], selector);
      w[25] = hc_byte_perm (w[20], w[19], selector);
      w[24] = hc_byte_perm (w[19], w[18], selector);
      w[23] = hc_byte_perm (w[18], w[17], selector);
      w[22] = hc_byte_perm (w[17], w[16], selector);
      w[21] = hc_byte_perm (w[16], w[15], selector);
      w[20] = hc_byte_perm (w[15], w[14], selector);
      w[19] = hc_byte_perm (w[14], w[13], selector);
      w[18] = hc_byte_perm (w[13], w[12], selector);
      w[17] = hc_byte_perm (w[12], w[11], selector);
      w[16] = hc_byte_perm (w[11], w[10], selector);
      w[15] = hc_byte_perm (w[10], w[ 9], selector);
      w[14] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[13] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[12] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[11] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[10] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[ 9] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[ 8] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 7] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 6] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 5] = hc_byte_perm (w[ 0],     0, selector);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_byte_perm (w[57], w[56], selector);
      w[62] = hc_byte_perm (w[56], w[55], selector);
      w[61] = hc_byte_perm (w[55], w[54], selector);
      w[60] = hc_byte_perm (w[54], w[53], selector);
      w[59] = hc_byte_perm (w[53], w[52], selector);
      w[58] = hc_byte_perm (w[52], w[51], selector);
      w[57] = hc_byte_perm (w[51], w[50], selector);
      w[56] = hc_byte_perm (w[50], w[49], selector);
      w[55] = hc_byte_perm (w[49], w[48], selector);
      w[54] = hc_byte_perm (w[48], w[47], selector);
      w[53] = hc_byte_perm (w[47], w[46], selector);
      w[52] = hc_byte_perm (w[46], w[45], selector);
      w[51] = hc_byte_perm (w[45], w[44], selector);
      w[50] = hc_byte_perm (w[44], w[43], selector);
      w[49] = hc_byte_perm (w[43], w[42], selector);
      w[48] = hc_byte_perm (w[42], w[41], selector);
      w[47] = hc_byte_perm (w[41], w[40], selector);
      w[46] = hc_byte_perm (w[40], w[39], selector);
      w[45] = hc_byte_perm (w[39], w[38], selector);
      w[44] = hc_byte_perm (w[38], w[37], selector);
      w[43] = hc_byte_perm (w[37], w[36], selector);
      w[42] = hc_byte_perm (w[36], w[35], selector);
      w[41] = hc_byte_perm (w[35], w[34], selector);
      w[40] = hc_byte_perm (w[34], w[33], selector);
      w[39] = hc_byte_perm (w[33], w[32], selector);
      w[38] = hc_byte_perm (w[32], w[31], selector);
      w[37] = hc_byte_perm (w[31], w[30], selector);
      w[36] = hc_byte_perm (w[30], w[29], selector);
      w[35] = hc_byte_perm (w[29], w[28], selector);
      w[34] = hc_byte_perm (w[28], w[27], selector);
      w[33] = hc_byte_perm (w[27], w[26], selector);
      w[32] = hc_byte_perm (w[26], w[25], selector);
      w[31] = hc_byte_perm (w[25], w[24], selector);
      w[30] = hc_byte_perm (w[24], w[23], selector);
      w[29] = hc_byte_perm (w[23], w[22], selector);
      w[28] = hc_byte_perm (w[22], w[21], selector);
      w[27] = hc_byte_perm (w[21], w[20], selector);
      w[26] = hc_byte_perm (w[20], w[19], selector);
      w[25] = hc_byte_perm (w[19], w[18], selector);
      w[24] = hc_byte_perm (w[18], w[17], selector);
      w[23] = hc_byte_perm (w[17], w[16], selector);
      w[22] = hc_byte_perm (w[16], w[15], selector);
      w[21] = hc_byte_perm (w[15], w[14], selector);
      w[20] = hc_byte_perm (w[14], w[13], selector);
      w[19] = hc_byte_perm (w[13], w[12], selector);
      w[18] = hc_byte_perm (w[12], w[11], selector);
      w[17] = hc_byte_perm (w[11], w[10], selector);
      w[16] = hc_byte_perm (w[10], w[ 9], selector);
      w[15] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[14] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[13] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[12] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[11] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[10] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[ 9] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 8] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 7] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 6] = hc_byte_perm (w[ 0],     0, selector);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_byte_perm (w[56], w[55], selector);
      w[62] = hc_byte_perm (w[55], w[54], selector);
      w[61] = hc_byte_perm (w[54], w[53], selector);
      w[60] = hc_byte_perm (w[53], w[52], selector);
      w[59] = hc_byte_perm (w[52], w[51], selector);
      w[58] = hc_byte_perm (w[51], w[50], selector);
      w[57] = hc_byte_perm (w[50], w[49], selector);
      w[56] = hc_byte_perm (w[49], w[48], selector);
      w[55] = hc_byte_perm (w[48], w[47], selector);
      w[54] = hc_byte_perm (w[47], w[46], selector);
      w[53] = hc_byte_perm (w[46], w[45], selector);
      w[52] = hc_byte_perm (w[45], w[44], selector);
      w[51] = hc_byte_perm (w[44], w[43], selector);
      w[50] = hc_byte_perm (w[43], w[42], selector);
      w[49] = hc_byte_perm (w[42], w[41], selector);
      w[48] = hc_byte_perm (w[41], w[40], selector);
      w[47] = hc_byte_perm (w[40], w[39], selector);
      w[46] = hc_byte_perm (w[39], w[38], selector);
      w[45] = hc_byte_perm (w[38], w[37], selector);
      w[44] = hc_byte_perm (w[37], w[36], selector);
      w[43] = hc_byte_perm (w[36], w[35], selector);
      w[42] = hc_byte_perm (w[35], w[34], selector);
      w[41] = hc_byte_perm (w[34], w[33], selector);
      w[40] = hc_byte_perm (w[33], w[32], selector);
      w[39] = hc_byte_perm (w[32], w[31], selector);
      w[38] = hc_byte_perm (w[31], w[30], selector);
      w[37] = hc_byte_perm (w[30], w[29], selector);
      w[36] = hc_byte_perm (w[29], w[28], selector);
      w[35] = hc_byte_perm (w[28], w[27], selector);
      w[34] = hc_byte_perm (w[27], w[26], selector);
      w[33] = hc_byte_perm (w[26], w[25], selector);
      w[32] = hc_byte_perm (w[25], w[24], selector);
      w[31] = hc_byte_perm (w[24], w[23], selector);
      w[30] = hc_byte_perm (w[23], w[22], selector);
      w[29] = hc_byte_perm (w[22], w[21], selector);
      w[28] = hc_byte_perm (w[21], w[20], selector);
      w[27] = hc_byte_perm (w[20], w[19], selector);
      w[26] = hc_byte_perm (w[19], w[18], selector);
      w[25] = hc_byte_perm (w[18], w[17], selector);
      w[24] = hc_byte_perm (w[17], w[16], selector);
      w[23] = hc_byte_perm (w[16], w[15], selector);
      w[22] = hc_byte_perm (w[15], w[14], selector);
      w[21] = hc_byte_perm (w[14], w[13], selector);
      w[20] = hc_byte_perm (w[13], w[12], selector);
      w[19] = hc_byte_perm (w[12], w[11], selector);
      w[18] = hc_byte_perm (w[11], w[10], selector);
      w[17] = hc_byte_perm (w[10], w[ 9], selector);
      w[16] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[15] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[14] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[13] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[12] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[11] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[10] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[ 9] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 8] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 7] = hc_byte_perm (w[ 0],     0, selector);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_byte_perm (w[55], w[54], selector);
      w[62] = hc_byte_perm (w[54], w[53], selector);
      w[61] = hc_byte_perm (w[53], w[52], selector);
      w[60] = hc_byte_perm (w[52], w[51], selector);
      w[59] = hc_byte_perm (w[51], w[50], selector);
      w[58] = hc_byte_perm (w[50], w[49], selector);
      w[57] = hc_byte_perm (w[49], w[48], selector);
      w[56] = hc_byte_perm (w[48], w[47], selector);
      w[55] = hc_byte_perm (w[47], w[46], selector);
      w[54] = hc_byte_perm (w[46], w[45], selector);
      w[53] = hc_byte_perm (w[45], w[44], selector);
      w[52] = hc_byte_perm (w[44], w[43], selector);
      w[51] = hc_byte_perm (w[43], w[42], selector);
      w[50] = hc_byte_perm (w[42], w[41], selector);
      w[49] = hc_byte_perm (w[41], w[40], selector);
      w[48] = hc_byte_perm (w[40], w[39], selector);
      w[47] = hc_byte_perm (w[39], w[38], selector);
      w[46] = hc_byte_perm (w[38], w[37], selector);
      w[45] = hc_byte_perm (w[37], w[36], selector);
      w[44] = hc_byte_perm (w[36], w[35], selector);
      w[43] = hc_byte_perm (w[35], w[34], selector);
      w[42] = hc_byte_perm (w[34], w[33], selector);
      w[41] = hc_byte_perm (w[33], w[32], selector);
      w[40] = hc_byte_perm (w[32], w[31], selector);
      w[39] = hc_byte_perm (w[31], w[30], selector);
      w[38] = hc_byte_perm (w[30], w[29], selector);
      w[37] = hc_byte_perm (w[29], w[28], selector);
      w[36] = hc_byte_perm (w[28], w[27], selector);
      w[35] = hc_byte_perm (w[27], w[26], selector);
      w[34] = hc_byte_perm (w[26], w[25], selector);
      w[33] = hc_byte_perm (w[25], w[24], selector);
      w[32] = hc_byte_perm (w[24], w[23], selector);
      w[31] = hc_byte_perm (w[23], w[22], selector);
      w[30] = hc_byte_perm (w[22], w[21], selector);
      w[29] = hc_byte_perm (w[21], w[20], selector);
      w[28] = hc_byte_perm (w[20], w[19], selector);
      w[27] = hc_byte_perm (w[19], w[18], selector);
      w[26] = hc_byte_perm (w[18], w[17], selector);
      w[25] = hc_byte_perm (w[17], w[16], selector);
      w[24] = hc_byte_perm (w[16], w[15], selector);
      w[23] = hc_byte_perm (w[15], w[14], selector);
      w[22] = hc_byte_perm (w[14], w[13], selector);
      w[21] = hc_byte_perm (w[13], w[12], selector);
      w[20] = hc_byte_perm (w[12], w[11], selector);
      w[19] = hc_byte_perm (w[11], w[10], selector);
      w[18] = hc_byte_perm (w[10], w[ 9], selector);
      w[17] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[16] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[15] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[14] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[13] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[12] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[11] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[10] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[ 9] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 8] = hc_byte_perm (w[ 0],     0, selector);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_byte_perm (w[54], w[53], selector);
      w[62] = hc_byte_perm (w[53], w[52], selector);
      w[61] = hc_byte_perm (w[52], w[51], selector);
      w[60] = hc_byte_perm (w[51], w[50], selector);
      w[59] = hc_byte_perm (w[50], w[49], selector);
      w[58] = hc_byte_perm (w[49], w[48], selector);
      w[57] = hc_byte_perm (w[48], w[47], selector);
      w[56] = hc_byte_perm (w[47], w[46], selector);
      w[55] = hc_byte_perm (w[46], w[45], selector);
      w[54] = hc_byte_perm (w[45], w[44], selector);
      w[53] = hc_byte_perm (w[44], w[43], selector);
      w[52] = hc_byte_perm (w[43], w[42], selector);
      w[51] = hc_byte_perm (w[42], w[41], selector);
      w[50] = hc_byte_perm (w[41], w[40], selector);
      w[49] = hc_byte_perm (w[40], w[39], selector);
      w[48] = hc_byte_perm (w[39], w[38], selector);
      w[47] = hc_byte_perm (w[38], w[37], selector);
      w[46] = hc_byte_perm (w[37], w[36], selector);
      w[45] = hc_byte_perm (w[36], w[35], selector);
      w[44] = hc_byte_perm (w[35], w[34], selector);
      w[43] = hc_byte_perm (w[34], w[33], selector);
      w[42] = hc_byte_perm (w[33], w[32], selector);
      w[41] = hc_byte_perm (w[32], w[31], selector);
      w[40] = hc_byte_perm (w[31], w[30], selector);
      w[39] = hc_byte_perm (w[30], w[29], selector);
      w[38] = hc_byte_perm (w[29], w[28], selector);
      w[37] = hc_byte_perm (w[28], w[27], selector);
      w[36] = hc_byte_perm (w[27], w[26], selector);
      w[35] = hc_byte_perm (w[26], w[25], selector);
      w[34] = hc_byte_perm (w[25], w[24], selector);
      w[33] = hc_byte_perm (w[24], w[23], selector);
      w[32] = hc_byte_perm (w[23], w[22], selector);
      w[31] = hc_byte_perm (w[22], w[21], selector);
      w[30] = hc_byte_perm (w[21], w[20], selector);
      w[29] = hc_byte_perm (w[20], w[19], selector);
      w[28] = hc_byte_perm (w[19], w[18], selector);
      w[27] = hc_byte_perm (w[18], w[17], selector);
      w[26] = hc_byte_perm (w[17], w[16], selector);
      w[25] = hc_byte_perm (w[16], w[15], selector);
      w[24] = hc_byte_perm (w[15], w[14], selector);
      w[23] = hc_byte_perm (w[14], w[13], selector);
      w[22] = hc_byte_perm (w[13], w[12], selector);
      w[21] = hc_byte_perm (w[12], w[11], selector);
      w[20] = hc_byte_perm (w[11], w[10], selector);
      w[19] = hc_byte_perm (w[10], w[ 9], selector);
      w[18] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[17] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[16] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[15] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[14] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[13] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[12] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[11] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[10] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[ 9] = hc_byte_perm (w[ 0],     0, selector);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_byte_perm (w[53], w[52], selector);
      w[62] = hc_byte_perm (w[52], w[51], selector);
      w[61] = hc_byte_perm (w[51], w[50], selector);
      w[60] = hc_byte_perm (w[50], w[49], selector);
      w[59] = hc_byte_perm (w[49], w[48], selector);
      w[58] = hc_byte_perm (w[48], w[47], selector);
      w[57] = hc_byte_perm (w[47], w[46], selector);
      w[56] = hc_byte_perm (w[46], w[45], selector);
      w[55] = hc_byte_perm (w[45], w[44], selector);
      w[54] = hc_byte_perm (w[44], w[43], selector);
      w[53] = hc_byte_perm (w[43], w[42], selector);
      w[52] = hc_byte_perm (w[42], w[41], selector);
      w[51] = hc_byte_perm (w[41], w[40], selector);
      w[50] = hc_byte_perm (w[40], w[39], selector);
      w[49] = hc_byte_perm (w[39], w[38], selector);
      w[48] = hc_byte_perm (w[38], w[37], selector);
      w[47] = hc_byte_perm (w[37], w[36], selector);
      w[46] = hc_byte_perm (w[36], w[35], selector);
      w[45] = hc_byte_perm (w[35], w[34], selector);
      w[44] = hc_byte_perm (w[34], w[33], selector);
      w[43] = hc_byte_perm (w[33], w[32], selector);
      w[42] = hc_byte_perm (w[32], w[31], selector);
      w[41] = hc_byte_perm (w[31], w[30], selector);
      w[40] = hc_byte_perm (w[30], w[29], selector);
      w[39] = hc_byte_perm (w[29], w[28], selector);
      w[38] = hc_byte_perm (w[28], w[27], selector);
      w[37] = hc_byte_perm (w[27], w[26], selector);
      w[36] = hc_byte_perm (w[26], w[25], selector);
      w[35] = hc_byte_perm (w[25], w[24], selector);
      w[34] = hc_byte_perm (w[24], w[23], selector);
      w[33] = hc_byte_perm (w[23], w[22], selector);
      w[32] = hc_byte_perm (w[22], w[21], selector);
      w[31] = hc_byte_perm (w[21], w[20], selector);
      w[30] = hc_byte_perm (w[20], w[19], selector);
      w[29] = hc_byte_perm (w[19], w[18], selector);
      w[28] = hc_byte_perm (w[18], w[17], selector);
      w[27] = hc_byte_perm (w[17], w[16], selector);
      w[26] = hc_byte_perm (w[16], w[15], selector);
      w[25] = hc_byte_perm (w[15], w[14], selector);
      w[24] = hc_byte_perm (w[14], w[13], selector);
      w[23] = hc_byte_perm (w[13], w[12], selector);
      w[22] = hc_byte_perm (w[12], w[11], selector);
      w[21] = hc_byte_perm (w[11], w[10], selector);
      w[20] = hc_byte_perm (w[10], w[ 9], selector);
      w[19] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[18] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[17] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[16] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[15] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[14] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[13] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[12] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[11] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[10] = hc_byte_perm (w[ 0],     0, selector);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_byte_perm (w[52], w[51], selector);
      w[62] = hc_byte_perm (w[51], w[50], selector);
      w[61] = hc_byte_perm (w[50], w[49], selector);
      w[60] = hc_byte_perm (w[49], w[48], selector);
      w[59] = hc_byte_perm (w[48], w[47], selector);
      w[58] = hc_byte_perm (w[47], w[46], selector);
      w[57] = hc_byte_perm (w[46], w[45], selector);
      w[56] = hc_byte_perm (w[45], w[44], selector);
      w[55] = hc_byte_perm (w[44], w[43], selector);
      w[54] = hc_byte_perm (w[43], w[42], selector);
      w[53] = hc_byte_perm (w[42], w[41], selector);
      w[52] = hc_byte_perm (w[41], w[40], selector);
      w[51] = hc_byte_perm (w[40], w[39], selector);
      w[50] = hc_byte_perm (w[39], w[38], selector);
      w[49] = hc_byte_perm (w[38], w[37], selector);
      w[48] = hc_byte_perm (w[37], w[36], selector);
      w[47] = hc_byte_perm (w[36], w[35], selector);
      w[46] = hc_byte_perm (w[35], w[34], selector);
      w[45] = hc_byte_perm (w[34], w[33], selector);
      w[44] = hc_byte_perm (w[33], w[32], selector);
      w[43] = hc_byte_perm (w[32], w[31], selector);
      w[42] = hc_byte_perm (w[31], w[30], selector);
      w[41] = hc_byte_perm (w[30], w[29], selector);
      w[40] = hc_byte_perm (w[29], w[28], selector);
      w[39] = hc_byte_perm (w[28], w[27], selector);
      w[38] = hc_byte_perm (w[27], w[26], selector);
      w[37] = hc_byte_perm (w[26], w[25], selector);
      w[36] = hc_byte_perm (w[25], w[24], selector);
      w[35] = hc_byte_perm (w[24], w[23], selector);
      w[34] = hc_byte_perm (w[23], w[22], selector);
      w[33] = hc_byte_perm (w[22], w[21], selector);
      w[32] = hc_byte_perm (w[21], w[20], selector);
      w[31] = hc_byte_perm (w[20], w[19], selector);
      w[30] = hc_byte_perm (w[19], w[18], selector);
      w[29] = hc_byte_perm (w[18], w[17], selector);
      w[28] = hc_byte_perm (w[17], w[16], selector);
      w[27] = hc_byte_perm (w[16], w[15], selector);
      w[26] = hc_byte_perm (w[15], w[14], selector);
      w[25] = hc_byte_perm (w[14], w[13], selector);
      w[24] = hc_byte_perm (w[13], w[12], selector);
      w[23] = hc_byte_perm (w[12], w[11], selector);
      w[22] = hc_byte_perm (w[11], w[10], selector);
      w[21] = hc_byte_perm (w[10], w[ 9], selector);
      w[20] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[19] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[18] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[17] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[16] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[15] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[14] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[13] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[12] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[11] = hc_byte_perm (w[ 0],     0, selector);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_byte_perm (w[51], w[50], selector);
      w[62] = hc_byte_perm (w[50], w[49], selector);
      w[61] = hc_byte_perm (w[49], w[48], selector);
      w[60] = hc_byte_perm (w[48], w[47], selector);
      w[59] = hc_byte_perm (w[47], w[46], selector);
      w[58] = hc_byte_perm (w[46], w[45], selector);
      w[57] = hc_byte_perm (w[45], w[44], selector);
      w[56] = hc_byte_perm (w[44], w[43], selector);
      w[55] = hc_byte_perm (w[43], w[42], selector);
      w[54] = hc_byte_perm (w[42], w[41], selector);
      w[53] = hc_byte_perm (w[41], w[40], selector);
      w[52] = hc_byte_perm (w[40], w[39], selector);
      w[51] = hc_byte_perm (w[39], w[38], selector);
      w[50] = hc_byte_perm (w[38], w[37], selector);
      w[49] = hc_byte_perm (w[37], w[36], selector);
      w[48] = hc_byte_perm (w[36], w[35], selector);
      w[47] = hc_byte_perm (w[35], w[34], selector);
      w[46] = hc_byte_perm (w[34], w[33], selector);
      w[45] = hc_byte_perm (w[33], w[32], selector);
      w[44] = hc_byte_perm (w[32], w[31], selector);
      w[43] = hc_byte_perm (w[31], w[30], selector);
      w[42] = hc_byte_perm (w[30], w[29], selector);
      w[41] = hc_byte_perm (w[29], w[28], selector);
      w[40] = hc_byte_perm (w[28], w[27], selector);
      w[39] = hc_byte_perm (w[27], w[26], selector);
      w[38] = hc_byte_perm (w[26], w[25], selector);
      w[37] = hc_byte_perm (w[25], w[24], selector);
      w[36] = hc_byte_perm (w[24], w[23], selector);
      w[35] = hc_byte_perm (w[23], w[22], selector);
      w[34] = hc_byte_perm (w[22], w[21], selector);
      w[33] = hc_byte_perm (w[21], w[20], selector);
      w[32] = hc_byte_perm (w[20], w[19], selector);
      w[31] = hc_byte_perm (w[19], w[18], selector);
      w[30] = hc_byte_perm (w[18], w[17], selector);
      w[29] = hc_byte_perm (w[17], w[16], selector);
      w[28] = hc_byte_perm (w[16], w[15], selector);
      w[27] = hc_byte_perm (w[15], w[14], selector);
      w[26] = hc_byte_perm (w[14], w[13], selector);
      w[25] = hc_byte_perm (w[13], w[12], selector);
      w[24] = hc_byte_perm (w[12], w[11], selector);
      w[23] = hc_byte_perm (w[11], w[10], selector);
      w[22] = hc_byte_perm (w[10], w[ 9], selector);
      w[21] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[20] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[19] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[18] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[17] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[16] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[15] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[14] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[13] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[12] = hc_byte_perm (w[ 0],     0, selector);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_byte_perm (w[50], w[49], selector);
      w[62] = hc_byte_perm (w[49], w[48], selector);
      w[61] = hc_byte_perm (w[48], w[47], selector);
      w[60] = hc_byte_perm (w[47], w[46], selector);
      w[59] = hc_byte_perm (w[46], w[45], selector);
      w[58] = hc_byte_perm (w[45], w[44], selector);
      w[57] = hc_byte_perm (w[44], w[43], selector);
      w[56] = hc_byte_perm (w[43], w[42], selector);
      w[55] = hc_byte_perm (w[42], w[41], selector);
      w[54] = hc_byte_perm (w[41], w[40], selector);
      w[53] = hc_byte_perm (w[40], w[39], selector);
      w[52] = hc_byte_perm (w[39], w[38], selector);
      w[51] = hc_byte_perm (w[38], w[37], selector);
      w[50] = hc_byte_perm (w[37], w[36], selector);
      w[49] = hc_byte_perm (w[36], w[35], selector);
      w[48] = hc_byte_perm (w[35], w[34], selector);
      w[47] = hc_byte_perm (w[34], w[33], selector);
      w[46] = hc_byte_perm (w[33], w[32], selector);
      w[45] = hc_byte_perm (w[32], w[31], selector);
      w[44] = hc_byte_perm (w[31], w[30], selector);
      w[43] = hc_byte_perm (w[30], w[29], selector);
      w[42] = hc_byte_perm (w[29], w[28], selector);
      w[41] = hc_byte_perm (w[28], w[27], selector);
      w[40] = hc_byte_perm (w[27], w[26], selector);
      w[39] = hc_byte_perm (w[26], w[25], selector);
      w[38] = hc_byte_perm (w[25], w[24], selector);
      w[37] = hc_byte_perm (w[24], w[23], selector);
      w[36] = hc_byte_perm (w[23], w[22], selector);
      w[35] = hc_byte_perm (w[22], w[21], selector);
      w[34] = hc_byte_perm (w[21], w[20], selector);
      w[33] = hc_byte_perm (w[20], w[19], selector);
      w[32] = hc_byte_perm (w[19], w[18], selector);
      w[31] = hc_byte_perm (w[18], w[17], selector);
      w[30] = hc_byte_perm (w[17], w[16], selector);
      w[29] = hc_byte_perm (w[16], w[15], selector);
      w[28] = hc_byte_perm (w[15], w[14], selector);
      w[27] = hc_byte_perm (w[14], w[13], selector);
      w[26] = hc_byte_perm (w[13], w[12], selector);
      w[25] = hc_byte_perm (w[12], w[11], selector);
      w[24] = hc_byte_perm (w[11], w[10], selector);
      w[23] = hc_byte_perm (w[10], w[ 9], selector);
      w[22] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[21] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[20] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[19] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[18] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[17] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[16] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[15] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[14] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[13] = hc_byte_perm (w[ 0],     0, selector);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_byte_perm (w[49], w[48], selector);
      w[62] = hc_byte_perm (w[48], w[47], selector);
      w[61] = hc_byte_perm (w[47], w[46], selector);
      w[60] = hc_byte_perm (w[46], w[45], selector);
      w[59] = hc_byte_perm (w[45], w[44], selector);
      w[58] = hc_byte_perm (w[44], w[43], selector);
      w[57] = hc_byte_perm (w[43], w[42], selector);
      w[56] = hc_byte_perm (w[42], w[41], selector);
      w[55] = hc_byte_perm (w[41], w[40], selector);
      w[54] = hc_byte_perm (w[40], w[39], selector);
      w[53] = hc_byte_perm (w[39], w[38], selector);
      w[52] = hc_byte_perm (w[38], w[37], selector);
      w[51] = hc_byte_perm (w[37], w[36], selector);
      w[50] = hc_byte_perm (w[36], w[35], selector);
      w[49] = hc_byte_perm (w[35], w[34], selector);
      w[48] = hc_byte_perm (w[34], w[33], selector);
      w[47] = hc_byte_perm (w[33], w[32], selector);
      w[46] = hc_byte_perm (w[32], w[31], selector);
      w[45] = hc_byte_perm (w[31], w[30], selector);
      w[44] = hc_byte_perm (w[30], w[29], selector);
      w[43] = hc_byte_perm (w[29], w[28], selector);
      w[42] = hc_byte_perm (w[28], w[27], selector);
      w[41] = hc_byte_perm (w[27], w[26], selector);
      w[40] = hc_byte_perm (w[26], w[25], selector);
      w[39] = hc_byte_perm (w[25], w[24], selector);
      w[38] = hc_byte_perm (w[24], w[23], selector);
      w[37] = hc_byte_perm (w[23], w[22], selector);
      w[36] = hc_byte_perm (w[22], w[21], selector);
      w[35] = hc_byte_perm (w[21], w[20], selector);
      w[34] = hc_byte_perm (w[20], w[19], selector);
      w[33] = hc_byte_perm (w[19], w[18], selector);
      w[32] = hc_byte_perm (w[18], w[17], selector);
      w[31] = hc_byte_perm (w[17], w[16], selector);
      w[30] = hc_byte_perm (w[16], w[15], selector);
      w[29] = hc_byte_perm (w[15], w[14], selector);
      w[28] = hc_byte_perm (w[14], w[13], selector);
      w[27] = hc_byte_perm (w[13], w[12], selector);
      w[26] = hc_byte_perm (w[12], w[11], selector);
      w[25] = hc_byte_perm (w[11], w[10], selector);
      w[24] = hc_byte_perm (w[10], w[ 9], selector);
      w[23] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[22] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[21] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[20] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[19] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[18] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[17] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[16] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[15] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[14] = hc_byte_perm (w[ 0],     0, selector);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_byte_perm (w[48], w[47], selector);
      w[62] = hc_byte_perm (w[47], w[46], selector);
      w[61] = hc_byte_perm (w[46], w[45], selector);
      w[60] = hc_byte_perm (w[45], w[44], selector);
      w[59] = hc_byte_perm (w[44], w[43], selector);
      w[58] = hc_byte_perm (w[43], w[42], selector);
      w[57] = hc_byte_perm (w[42], w[41], selector);
      w[56] = hc_byte_perm (w[41], w[40], selector);
      w[55] = hc_byte_perm (w[40], w[39], selector);
      w[54] = hc_byte_perm (w[39], w[38], selector);
      w[53] = hc_byte_perm (w[38], w[37], selector);
      w[52] = hc_byte_perm (w[37], w[36], selector);
      w[51] = hc_byte_perm (w[36], w[35], selector);
      w[50] = hc_byte_perm (w[35], w[34], selector);
      w[49] = hc_byte_perm (w[34], w[33], selector);
      w[48] = hc_byte_perm (w[33], w[32], selector);
      w[47] = hc_byte_perm (w[32], w[31], selector);
      w[46] = hc_byte_perm (w[31], w[30], selector);
      w[45] = hc_byte_perm (w[30], w[29], selector);
      w[44] = hc_byte_perm (w[29], w[28], selector);
      w[43] = hc_byte_perm (w[28], w[27], selector);
      w[42] = hc_byte_perm (w[27], w[26], selector);
      w[41] = hc_byte_perm (w[26], w[25], selector);
      w[40] = hc_byte_perm (w[25], w[24], selector);
      w[39] = hc_byte_perm (w[24], w[23], selector);
      w[38] = hc_byte_perm (w[23], w[22], selector);
      w[37] = hc_byte_perm (w[22], w[21], selector);
      w[36] = hc_byte_perm (w[21], w[20], selector);
      w[35] = hc_byte_perm (w[20], w[19], selector);
      w[34] = hc_byte_perm (w[19], w[18], selector);
      w[33] = hc_byte_perm (w[18], w[17], selector);
      w[32] = hc_byte_perm (w[17], w[16], selector);
      w[31] = hc_byte_perm (w[16], w[15], selector);
      w[30] = hc_byte_perm (w[15], w[14], selector);
      w[29] = hc_byte_perm (w[14], w[13], selector);
      w[28] = hc_byte_perm (w[13], w[12], selector);
      w[27] = hc_byte_perm (w[12], w[11], selector);
      w[26] = hc_byte_perm (w[11], w[10], selector);
      w[25] = hc_byte_perm (w[10], w[ 9], selector);
      w[24] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[23] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[22] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[21] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[20] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[19] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[18] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[17] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[16] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[15] = hc_byte_perm (w[ 0],     0, selector);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_byte_perm (w[47], w[46], selector);
      w[62] = hc_byte_perm (w[46], w[45], selector);
      w[61] = hc_byte_perm (w[45], w[44], selector);
      w[60] = hc_byte_perm (w[44], w[43], selector);
      w[59] = hc_byte_perm (w[43], w[42], selector);
      w[58] = hc_byte_perm (w[42], w[41], selector);
      w[57] = hc_byte_perm (w[41], w[40], selector);
      w[56] = hc_byte_perm (w[40], w[39], selector);
      w[55] = hc_byte_perm (w[39], w[38], selector);
      w[54] = hc_byte_perm (w[38], w[37], selector);
      w[53] = hc_byte_perm (w[37], w[36], selector);
      w[52] = hc_byte_perm (w[36], w[35], selector);
      w[51] = hc_byte_perm (w[35], w[34], selector);
      w[50] = hc_byte_perm (w[34], w[33], selector);
      w[49] = hc_byte_perm (w[33], w[32], selector);
      w[48] = hc_byte_perm (w[32], w[31], selector);
      w[47] = hc_byte_perm (w[31], w[30], selector);
      w[46] = hc_byte_perm (w[30], w[29], selector);
      w[45] = hc_byte_perm (w[29], w[28], selector);
      w[44] = hc_byte_perm (w[28], w[27], selector);
      w[43] = hc_byte_perm (w[27], w[26], selector);
      w[42] = hc_byte_perm (w[26], w[25], selector);
      w[41] = hc_byte_perm (w[25], w[24], selector);
      w[40] = hc_byte_perm (w[24], w[23], selector);
      w[39] = hc_byte_perm (w[23], w[22], selector);
      w[38] = hc_byte_perm (w[22], w[21], selector);
      w[37] = hc_byte_perm (w[21], w[20], selector);
      w[36] = hc_byte_perm (w[20], w[19], selector);
      w[35] = hc_byte_perm (w[19], w[18], selector);
      w[34] = hc_byte_perm (w[18], w[17], selector);
      w[33] = hc_byte_perm (w[17], w[16], selector);
      w[32] = hc_byte_perm (w[16], w[15], selector);
      w[31] = hc_byte_perm (w[15], w[14], selector);
      w[30] = hc_byte_perm (w[14], w[13], selector);
      w[29] = hc_byte_perm (w[13], w[12], selector);
      w[28] = hc_byte_perm (w[12], w[11], selector);
      w[27] = hc_byte_perm (w[11], w[10], selector);
      w[26] = hc_byte_perm (w[10], w[ 9], selector);
      w[25] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[24] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[23] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[22] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[21] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[20] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[19] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[18] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[17] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[16] = hc_byte_perm (w[ 0],     0, selector);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_byte_perm (w[46], w[45], selector);
      w[62] = hc_byte_perm (w[45], w[44], selector);
      w[61] = hc_byte_perm (w[44], w[43], selector);
      w[60] = hc_byte_perm (w[43], w[42], selector);
      w[59] = hc_byte_perm (w[42], w[41], selector);
      w[58] = hc_byte_perm (w[41], w[40], selector);
      w[57] = hc_byte_perm (w[40], w[39], selector);
      w[56] = hc_byte_perm (w[39], w[38], selector);
      w[55] = hc_byte_perm (w[38], w[37], selector);
      w[54] = hc_byte_perm (w[37], w[36], selector);
      w[53] = hc_byte_perm (w[36], w[35], selector);
      w[52] = hc_byte_perm (w[35], w[34], selector);
      w[51] = hc_byte_perm (w[34], w[33], selector);
      w[50] = hc_byte_perm (w[33], w[32], selector);
      w[49] = hc_byte_perm (w[32], w[31], selector);
      w[48] = hc_byte_perm (w[31], w[30], selector);
      w[47] = hc_byte_perm (w[30], w[29], selector);
      w[46] = hc_byte_perm (w[29], w[28], selector);
      w[45] = hc_byte_perm (w[28], w[27], selector);
      w[44] = hc_byte_perm (w[27], w[26], selector);
      w[43] = hc_byte_perm (w[26], w[25], selector);
      w[42] = hc_byte_perm (w[25], w[24], selector);
      w[41] = hc_byte_perm (w[24], w[23], selector);
      w[40] = hc_byte_perm (w[23], w[22], selector);
      w[39] = hc_byte_perm (w[22], w[21], selector);
      w[38] = hc_byte_perm (w[21], w[20], selector);
      w[37] = hc_byte_perm (w[20], w[19], selector);
      w[36] = hc_byte_perm (w[19], w[18], selector);
      w[35] = hc_byte_perm (w[18], w[17], selector);
      w[34] = hc_byte_perm (w[17], w[16], selector);
      w[33] = hc_byte_perm (w[16], w[15], selector);
      w[32] = hc_byte_perm (w[15], w[14], selector);
      w[31] = hc_byte_perm (w[14], w[13], selector);
      w[30] = hc_byte_perm (w[13], w[12], selector);
      w[29] = hc_byte_perm (w[12], w[11], selector);
      w[28] = hc_byte_perm (w[11], w[10], selector);
      w[27] = hc_byte_perm (w[10], w[ 9], selector);
      w[26] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[25] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[24] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[23] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[22] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[21] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[20] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[19] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[18] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[17] = hc_byte_perm (w[ 0],     0, selector);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_byte_perm (w[45], w[44], selector);
      w[62] = hc_byte_perm (w[44], w[43], selector);
      w[61] = hc_byte_perm (w[43], w[42], selector);
      w[60] = hc_byte_perm (w[42], w[41], selector);
      w[59] = hc_byte_perm (w[41], w[40], selector);
      w[58] = hc_byte_perm (w[40], w[39], selector);
      w[57] = hc_byte_perm (w[39], w[38], selector);
      w[56] = hc_byte_perm (w[38], w[37], selector);
      w[55] = hc_byte_perm (w[37], w[36], selector);
      w[54] = hc_byte_perm (w[36], w[35], selector);
      w[53] = hc_byte_perm (w[35], w[34], selector);
      w[52] = hc_byte_perm (w[34], w[33], selector);
      w[51] = hc_byte_perm (w[33], w[32], selector);
      w[50] = hc_byte_perm (w[32], w[31], selector);
      w[49] = hc_byte_perm (w[31], w[30], selector);
      w[48] = hc_byte_perm (w[30], w[29], selector);
      w[47] = hc_byte_perm (w[29], w[28], selector);
      w[46] = hc_byte_perm (w[28], w[27], selector);
      w[45] = hc_byte_perm (w[27], w[26], selector);
      w[44] = hc_byte_perm (w[26], w[25], selector);
      w[43] = hc_byte_perm (w[25], w[24], selector);
      w[42] = hc_byte_perm (w[24], w[23], selector);
      w[41] = hc_byte_perm (w[23], w[22], selector);
      w[40] = hc_byte_perm (w[22], w[21], selector);
      w[39] = hc_byte_perm (w[21], w[20], selector);
      w[38] = hc_byte_perm (w[20], w[19], selector);
      w[37] = hc_byte_perm (w[19], w[18], selector);
      w[36] = hc_byte_perm (w[18], w[17], selector);
      w[35] = hc_byte_perm (w[17], w[16], selector);
      w[34] = hc_byte_perm (w[16], w[15], selector);
      w[33] = hc_byte_perm (w[15], w[14], selector);
      w[32] = hc_byte_perm (w[14], w[13], selector);
      w[31] = hc_byte_perm (w[13], w[12], selector);
      w[30] = hc_byte_perm (w[12], w[11], selector);
      w[29] = hc_byte_perm (w[11], w[10], selector);
      w[28] = hc_byte_perm (w[10], w[ 9], selector);
      w[27] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[26] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[25] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[24] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[23] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[22] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[21] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[20] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[19] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[18] = hc_byte_perm (w[ 0],     0, selector);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_byte_perm (w[44], w[43], selector);
      w[62] = hc_byte_perm (w[43], w[42], selector);
      w[61] = hc_byte_perm (w[42], w[41], selector);
      w[60] = hc_byte_perm (w[41], w[40], selector);
      w[59] = hc_byte_perm (w[40], w[39], selector);
      w[58] = hc_byte_perm (w[39], w[38], selector);
      w[57] = hc_byte_perm (w[38], w[37], selector);
      w[56] = hc_byte_perm (w[37], w[36], selector);
      w[55] = hc_byte_perm (w[36], w[35], selector);
      w[54] = hc_byte_perm (w[35], w[34], selector);
      w[53] = hc_byte_perm (w[34], w[33], selector);
      w[52] = hc_byte_perm (w[33], w[32], selector);
      w[51] = hc_byte_perm (w[32], w[31], selector);
      w[50] = hc_byte_perm (w[31], w[30], selector);
      w[49] = hc_byte_perm (w[30], w[29], selector);
      w[48] = hc_byte_perm (w[29], w[28], selector);
      w[47] = hc_byte_perm (w[28], w[27], selector);
      w[46] = hc_byte_perm (w[27], w[26], selector);
      w[45] = hc_byte_perm (w[26], w[25], selector);
      w[44] = hc_byte_perm (w[25], w[24], selector);
      w[43] = hc_byte_perm (w[24], w[23], selector);
      w[42] = hc_byte_perm (w[23], w[22], selector);
      w[41] = hc_byte_perm (w[22], w[21], selector);
      w[40] = hc_byte_perm (w[21], w[20], selector);
      w[39] = hc_byte_perm (w[20], w[19], selector);
      w[38] = hc_byte_perm (w[19], w[18], selector);
      w[37] = hc_byte_perm (w[18], w[17], selector);
      w[36] = hc_byte_perm (w[17], w[16], selector);
      w[35] = hc_byte_perm (w[16], w[15], selector);
      w[34] = hc_byte_perm (w[15], w[14], selector);
      w[33] = hc_byte_perm (w[14], w[13], selector);
      w[32] = hc_byte_perm (w[13], w[12], selector);
      w[31] = hc_byte_perm (w[12], w[11], selector);
      w[30] = hc_byte_perm (w[11], w[10], selector);
      w[29] = hc_byte_perm (w[10], w[ 9], selector);
      w[28] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[27] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[26] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[25] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[24] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[23] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[22] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[21] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[20] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[19] = hc_byte_perm (w[ 0],     0, selector);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_byte_perm (w[43], w[42], selector);
      w[62] = hc_byte_perm (w[42], w[41], selector);
      w[61] = hc_byte_perm (w[41], w[40], selector);
      w[60] = hc_byte_perm (w[40], w[39], selector);
      w[59] = hc_byte_perm (w[39], w[38], selector);
      w[58] = hc_byte_perm (w[38], w[37], selector);
      w[57] = hc_byte_perm (w[37], w[36], selector);
      w[56] = hc_byte_perm (w[36], w[35], selector);
      w[55] = hc_byte_perm (w[35], w[34], selector);
      w[54] = hc_byte_perm (w[34], w[33], selector);
      w[53] = hc_byte_perm (w[33], w[32], selector);
      w[52] = hc_byte_perm (w[32], w[31], selector);
      w[51] = hc_byte_perm (w[31], w[30], selector);
      w[50] = hc_byte_perm (w[30], w[29], selector);
      w[49] = hc_byte_perm (w[29], w[28], selector);
      w[48] = hc_byte_perm (w[28], w[27], selector);
      w[47] = hc_byte_perm (w[27], w[26], selector);
      w[46] = hc_byte_perm (w[26], w[25], selector);
      w[45] = hc_byte_perm (w[25], w[24], selector);
      w[44] = hc_byte_perm (w[24], w[23], selector);
      w[43] = hc_byte_perm (w[23], w[22], selector);
      w[42] = hc_byte_perm (w[22], w[21], selector);
      w[41] = hc_byte_perm (w[21], w[20], selector);
      w[40] = hc_byte_perm (w[20], w[19], selector);
      w[39] = hc_byte_perm (w[19], w[18], selector);
      w[38] = hc_byte_perm (w[18], w[17], selector);
      w[37] = hc_byte_perm (w[17], w[16], selector);
      w[36] = hc_byte_perm (w[16], w[15], selector);
      w[35] = hc_byte_perm (w[15], w[14], selector);
      w[34] = hc_byte_perm (w[14], w[13], selector);
      w[33] = hc_byte_perm (w[13], w[12], selector);
      w[32] = hc_byte_perm (w[12], w[11], selector);
      w[31] = hc_byte_perm (w[11], w[10], selector);
      w[30] = hc_byte_perm (w[10], w[ 9], selector);
      w[29] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[28] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[27] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[26] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[25] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[24] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[23] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[22] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[21] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[20] = hc_byte_perm (w[ 0],     0, selector);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_byte_perm (w[42], w[41], selector);
      w[62] = hc_byte_perm (w[41], w[40], selector);
      w[61] = hc_byte_perm (w[40], w[39], selector);
      w[60] = hc_byte_perm (w[39], w[38], selector);
      w[59] = hc_byte_perm (w[38], w[37], selector);
      w[58] = hc_byte_perm (w[37], w[36], selector);
      w[57] = hc_byte_perm (w[36], w[35], selector);
      w[56] = hc_byte_perm (w[35], w[34], selector);
      w[55] = hc_byte_perm (w[34], w[33], selector);
      w[54] = hc_byte_perm (w[33], w[32], selector);
      w[53] = hc_byte_perm (w[32], w[31], selector);
      w[52] = hc_byte_perm (w[31], w[30], selector);
      w[51] = hc_byte_perm (w[30], w[29], selector);
      w[50] = hc_byte_perm (w[29], w[28], selector);
      w[49] = hc_byte_perm (w[28], w[27], selector);
      w[48] = hc_byte_perm (w[27], w[26], selector);
      w[47] = hc_byte_perm (w[26], w[25], selector);
      w[46] = hc_byte_perm (w[25], w[24], selector);
      w[45] = hc_byte_perm (w[24], w[23], selector);
      w[44] = hc_byte_perm (w[23], w[22], selector);
      w[43] = hc_byte_perm (w[22], w[21], selector);
      w[42] = hc_byte_perm (w[21], w[20], selector);
      w[41] = hc_byte_perm (w[20], w[19], selector);
      w[40] = hc_byte_perm (w[19], w[18], selector);
      w[39] = hc_byte_perm (w[18], w[17], selector);
      w[38] = hc_byte_perm (w[17], w[16], selector);
      w[37] = hc_byte_perm (w[16], w[15], selector);
      w[36] = hc_byte_perm (w[15], w[14], selector);
      w[35] = hc_byte_perm (w[14], w[13], selector);
      w[34] = hc_byte_perm (w[13], w[12], selector);
      w[33] = hc_byte_perm (w[12], w[11], selector);
      w[32] = hc_byte_perm (w[11], w[10], selector);
      w[31] = hc_byte_perm (w[10], w[ 9], selector);
      w[30] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[29] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[28] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[27] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[26] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[25] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[24] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[23] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[22] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[21] = hc_byte_perm (w[ 0],     0, selector);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_byte_perm (w[41], w[40], selector);
      w[62] = hc_byte_perm (w[40], w[39], selector);
      w[61] = hc_byte_perm (w[39], w[38], selector);
      w[60] = hc_byte_perm (w[38], w[37], selector);
      w[59] = hc_byte_perm (w[37], w[36], selector);
      w[58] = hc_byte_perm (w[36], w[35], selector);
      w[57] = hc_byte_perm (w[35], w[34], selector);
      w[56] = hc_byte_perm (w[34], w[33], selector);
      w[55] = hc_byte_perm (w[33], w[32], selector);
      w[54] = hc_byte_perm (w[32], w[31], selector);
      w[53] = hc_byte_perm (w[31], w[30], selector);
      w[52] = hc_byte_perm (w[30], w[29], selector);
      w[51] = hc_byte_perm (w[29], w[28], selector);
      w[50] = hc_byte_perm (w[28], w[27], selector);
      w[49] = hc_byte_perm (w[27], w[26], selector);
      w[48] = hc_byte_perm (w[26], w[25], selector);
      w[47] = hc_byte_perm (w[25], w[24], selector);
      w[46] = hc_byte_perm (w[24], w[23], selector);
      w[45] = hc_byte_perm (w[23], w[22], selector);
      w[44] = hc_byte_perm (w[22], w[21], selector);
      w[43] = hc_byte_perm (w[21], w[20], selector);
      w[42] = hc_byte_perm (w[20], w[19], selector);
      w[41] = hc_byte_perm (w[19], w[18], selector);
      w[40] = hc_byte_perm (w[18], w[17], selector);
      w[39] = hc_byte_perm (w[17], w[16], selector);
      w[38] = hc_byte_perm (w[16], w[15], selector);
      w[37] = hc_byte_perm (w[15], w[14], selector);
      w[36] = hc_byte_perm (w[14], w[13], selector);
      w[35] = hc_byte_perm (w[13], w[12], selector);
      w[34] = hc_byte_perm (w[12], w[11], selector);
      w[33] = hc_byte_perm (w[11], w[10], selector);
      w[32] = hc_byte_perm (w[10], w[ 9], selector);
      w[31] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[30] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[29] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[28] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[27] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[26] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[25] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[24] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[23] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[22] = hc_byte_perm (w[ 0],     0, selector);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_byte_perm (w[40], w[39], selector);
      w[62] = hc_byte_perm (w[39], w[38], selector);
      w[61] = hc_byte_perm (w[38], w[37], selector);
      w[60] = hc_byte_perm (w[37], w[36], selector);
      w[59] = hc_byte_perm (w[36], w[35], selector);
      w[58] = hc_byte_perm (w[35], w[34], selector);
      w[57] = hc_byte_perm (w[34], w[33], selector);
      w[56] = hc_byte_perm (w[33], w[32], selector);
      w[55] = hc_byte_perm (w[32], w[31], selector);
      w[54] = hc_byte_perm (w[31], w[30], selector);
      w[53] = hc_byte_perm (w[30], w[29], selector);
      w[52] = hc_byte_perm (w[29], w[28], selector);
      w[51] = hc_byte_perm (w[28], w[27], selector);
      w[50] = hc_byte_perm (w[27], w[26], selector);
      w[49] = hc_byte_perm (w[26], w[25], selector);
      w[48] = hc_byte_perm (w[25], w[24], selector);
      w[47] = hc_byte_perm (w[24], w[23], selector);
      w[46] = hc_byte_perm (w[23], w[22], selector);
      w[45] = hc_byte_perm (w[22], w[21], selector);
      w[44] = hc_byte_perm (w[21], w[20], selector);
      w[43] = hc_byte_perm (w[20], w[19], selector);
      w[42] = hc_byte_perm (w[19], w[18], selector);
      w[41] = hc_byte_perm (w[18], w[17], selector);
      w[40] = hc_byte_perm (w[17], w[16], selector);
      w[39] = hc_byte_perm (w[16], w[15], selector);
      w[38] = hc_byte_perm (w[15], w[14], selector);
      w[37] = hc_byte_perm (w[14], w[13], selector);
      w[36] = hc_byte_perm (w[13], w[12], selector);
      w[35] = hc_byte_perm (w[12], w[11], selector);
      w[34] = hc_byte_perm (w[11], w[10], selector);
      w[33] = hc_byte_perm (w[10], w[ 9], selector);
      w[32] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[31] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[30] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[29] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[28] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[27] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[26] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[25] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[24] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[23] = hc_byte_perm (w[ 0],     0, selector);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_byte_perm (w[39], w[38], selector);
      w[62] = hc_byte_perm (w[38], w[37], selector);
      w[61] = hc_byte_perm (w[37], w[36], selector);
      w[60] = hc_byte_perm (w[36], w[35], selector);
      w[59] = hc_byte_perm (w[35], w[34], selector);
      w[58] = hc_byte_perm (w[34], w[33], selector);
      w[57] = hc_byte_perm (w[33], w[32], selector);
      w[56] = hc_byte_perm (w[32], w[31], selector);
      w[55] = hc_byte_perm (w[31], w[30], selector);
      w[54] = hc_byte_perm (w[30], w[29], selector);
      w[53] = hc_byte_perm (w[29], w[28], selector);
      w[52] = hc_byte_perm (w[28], w[27], selector);
      w[51] = hc_byte_perm (w[27], w[26], selector);
      w[50] = hc_byte_perm (w[26], w[25], selector);
      w[49] = hc_byte_perm (w[25], w[24], selector);
      w[48] = hc_byte_perm (w[24], w[23], selector);
      w[47] = hc_byte_perm (w[23], w[22], selector);
      w[46] = hc_byte_perm (w[22], w[21], selector);
      w[45] = hc_byte_perm (w[21], w[20], selector);
      w[44] = hc_byte_perm (w[20], w[19], selector);
      w[43] = hc_byte_perm (w[19], w[18], selector);
      w[42] = hc_byte_perm (w[18], w[17], selector);
      w[41] = hc_byte_perm (w[17], w[16], selector);
      w[40] = hc_byte_perm (w[16], w[15], selector);
      w[39] = hc_byte_perm (w[15], w[14], selector);
      w[38] = hc_byte_perm (w[14], w[13], selector);
      w[37] = hc_byte_perm (w[13], w[12], selector);
      w[36] = hc_byte_perm (w[12], w[11], selector);
      w[35] = hc_byte_perm (w[11], w[10], selector);
      w[34] = hc_byte_perm (w[10], w[ 9], selector);
      w[33] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[32] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[31] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[30] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[29] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[28] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[27] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[26] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[25] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[24] = hc_byte_perm (w[ 0],     0, selector);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_byte_perm (w[38], w[37], selector);
      w[62] = hc_byte_perm (w[37], w[36], selector);
      w[61] = hc_byte_perm (w[36], w[35], selector);
      w[60] = hc_byte_perm (w[35], w[34], selector);
      w[59] = hc_byte_perm (w[34], w[33], selector);
      w[58] = hc_byte_perm (w[33], w[32], selector);
      w[57] = hc_byte_perm (w[32], w[31], selector);
      w[56] = hc_byte_perm (w[31], w[30], selector);
      w[55] = hc_byte_perm (w[30], w[29], selector);
      w[54] = hc_byte_perm (w[29], w[28], selector);
      w[53] = hc_byte_perm (w[28], w[27], selector);
      w[52] = hc_byte_perm (w[27], w[26], selector);
      w[51] = hc_byte_perm (w[26], w[25], selector);
      w[50] = hc_byte_perm (w[25], w[24], selector);
      w[49] = hc_byte_perm (w[24], w[23], selector);
      w[48] = hc_byte_perm (w[23], w[22], selector);
      w[47] = hc_byte_perm (w[22], w[21], selector);
      w[46] = hc_byte_perm (w[21], w[20], selector);
      w[45] = hc_byte_perm (w[20], w[19], selector);
      w[44] = hc_byte_perm (w[19], w[18], selector);
      w[43] = hc_byte_perm (w[18], w[17], selector);
      w[42] = hc_byte_perm (w[17], w[16], selector);
      w[41] = hc_byte_perm (w[16], w[15], selector);
      w[40] = hc_byte_perm (w[15], w[14], selector);
      w[39] = hc_byte_perm (w[14], w[13], selector);
      w[38] = hc_byte_perm (w[13], w[12], selector);
      w[37] = hc_byte_perm (w[12], w[11], selector);
      w[36] = hc_byte_perm (w[11], w[10], selector);
      w[35] = hc_byte_perm (w[10], w[ 9], selector);
      w[34] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[33] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[32] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[31] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[30] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[29] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[28] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[27] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[26] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[25] = hc_byte_perm (w[ 0],     0, selector);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_byte_perm (w[37], w[36], selector);
      w[62] = hc_byte_perm (w[36], w[35], selector);
      w[61] = hc_byte_perm (w[35], w[34], selector);
      w[60] = hc_byte_perm (w[34], w[33], selector);
      w[59] = hc_byte_perm (w[33], w[32], selector);
      w[58] = hc_byte_perm (w[32], w[31], selector);
      w[57] = hc_byte_perm (w[31], w[30], selector);
      w[56] = hc_byte_perm (w[30], w[29], selector);
      w[55] = hc_byte_perm (w[29], w[28], selector);
      w[54] = hc_byte_perm (w[28], w[27], selector);
      w[53] = hc_byte_perm (w[27], w[26], selector);
      w[52] = hc_byte_perm (w[26], w[25], selector);
      w[51] = hc_byte_perm (w[25], w[24], selector);
      w[50] = hc_byte_perm (w[24], w[23], selector);
      w[49] = hc_byte_perm (w[23], w[22], selector);
      w[48] = hc_byte_perm (w[22], w[21], selector);
      w[47] = hc_byte_perm (w[21], w[20], selector);
      w[46] = hc_byte_perm (w[20], w[19], selector);
      w[45] = hc_byte_perm (w[19], w[18], selector);
      w[44] = hc_byte_perm (w[18], w[17], selector);
      w[43] = hc_byte_perm (w[17], w[16], selector);
      w[42] = hc_byte_perm (w[16], w[15], selector);
      w[41] = hc_byte_perm (w[15], w[14], selector);
      w[40] = hc_byte_perm (w[14], w[13], selector);
      w[39] = hc_byte_perm (w[13], w[12], selector);
      w[38] = hc_byte_perm (w[12], w[11], selector);
      w[37] = hc_byte_perm (w[11], w[10], selector);
      w[36] = hc_byte_perm (w[10], w[ 9], selector);
      w[35] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[34] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[33] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[32] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[31] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[30] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[29] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[28] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[27] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[26] = hc_byte_perm (w[ 0],     0, selector);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_byte_perm (w[36], w[35], selector);
      w[62] = hc_byte_perm (w[35], w[34], selector);
      w[61] = hc_byte_perm (w[34], w[33], selector);
      w[60] = hc_byte_perm (w[33], w[32], selector);
      w[59] = hc_byte_perm (w[32], w[31], selector);
      w[58] = hc_byte_perm (w[31], w[30], selector);
      w[57] = hc_byte_perm (w[30], w[29], selector);
      w[56] = hc_byte_perm (w[29], w[28], selector);
      w[55] = hc_byte_perm (w[28], w[27], selector);
      w[54] = hc_byte_perm (w[27], w[26], selector);
      w[53] = hc_byte_perm (w[26], w[25], selector);
      w[52] = hc_byte_perm (w[25], w[24], selector);
      w[51] = hc_byte_perm (w[24], w[23], selector);
      w[50] = hc_byte_perm (w[23], w[22], selector);
      w[49] = hc_byte_perm (w[22], w[21], selector);
      w[48] = hc_byte_perm (w[21], w[20], selector);
      w[47] = hc_byte_perm (w[20], w[19], selector);
      w[46] = hc_byte_perm (w[19], w[18], selector);
      w[45] = hc_byte_perm (w[18], w[17], selector);
      w[44] = hc_byte_perm (w[17], w[16], selector);
      w[43] = hc_byte_perm (w[16], w[15], selector);
      w[42] = hc_byte_perm (w[15], w[14], selector);
      w[41] = hc_byte_perm (w[14], w[13], selector);
      w[40] = hc_byte_perm (w[13], w[12], selector);
      w[39] = hc_byte_perm (w[12], w[11], selector);
      w[38] = hc_byte_perm (w[11], w[10], selector);
      w[37] = hc_byte_perm (w[10], w[ 9], selector);
      w[36] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[35] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[34] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[33] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[32] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[31] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[30] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[29] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[28] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[27] = hc_byte_perm (w[ 0],     0, selector);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_byte_perm (w[35], w[34], selector);
      w[62] = hc_byte_perm (w[34], w[33], selector);
      w[61] = hc_byte_perm (w[33], w[32], selector);
      w[60] = hc_byte_perm (w[32], w[31], selector);
      w[59] = hc_byte_perm (w[31], w[30], selector);
      w[58] = hc_byte_perm (w[30], w[29], selector);
      w[57] = hc_byte_perm (w[29], w[28], selector);
      w[56] = hc_byte_perm (w[28], w[27], selector);
      w[55] = hc_byte_perm (w[27], w[26], selector);
      w[54] = hc_byte_perm (w[26], w[25], selector);
      w[53] = hc_byte_perm (w[25], w[24], selector);
      w[52] = hc_byte_perm (w[24], w[23], selector);
      w[51] = hc_byte_perm (w[23], w[22], selector);
      w[50] = hc_byte_perm (w[22], w[21], selector);
      w[49] = hc_byte_perm (w[21], w[20], selector);
      w[48] = hc_byte_perm (w[20], w[19], selector);
      w[47] = hc_byte_perm (w[19], w[18], selector);
      w[46] = hc_byte_perm (w[18], w[17], selector);
      w[45] = hc_byte_perm (w[17], w[16], selector);
      w[44] = hc_byte_perm (w[16], w[15], selector);
      w[43] = hc_byte_perm (w[15], w[14], selector);
      w[42] = hc_byte_perm (w[14], w[13], selector);
      w[41] = hc_byte_perm (w[13], w[12], selector);
      w[40] = hc_byte_perm (w[12], w[11], selector);
      w[39] = hc_byte_perm (w[11], w[10], selector);
      w[38] = hc_byte_perm (w[10], w[ 9], selector);
      w[37] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[36] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[35] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[34] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[33] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[32] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[31] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[30] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[29] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[28] = hc_byte_perm (w[ 0],     0, selector);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_byte_perm (w[34], w[33], selector);
      w[62] = hc_byte_perm (w[33], w[32], selector);
      w[61] = hc_byte_perm (w[32], w[31], selector);
      w[60] = hc_byte_perm (w[31], w[30], selector);
      w[59] = hc_byte_perm (w[30], w[29], selector);
      w[58] = hc_byte_perm (w[29], w[28], selector);
      w[57] = hc_byte_perm (w[28], w[27], selector);
      w[56] = hc_byte_perm (w[27], w[26], selector);
      w[55] = hc_byte_perm (w[26], w[25], selector);
      w[54] = hc_byte_perm (w[25], w[24], selector);
      w[53] = hc_byte_perm (w[24], w[23], selector);
      w[52] = hc_byte_perm (w[23], w[22], selector);
      w[51] = hc_byte_perm (w[22], w[21], selector);
      w[50] = hc_byte_perm (w[21], w[20], selector);
      w[49] = hc_byte_perm (w[20], w[19], selector);
      w[48] = hc_byte_perm (w[19], w[18], selector);
      w[47] = hc_byte_perm (w[18], w[17], selector);
      w[46] = hc_byte_perm (w[17], w[16], selector);
      w[45] = hc_byte_perm (w[16], w[15], selector);
      w[44] = hc_byte_perm (w[15], w[14], selector);
      w[43] = hc_byte_perm (w[14], w[13], selector);
      w[42] = hc_byte_perm (w[13], w[12], selector);
      w[41] = hc_byte_perm (w[12], w[11], selector);
      w[40] = hc_byte_perm (w[11], w[10], selector);
      w[39] = hc_byte_perm (w[10], w[ 9], selector);
      w[38] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[37] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[36] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[35] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[34] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[33] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[32] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[31] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[30] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[29] = hc_byte_perm (w[ 0],     0, selector);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_byte_perm (w[33], w[32], selector);
      w[62] = hc_byte_perm (w[32], w[31], selector);
      w[61] = hc_byte_perm (w[31], w[30], selector);
      w[60] = hc_byte_perm (w[30], w[29], selector);
      w[59] = hc_byte_perm (w[29], w[28], selector);
      w[58] = hc_byte_perm (w[28], w[27], selector);
      w[57] = hc_byte_perm (w[27], w[26], selector);
      w[56] = hc_byte_perm (w[26], w[25], selector);
      w[55] = hc_byte_perm (w[25], w[24], selector);
      w[54] = hc_byte_perm (w[24], w[23], selector);
      w[53] = hc_byte_perm (w[23], w[22], selector);
      w[52] = hc_byte_perm (w[22], w[21], selector);
      w[51] = hc_byte_perm (w[21], w[20], selector);
      w[50] = hc_byte_perm (w[20], w[19], selector);
      w[49] = hc_byte_perm (w[19], w[18], selector);
      w[48] = hc_byte_perm (w[18], w[17], selector);
      w[47] = hc_byte_perm (w[17], w[16], selector);
      w[46] = hc_byte_perm (w[16], w[15], selector);
      w[45] = hc_byte_perm (w[15], w[14], selector);
      w[44] = hc_byte_perm (w[14], w[13], selector);
      w[43] = hc_byte_perm (w[13], w[12], selector);
      w[42] = hc_byte_perm (w[12], w[11], selector);
      w[41] = hc_byte_perm (w[11], w[10], selector);
      w[40] = hc_byte_perm (w[10], w[ 9], selector);
      w[39] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[38] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[37] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[36] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[35] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[34] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[33] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[32] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[31] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[30] = hc_byte_perm (w[ 0],     0, selector);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_byte_perm (w[32], w[31], selector);
      w[62] = hc_byte_perm (w[31], w[30], selector);
      w[61] = hc_byte_perm (w[30], w[29], selector);
      w[60] = hc_byte_perm (w[29], w[28], selector);
      w[59] = hc_byte_perm (w[28], w[27], selector);
      w[58] = hc_byte_perm (w[27], w[26], selector);
      w[57] = hc_byte_perm (w[26], w[25], selector);
      w[56] = hc_byte_perm (w[25], w[24], selector);
      w[55] = hc_byte_perm (w[24], w[23], selector);
      w[54] = hc_byte_perm (w[23], w[22], selector);
      w[53] = hc_byte_perm (w[22], w[21], selector);
      w[52] = hc_byte_perm (w[21], w[20], selector);
      w[51] = hc_byte_perm (w[20], w[19], selector);
      w[50] = hc_byte_perm (w[19], w[18], selector);
      w[49] = hc_byte_perm (w[18], w[17], selector);
      w[48] = hc_byte_perm (w[17], w[16], selector);
      w[47] = hc_byte_perm (w[16], w[15], selector);
      w[46] = hc_byte_perm (w[15], w[14], selector);
      w[45] = hc_byte_perm (w[14], w[13], selector);
      w[44] = hc_byte_perm (w[13], w[12], selector);
      w[43] = hc_byte_perm (w[12], w[11], selector);
      w[42] = hc_byte_perm (w[11], w[10], selector);
      w[41] = hc_byte_perm (w[10], w[ 9], selector);
      w[40] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[39] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[38] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[37] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[36] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[35] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[34] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[33] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[32] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[31] = hc_byte_perm (w[ 0],     0, selector);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_byte_perm (w[31], w[30], selector);
      w[62] = hc_byte_perm (w[30], w[29], selector);
      w[61] = hc_byte_perm (w[29], w[28], selector);
      w[60] = hc_byte_perm (w[28], w[27], selector);
      w[59] = hc_byte_perm (w[27], w[26], selector);
      w[58] = hc_byte_perm (w[26], w[25], selector);
      w[57] = hc_byte_perm (w[25], w[24], selector);
      w[56] = hc_byte_perm (w[24], w[23], selector);
      w[55] = hc_byte_perm (w[23], w[22], selector);
      w[54] = hc_byte_perm (w[22], w[21], selector);
      w[53] = hc_byte_perm (w[21], w[20], selector);
      w[52] = hc_byte_perm (w[20], w[19], selector);
      w[51] = hc_byte_perm (w[19], w[18], selector);
      w[50] = hc_byte_perm (w[18], w[17], selector);
      w[49] = hc_byte_perm (w[17], w[16], selector);
      w[48] = hc_byte_perm (w[16], w[15], selector);
      w[47] = hc_byte_perm (w[15], w[14], selector);
      w[46] = hc_byte_perm (w[14], w[13], selector);
      w[45] = hc_byte_perm (w[13], w[12], selector);
      w[44] = hc_byte_perm (w[12], w[11], selector);
      w[43] = hc_byte_perm (w[11], w[10], selector);
      w[42] = hc_byte_perm (w[10], w[ 9], selector);
      w[41] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[40] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[39] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[38] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[37] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[36] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[35] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[34] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[33] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[32] = hc_byte_perm (w[ 0],     0, selector);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_byte_perm (w[30], w[29], selector);
      w[62] = hc_byte_perm (w[29], w[28], selector);
      w[61] = hc_byte_perm (w[28], w[27], selector);
      w[60] = hc_byte_perm (w[27], w[26], selector);
      w[59] = hc_byte_perm (w[26], w[25], selector);
      w[58] = hc_byte_perm (w[25], w[24], selector);
      w[57] = hc_byte_perm (w[24], w[23], selector);
      w[56] = hc_byte_perm (w[23], w[22], selector);
      w[55] = hc_byte_perm (w[22], w[21], selector);
      w[54] = hc_byte_perm (w[21], w[20], selector);
      w[53] = hc_byte_perm (w[20], w[19], selector);
      w[52] = hc_byte_perm (w[19], w[18], selector);
      w[51] = hc_byte_perm (w[18], w[17], selector);
      w[50] = hc_byte_perm (w[17], w[16], selector);
      w[49] = hc_byte_perm (w[16], w[15], selector);
      w[48] = hc_byte_perm (w[15], w[14], selector);
      w[47] = hc_byte_perm (w[14], w[13], selector);
      w[46] = hc_byte_perm (w[13], w[12], selector);
      w[45] = hc_byte_perm (w[12], w[11], selector);
      w[44] = hc_byte_perm (w[11], w[10], selector);
      w[43] = hc_byte_perm (w[10], w[ 9], selector);
      w[42] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[41] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[40] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[39] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[38] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[37] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[36] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[35] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[34] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[33] = hc_byte_perm (w[ 0],     0, selector);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_byte_perm (w[29], w[28], selector);
      w[62] = hc_byte_perm (w[28], w[27], selector);
      w[61] = hc_byte_perm (w[27], w[26], selector);
      w[60] = hc_byte_perm (w[26], w[25], selector);
      w[59] = hc_byte_perm (w[25], w[24], selector);
      w[58] = hc_byte_perm (w[24], w[23], selector);
      w[57] = hc_byte_perm (w[23], w[22], selector);
      w[56] = hc_byte_perm (w[22], w[21], selector);
      w[55] = hc_byte_perm (w[21], w[20], selector);
      w[54] = hc_byte_perm (w[20], w[19], selector);
      w[53] = hc_byte_perm (w[19], w[18], selector);
      w[52] = hc_byte_perm (w[18], w[17], selector);
      w[51] = hc_byte_perm (w[17], w[16], selector);
      w[50] = hc_byte_perm (w[16], w[15], selector);
      w[49] = hc_byte_perm (w[15], w[14], selector);
      w[48] = hc_byte_perm (w[14], w[13], selector);
      w[47] = hc_byte_perm (w[13], w[12], selector);
      w[46] = hc_byte_perm (w[12], w[11], selector);
      w[45] = hc_byte_perm (w[11], w[10], selector);
      w[44] = hc_byte_perm (w[10], w[ 9], selector);
      w[43] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[42] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[41] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[40] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[39] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[38] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[37] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[36] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[35] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[34] = hc_byte_perm (w[ 0],     0, selector);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_byte_perm (w[28], w[27], selector);
      w[62] = hc_byte_perm (w[27], w[26], selector);
      w[61] = hc_byte_perm (w[26], w[25], selector);
      w[60] = hc_byte_perm (w[25], w[24], selector);
      w[59] = hc_byte_perm (w[24], w[23], selector);
      w[58] = hc_byte_perm (w[23], w[22], selector);
      w[57] = hc_byte_perm (w[22], w[21], selector);
      w[56] = hc_byte_perm (w[21], w[20], selector);
      w[55] = hc_byte_perm (w[20], w[19], selector);
      w[54] = hc_byte_perm (w[19], w[18], selector);
      w[53] = hc_byte_perm (w[18], w[17], selector);
      w[52] = hc_byte_perm (w[17], w[16], selector);
      w[51] = hc_byte_perm (w[16], w[15], selector);
      w[50] = hc_byte_perm (w[15], w[14], selector);
      w[49] = hc_byte_perm (w[14], w[13], selector);
      w[48] = hc_byte_perm (w[13], w[12], selector);
      w[47] = hc_byte_perm (w[12], w[11], selector);
      w[46] = hc_byte_perm (w[11], w[10], selector);
      w[45] = hc_byte_perm (w[10], w[ 9], selector);
      w[44] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[43] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[42] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[41] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[40] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[39] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[38] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[37] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[36] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[35] = hc_byte_perm (w[ 0],     0, selector);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_byte_perm (w[27], w[26], selector);
      w[62] = hc_byte_perm (w[26], w[25], selector);
      w[61] = hc_byte_perm (w[25], w[24], selector);
      w[60] = hc_byte_perm (w[24], w[23], selector);
      w[59] = hc_byte_perm (w[23], w[22], selector);
      w[58] = hc_byte_perm (w[22], w[21], selector);
      w[57] = hc_byte_perm (w[21], w[20], selector);
      w[56] = hc_byte_perm (w[20], w[19], selector);
      w[55] = hc_byte_perm (w[19], w[18], selector);
      w[54] = hc_byte_perm (w[18], w[17], selector);
      w[53] = hc_byte_perm (w[17], w[16], selector);
      w[52] = hc_byte_perm (w[16], w[15], selector);
      w[51] = hc_byte_perm (w[15], w[14], selector);
      w[50] = hc_byte_perm (w[14], w[13], selector);
      w[49] = hc_byte_perm (w[13], w[12], selector);
      w[48] = hc_byte_perm (w[12], w[11], selector);
      w[47] = hc_byte_perm (w[11], w[10], selector);
      w[46] = hc_byte_perm (w[10], w[ 9], selector);
      w[45] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[44] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[43] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[42] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[41] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[40] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[39] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[38] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[37] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[36] = hc_byte_perm (w[ 0],     0, selector);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_byte_perm (w[26], w[25], selector);
      w[62] = hc_byte_perm (w[25], w[24], selector);
      w[61] = hc_byte_perm (w[24], w[23], selector);
      w[60] = hc_byte_perm (w[23], w[22], selector);
      w[59] = hc_byte_perm (w[22], w[21], selector);
      w[58] = hc_byte_perm (w[21], w[20], selector);
      w[57] = hc_byte_perm (w[20], w[19], selector);
      w[56] = hc_byte_perm (w[19], w[18], selector);
      w[55] = hc_byte_perm (w[18], w[17], selector);
      w[54] = hc_byte_perm (w[17], w[16], selector);
      w[53] = hc_byte_perm (w[16], w[15], selector);
      w[52] = hc_byte_perm (w[15], w[14], selector);
      w[51] = hc_byte_perm (w[14], w[13], selector);
      w[50] = hc_byte_perm (w[13], w[12], selector);
      w[49] = hc_byte_perm (w[12], w[11], selector);
      w[48] = hc_byte_perm (w[11], w[10], selector);
      w[47] = hc_byte_perm (w[10], w[ 9], selector);
      w[46] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[45] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[44] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[43] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[42] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[41] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[40] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[39] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[38] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[37] = hc_byte_perm (w[ 0],     0, selector);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_byte_perm (w[25], w[24], selector);
      w[62] = hc_byte_perm (w[24], w[23], selector);
      w[61] = hc_byte_perm (w[23], w[22], selector);
      w[60] = hc_byte_perm (w[22], w[21], selector);
      w[59] = hc_byte_perm (w[21], w[20], selector);
      w[58] = hc_byte_perm (w[20], w[19], selector);
      w[57] = hc_byte_perm (w[19], w[18], selector);
      w[56] = hc_byte_perm (w[18], w[17], selector);
      w[55] = hc_byte_perm (w[17], w[16], selector);
      w[54] = hc_byte_perm (w[16], w[15], selector);
      w[53] = hc_byte_perm (w[15], w[14], selector);
      w[52] = hc_byte_perm (w[14], w[13], selector);
      w[51] = hc_byte_perm (w[13], w[12], selector);
      w[50] = hc_byte_perm (w[12], w[11], selector);
      w[49] = hc_byte_perm (w[11], w[10], selector);
      w[48] = hc_byte_perm (w[10], w[ 9], selector);
      w[47] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[46] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[45] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[44] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[43] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[42] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[41] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[40] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[39] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[38] = hc_byte_perm (w[ 0],     0, selector);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_byte_perm (w[24], w[23], selector);
      w[62] = hc_byte_perm (w[23], w[22], selector);
      w[61] = hc_byte_perm (w[22], w[21], selector);
      w[60] = hc_byte_perm (w[21], w[20], selector);
      w[59] = hc_byte_perm (w[20], w[19], selector);
      w[58] = hc_byte_perm (w[19], w[18], selector);
      w[57] = hc_byte_perm (w[18], w[17], selector);
      w[56] = hc_byte_perm (w[17], w[16], selector);
      w[55] = hc_byte_perm (w[16], w[15], selector);
      w[54] = hc_byte_perm (w[15], w[14], selector);
      w[53] = hc_byte_perm (w[14], w[13], selector);
      w[52] = hc_byte_perm (w[13], w[12], selector);
      w[51] = hc_byte_perm (w[12], w[11], selector);
      w[50] = hc_byte_perm (w[11], w[10], selector);
      w[49] = hc_byte_perm (w[10], w[ 9], selector);
      w[48] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[47] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[46] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[45] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[44] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[43] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[42] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[41] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[40] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[39] = hc_byte_perm (w[ 0],     0, selector);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_byte_perm (w[23], w[22], selector);
      w[62] = hc_byte_perm (w[22], w[21], selector);
      w[61] = hc_byte_perm (w[21], w[20], selector);
      w[60] = hc_byte_perm (w[20], w[19], selector);
      w[59] = hc_byte_perm (w[19], w[18], selector);
      w[58] = hc_byte_perm (w[18], w[17], selector);
      w[57] = hc_byte_perm (w[17], w[16], selector);
      w[56] = hc_byte_perm (w[16], w[15], selector);
      w[55] = hc_byte_perm (w[15], w[14], selector);
      w[54] = hc_byte_perm (w[14], w[13], selector);
      w[53] = hc_byte_perm (w[13], w[12], selector);
      w[52] = hc_byte_perm (w[12], w[11], selector);
      w[51] = hc_byte_perm (w[11], w[10], selector);
      w[50] = hc_byte_perm (w[10], w[ 9], selector);
      w[49] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[48] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[47] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[46] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[45] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[44] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[43] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[42] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[41] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[40] = hc_byte_perm (w[ 0],     0, selector);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_byte_perm (w[22], w[21], selector);
      w[62] = hc_byte_perm (w[21], w[20], selector);
      w[61] = hc_byte_perm (w[20], w[19], selector);
      w[60] = hc_byte_perm (w[19], w[18], selector);
      w[59] = hc_byte_perm (w[18], w[17], selector);
      w[58] = hc_byte_perm (w[17], w[16], selector);
      w[57] = hc_byte_perm (w[16], w[15], selector);
      w[56] = hc_byte_perm (w[15], w[14], selector);
      w[55] = hc_byte_perm (w[14], w[13], selector);
      w[54] = hc_byte_perm (w[13], w[12], selector);
      w[53] = hc_byte_perm (w[12], w[11], selector);
      w[52] = hc_byte_perm (w[11], w[10], selector);
      w[51] = hc_byte_perm (w[10], w[ 9], selector);
      w[50] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[49] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[48] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[47] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[46] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[45] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[44] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[43] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[42] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[41] = hc_byte_perm (w[ 0],     0, selector);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_byte_perm (w[21], w[20], selector);
      w[62] = hc_byte_perm (w[20], w[19], selector);
      w[61] = hc_byte_perm (w[19], w[18], selector);
      w[60] = hc_byte_perm (w[18], w[17], selector);
      w[59] = hc_byte_perm (w[17], w[16], selector);
      w[58] = hc_byte_perm (w[16], w[15], selector);
      w[57] = hc_byte_perm (w[15], w[14], selector);
      w[56] = hc_byte_perm (w[14], w[13], selector);
      w[55] = hc_byte_perm (w[13], w[12], selector);
      w[54] = hc_byte_perm (w[12], w[11], selector);
      w[53] = hc_byte_perm (w[11], w[10], selector);
      w[52] = hc_byte_perm (w[10], w[ 9], selector);
      w[51] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[50] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[49] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[48] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[47] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[46] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[45] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[44] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[43] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[42] = hc_byte_perm (w[ 0],     0, selector);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_byte_perm (w[20], w[19], selector);
      w[62] = hc_byte_perm (w[19], w[18], selector);
      w[61] = hc_byte_perm (w[18], w[17], selector);
      w[60] = hc_byte_perm (w[17], w[16], selector);
      w[59] = hc_byte_perm (w[16], w[15], selector);
      w[58] = hc_byte_perm (w[15], w[14], selector);
      w[57] = hc_byte_perm (w[14], w[13], selector);
      w[56] = hc_byte_perm (w[13], w[12], selector);
      w[55] = hc_byte_perm (w[12], w[11], selector);
      w[54] = hc_byte_perm (w[11], w[10], selector);
      w[53] = hc_byte_perm (w[10], w[ 9], selector);
      w[52] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[51] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[50] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[49] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[48] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[47] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[46] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[45] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[44] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[43] = hc_byte_perm (w[ 0],     0, selector);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_byte_perm (w[19], w[18], selector);
      w[62] = hc_byte_perm (w[18], w[17], selector);
      w[61] = hc_byte_perm (w[17], w[16], selector);
      w[60] = hc_byte_perm (w[16], w[15], selector);
      w[59] = hc_byte_perm (w[15], w[14], selector);
      w[58] = hc_byte_perm (w[14], w[13], selector);
      w[57] = hc_byte_perm (w[13], w[12], selector);
      w[56] = hc_byte_perm (w[12], w[11], selector);
      w[55] = hc_byte_perm (w[11], w[10], selector);
      w[54] = hc_byte_perm (w[10], w[ 9], selector);
      w[53] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[52] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[51] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[50] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[49] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[48] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[47] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[46] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[45] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[44] = hc_byte_perm (w[ 0],     0, selector);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_byte_perm (w[18], w[17], selector);
      w[62] = hc_byte_perm (w[17], w[16], selector);
      w[61] = hc_byte_perm (w[16], w[15], selector);
      w[60] = hc_byte_perm (w[15], w[14], selector);
      w[59] = hc_byte_perm (w[14], w[13], selector);
      w[58] = hc_byte_perm (w[13], w[12], selector);
      w[57] = hc_byte_perm (w[12], w[11], selector);
      w[56] = hc_byte_perm (w[11], w[10], selector);
      w[55] = hc_byte_perm (w[10], w[ 9], selector);
      w[54] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[53] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[52] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[51] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[50] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[49] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[48] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[47] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[46] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[45] = hc_byte_perm (w[ 0],     0, selector);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_byte_perm (w[17], w[16], selector);
      w[62] = hc_byte_perm (w[16], w[15], selector);
      w[61] = hc_byte_perm (w[15], w[14], selector);
      w[60] = hc_byte_perm (w[14], w[13], selector);
      w[59] = hc_byte_perm (w[13], w[12], selector);
      w[58] = hc_byte_perm (w[12], w[11], selector);
      w[57] = hc_byte_perm (w[11], w[10], selector);
      w[56] = hc_byte_perm (w[10], w[ 9], selector);
      w[55] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[54] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[53] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[52] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[51] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[50] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[49] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[48] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[47] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[46] = hc_byte_perm (w[ 0],     0, selector);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_byte_perm (w[16], w[15], selector);
      w[62] = hc_byte_perm (w[15], w[14], selector);
      w[61] = hc_byte_perm (w[14], w[13], selector);
      w[60] = hc_byte_perm (w[13], w[12], selector);
      w[59] = hc_byte_perm (w[12], w[11], selector);
      w[58] = hc_byte_perm (w[11], w[10], selector);
      w[57] = hc_byte_perm (w[10], w[ 9], selector);
      w[56] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[55] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[54] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[53] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[52] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[51] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[50] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[49] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[48] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[47] = hc_byte_perm (w[ 0],     0, selector);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_byte_perm (w[15], w[14], selector);
      w[62] = hc_byte_perm (w[14], w[13], selector);
      w[61] = hc_byte_perm (w[13], w[12], selector);
      w[60] = hc_byte_perm (w[12], w[11], selector);
      w[59] = hc_byte_perm (w[11], w[10], selector);
      w[58] = hc_byte_perm (w[10], w[ 9], selector);
      w[57] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[56] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[55] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[54] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[53] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[52] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[51] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[50] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[49] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[48] = hc_byte_perm (w[ 0],     0, selector);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_byte_perm (w[14], w[13], selector);
      w[62] = hc_byte_perm (w[13], w[12], selector);
      w[61] = hc_byte_perm (w[12], w[11], selector);
      w[60] = hc_byte_perm (w[11], w[10], selector);
      w[59] = hc_byte_perm (w[10], w[ 9], selector);
      w[58] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[57] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[56] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[55] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[54] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[53] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[52] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[51] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[50] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[49] = hc_byte_perm (w[ 0],     0, selector);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_byte_perm (w[13], w[12], selector);
      w[62] = hc_byte_perm (w[12], w[11], selector);
      w[61] = hc_byte_perm (w[11], w[10], selector);
      w[60] = hc_byte_perm (w[10], w[ 9], selector);
      w[59] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[58] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[57] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[56] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[55] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[54] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[53] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[52] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[51] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[50] = hc_byte_perm (w[ 0],     0, selector);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_byte_perm (w[12], w[11], selector);
      w[62] = hc_byte_perm (w[11], w[10], selector);
      w[61] = hc_byte_perm (w[10], w[ 9], selector);
      w[60] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[59] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[58] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[57] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[56] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[55] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[54] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[53] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[52] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[51] = hc_byte_perm (w[ 0],     0, selector);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_byte_perm (w[11], w[10], selector);
      w[62] = hc_byte_perm (w[10], w[ 9], selector);
      w[61] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[60] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[59] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[58] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[57] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[56] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[55] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[54] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[53] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[52] = hc_byte_perm (w[ 0],     0, selector);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_byte_perm (w[10], w[ 9], selector);
      w[62] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[61] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[60] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[59] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[58] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[57] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[56] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[55] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[54] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[53] = hc_byte_perm (w[ 0],     0, selector);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_byte_perm (w[ 9], w[ 8], selector);
      w[62] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[61] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[60] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[59] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[58] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[57] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[56] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[55] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[54] = hc_byte_perm (w[ 0],     0, selector);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_byte_perm (w[ 8], w[ 7], selector);
      w[62] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[61] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[60] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[59] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[58] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[57] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[56] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[55] = hc_byte_perm (w[ 0],     0, selector);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_byte_perm (w[ 7], w[ 6], selector);
      w[62] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[61] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[60] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[59] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[58] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[57] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[56] = hc_byte_perm (w[ 0],     0, selector);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_byte_perm (w[ 6], w[ 5], selector);
      w[62] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[61] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[60] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[59] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[58] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[57] = hc_byte_perm (w[ 0],     0, selector);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_byte_perm (w[ 5], w[ 4], selector);
      w[62] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[61] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[60] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[59] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[58] = hc_byte_perm (w[ 0],     0, selector);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_byte_perm (w[ 4], w[ 3], selector);
      w[62] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[61] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[60] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[59] = hc_byte_perm (w[ 0],     0, selector);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_byte_perm (w[ 3], w[ 2], selector);
      w[62] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[61] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[60] = hc_byte_perm (w[ 0],     0, selector);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_byte_perm (w[ 2], w[ 1], selector);
      w[62] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[61] = hc_byte_perm (w[ 0],     0, selector);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_byte_perm (w[ 1], w[ 0], selector);
      w[62] = hc_byte_perm (w[ 0],     0, selector);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_byte_perm (w[ 0],     0, selector);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif
}

/**
 * vector functions as scalar (for outer loop usage)
 */

/**
 * Truncate a 16-byte block (four u32 words) to its first `len` bytes.
 *
 * Bytes are counted starting at the least significant byte of w0[0]
 * (little-endian layout inside each word). Every byte at position
 * `len` or beyond is cleared; bytes below `len` are left untouched.
 *
 * w0  : the four words to truncate in place
 * len : number of leading bytes to keep; values >= 16 leave the block as is
 */
DECLSPEC void truncate_block_4x4_le_S (u32 *w0, const u32 len)
{
  // nothing to cut off once the whole block is covered
  if (len >= 16) return;

  // mask keeping the low (len % 4) bytes of the partially used word;
  // evaluates to 0 when len is word aligned, which clears that word entirely
  const u32 keep = (1u << ((len & 3) * 8)) - 1;

  // select the word that contains byte position `len`, mask it and
  // wipe every word after it (indices stay compile-time constants)
  switch (len / 4)
  {
    case 0:
      w0[0] &= keep;
      w0[1]  = 0;
      w0[2]  = 0;
      w0[3]  = 0;

      break;

    case 1:
      w0[1] &= keep;
      w0[2]  = 0;
      w0[3]  = 0;

      break;

    case 2:
      w0[2] &= keep;
      w0[3]  = 0;

      break;

    case 3:
      w0[3] &= keep;

      break;
  }
}

/**
 * Truncate a 16-byte block (four u32 words) to its first `len` bytes.
 *
 * Bytes are counted starting at the most significant byte of w0[0]
 * (big-endian layout inside each word). Every byte at position
 * `len` or beyond is cleared; bytes below `len` are left untouched.
 *
 * w0  : the four words to truncate in place
 * len : number of leading bytes to keep; values >= 16 leave the block as is
 */
DECLSPEC void truncate_block_4x4_be_S (u32 *w0, const u32 len)
{
  // nothing to cut off once the whole block is covered
  if (len >= 16) return;

  // mask keeping the high (len % 4) bytes of the partially used word;
  // evaluates to 0 when len is word aligned, which clears that word entirely
  const u32 keep = ~(0xffffffffu >> ((len & 3) * 8));

  // select the word that contains byte position `len`, mask it and
  // wipe every word after it (indices stay compile-time constants)
  switch (len / 4)
  {
    case 0:
      w0[0] &= keep;
      w0[1]  = 0;
      w0[2]  = 0;
      w0[3]  = 0;

      break;

    case 1:
      w0[1] &= keep;
      w0[2]  = 0;
      w0[3]  = 0;

      break;

    case 2:
      w0[2] &= keep;
      w0[3]  = 0;

      break;

    case 3:
      w0[3] &= keep;

      break;
  }
}

DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len)
{
  switch (len)
  {
    case  0:
      w0[0]  = 0;
      w0[1]  = 0;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  1:
      w0[0] &= 0x000000ff;
      w0[1]  = 0;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  2:
      w0[0] &= 0x0000ffff;
      w0[1]  = 0;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  3:
      w0[0] &= 0x00ffffff;
      w0[1]  = 0;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  4:
      w0[1]  = 0;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  5:
      w0[1] &= 0x000000ff;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  6:
      w0[1] &= 0x0000ffff;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  7:
      w0[1] &= 0x00ffffff;
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  8:
      w0[2]  = 0;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case  9:
      w0[2] &= 0x000000ff;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 10:
      w0[2] &= 0x0000ffff;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 11:
      w0[2] &= 0x00ffffff;
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 12:
      w0[3]  = 0;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 13:
      w0[3] &= 0x000000ff;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 14:
      w0[3] &= 0x0000ffff;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 15:
      w0[3] &= 0x00ffffff;
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 16:
      w1[0]  = 0;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 17:
      w1[0] &= 0x000000ff;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 18:
      w1[0] &= 0x0000ffff;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 19:
      w1[0] &= 0x00ffffff;
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 20:
      w1[1]  = 0;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 21:
      w1[1] &= 0x000000ff;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 22:
      w1[1] &= 0x0000ffff;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 23:
      w1[1] &= 0x00ffffff;
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 24:
      w1[2]  = 0;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 25:
      w1[2] &= 0x000000ff;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 26:
      w1[2] &= 0x0000ffff;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 27:
      w1[2] &= 0x00ffffff;
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 28:
      w1[3]  = 0;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 29:
      w1[3] &= 0x000000ff;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 30:
      w1[3] &= 0x0000ffff;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 31:
      w1[3] &= 0x00ffffff;
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 32:
      w2[0]  = 0;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 33:
      w2[0] &= 0x000000ff;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 34:
      w2[0] &= 0x0000ffff;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 35:
      w2[0] &= 0x00ffffff;
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 36:
      w2[1]  = 0;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 37:
      w2[1] &= 0x000000ff;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 38:
      w2[1] &= 0x0000ffff;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 39:
      w2[1] &= 0x00ffffff;
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 40:
      w2[2]  = 0;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 41:
      w2[2] &= 0x000000ff;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 42:
      w2[2] &= 0x0000ffff;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 43:
      w2[2] &= 0x00ffffff;
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 44:
      w2[3]  = 0;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 45:
      w2[3] &= 0x000000ff;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 46:
      w2[3] &= 0x0000ffff;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 47:
      w2[3] &= 0x00ffffff;
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 48:
      w3[0]  = 0;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 49:
      w3[0] &= 0x000000ff;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 50:
      w3[0] &= 0x0000ffff;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 51:
      w3[0] &= 0x00ffffff;
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 52:
      w3[1]  = 0;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 53:
      w3[1] &= 0x000000ff;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 54:
      w3[1] &= 0x0000ffff;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 55:
      w3[1] &= 0x00ffffff;
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 56:
      w3[2]  = 0;
      w3[3]  = 0;

      break;

    case 57:
      w3[2] &= 0x000000ff;
      w3[3]  = 0;

      break;

    case 58:
      w3[2] &= 0x0000ffff;
      w3[3]  = 0;

      break;

    case 59:
      w3[2] &= 0x00ffffff;
      w3[3]  = 0;

      break;

    case 60:
      w3[3]  = 0;

      break;

    case 61:
      w3[3] &= 0x000000ff;

      break;

    case 62:
      w3[3] &= 0x0000ffff;

      break;

    case 63:
      w3[3] &= 0x00ffffff;

      break;
  }
}

/**
 * Keep the first `len` bytes of the 64-byte block made of w0[0..3], w1[0..3],
 * w2[0..3] and w3[0..3] (in that order) and zero everything after them.
 *
 * Bytes inside a word are counted from the most significant end, so a cut that
 * falls inside a word keeps that word's upper bytes.
 *
 * A `len` of 64 or more leaves the block untouched.
 */
DECLSPEC void truncate_block_16x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len)
{
  u32 *quads[4] = { w0, w1, w2, w3 };

  const u32 cut_word   = len / 4; // index of the word the cut falls into
  const u32 tail_bytes = len & 3; // number of bytes of that word to keep

  // 1 -> 0xff000000, 2 -> 0xffff0000, 3 -> 0xffffff00, 0 -> clear the whole word
  const u32 cut_mask = (tail_bytes == 0) ? 0 : (0xffffffff << (32 - (tail_bytes * 8)));

  for (u32 idx = 0; idx < 16; idx++)
  {
    // words entirely in front of the cut are not written at all

    if (idx < cut_word) continue;

    u32 *word = &quads[idx / 4][idx & 3];

    if (idx == cut_word)
    {
      *word &= cut_mask;
    }
    else
    {
      *word = 0;
    }
  }
}

/**
 * Build a four-word byte mask that selects exactly one byte.
 *
 * Only the low four bits of `offset` are used: word (offset & 15) / 4 receives
 * 0xff in byte lane (offset & 3), counted from the least significant byte.
 * The other three words are set to zero.
 *
 * v      : output, four words, fully overwritten
 * offset : byte position to mark
 */
DECLSPEC void set_mark_1x4_S (u32 *v, const u32 offset)
{
  const u32 c = (offset & 15) / 4;

  // unsigned literal: shifting a signed 0xff left by 24 would move a bit into
  // the sign position, which ISO C leaves undefined

  const u32 r = 0xffu << ((offset & 3) * 8);

  v[0] = (c == 0) ? r : 0;
  v[1] = (c == 1) ? r : 0;
  v[2] = (c == 2) ? r : 0;
  v[3] = (c == 3) ? r : 0;
}

/**
 * OR the bits of `v` selected by the mask words m[0..3] into r[0..3].
 *
 * r : four words, updated in place
 * v : value whose bits are merged in
 * m : four mask words, one per word of r
 */
DECLSPEC void append_helper_1x4_S (u32 *r, const u32 v, const u32 *m)
{
  for (int i = 0; i < 4; i++)
  {
    r[i] |= v & m[i];
  }
}

/**
 * OR the byte 0x01 into position `offset` of the 32-byte buffer w0|w1.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * If offset / 16 is neither 0 nor 1, both quads are left unchanged.
 */
DECLSPEC void append_0x01_2x4_S (u32 *w0, u32 *w1, const u32 offset)
{
  u32 *quads[2] = { w0, w1 };

  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 target = offset / 16; // quad that holds the byte

  for (u32 q = 0; q < 2; q++)
  {
    append_helper_1x4_S (quads[q], ((q == target) ? 0x01010101 : 0), mark);
  }
}

/**
 * OR the byte 0x06 into position `offset` of the 32-byte buffer w0|w1.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * If offset / 16 is neither 0 nor 1, both quads are left unchanged.
 */
DECLSPEC void append_0x06_2x4_S (u32 *w0, u32 *w1, const u32 offset)
{
  u32 *quads[2] = { w0, w1 };

  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 target = offset / 16; // quad that holds the byte

  for (u32 q = 0; q < 2; q++)
  {
    append_helper_1x4_S (quads[q], ((q == target) ? 0x06060606 : 0), mark);
  }
}

/**
 * OR the byte 0x01 into position `offset` of the 64-byte buffer w0|w1|w2|w3.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * If offset / 16 is outside 0..3, all four quads are left unchanged.
 */
DECLSPEC void append_0x01_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset)
{
  u32 *quads[4] = { w0, w1, w2, w3 };

  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 target = offset / 16; // quad that holds the byte

  for (u32 q = 0; q < 4; q++)
  {
    append_helper_1x4_S (quads[q], ((q == target) ? 0x01010101 : 0), mark);
  }
}

/**
 * OR the byte 0x80 into position (offset & 15) of the 16-byte buffer w0.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * There is no range check here: the mark only looks at the low four bits of
 * `offset`, so larger offsets wrap around inside w0.
 */
DECLSPEC void append_0x80_1x4_S (u32 *w0, const u32 offset)
{
  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 fill = 0x80808080;

  append_helper_1x4_S (w0, fill, mark);
}

/**
 * OR the byte 0x80 into position `offset` of the 32-byte buffer w0|w1.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * If offset / 16 is neither 0 nor 1, both quads are left unchanged.
 */
DECLSPEC void append_0x80_2x4_S (u32 *w0, u32 *w1, const u32 offset)
{
  u32 *quads[2] = { w0, w1 };

  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 target = offset / 16; // quad that holds the byte

  for (u32 q = 0; q < 2; q++)
  {
    append_helper_1x4_S (quads[q], ((q == target) ? 0x80808080 : 0), mark);
  }
}

/**
 * OR the byte 0x80 into position `offset` of the 48-byte buffer w0|w1|w2.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * If offset / 16 is outside 0..2, all three quads are left unchanged.
 */
DECLSPEC void append_0x80_3x4_S (u32 *w0, u32 *w1, u32 *w2, const u32 offset)
{
  u32 *quads[3] = { w0, w1, w2 };

  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 target = offset / 16; // quad that holds the byte

  for (u32 q = 0; q < 3; q++)
  {
    append_helper_1x4_S (quads[q], ((q == target) ? 0x80808080 : 0), mark);
  }
}

/**
 * OR the byte 0x80 into position `offset` of the 64-byte buffer w0|w1|w2|w3.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * If offset / 16 is outside 0..3, all four quads are left unchanged.
 */
DECLSPEC void append_0x80_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset)
{
  u32 *quads[4] = { w0, w1, w2, w3 };

  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 target = offset / 16; // quad that holds the byte

  for (u32 q = 0; q < 4; q++)
  {
    append_helper_1x4_S (quads[q], ((q == target) ? 0x80808080 : 0), mark);
  }
}

/**
 * OR the byte 0x80 into position `offset` of the 128-byte buffer w0|w1|...|w7.
 *
 * The byte lane inside a word is counted from the least significant byte.
 * If offset / 16 is outside 0..7, all eight quads are left unchanged.
 */
DECLSPEC void append_0x80_8x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset)
{
  u32 *quads[8] = { w0, w1, w2, w3, w4, w5, w6, w7 };

  u32 mark[4];

  set_mark_1x4_S (mark, offset);

  const u32 target = offset / 16; // quad that holds the byte

  for (u32 q = 0; q < 8; q++)
  {
    append_helper_1x4_S (quads[q], ((q == target) ? 0x80808080 : 0), mark);
  }
}

/**
 * Widen the 16 bytes in in[0..3] to 16-bit units spread over out1[0..3] and
 * out2[0..3].
 *
 * In the generic fallback every input byte ends up in the upper byte of a
 * 16-bit lane (bits 8..15 or 24..31) and the lower byte of the lane is zero.
 * The vendor branches use hc_byte_perm_S with fixed selectors and are
 * presumably equivalent - hc_byte_perm_S is defined elsewhere.
 *
 * Input words are consumed from in[3] down to in[0], and each one fills two
 * output words, the higher-indexed one first.
 */
DECLSPEC void make_utf16be_S (const u32 *in, u32 *out1, u32 *out2)
{
  for (int i = 3; i >= 0; i--)
  {
    // in[2], in[3] expand into out2; in[0], in[1] expand into out1

    u32 *dst = (i >= 2) ? out2 : out1;

    const int lo = (i & 1) * 2;
    const int hi = lo + 1;

    #if defined IS_NV

    dst[hi] = hc_byte_perm_S (in[i], 0, 0x3727);
    dst[lo] = hc_byte_perm_S (in[i], 0, 0x1707);

    #elif defined IS_AMD && AMD_GCN >= 3

    dst[hi] = hc_byte_perm_S (in[i], 0, 0x03070207);
    dst[lo] = hc_byte_perm_S (in[i], 0, 0x01070007);

    #else

    // upper two input bytes
    dst[hi] = (in[i] & 0xFF000000) | ((in[i] & 0x00FF0000) >> 8);

    // lower two input bytes
    dst[lo] = ((in[i] & 0x0000FF00) << 16) | ((in[i] & 0x000000FF) << 8);

    #endif
  }
}

/**
 * Widen the 16 bytes in in[0..3] to 16-bit units spread over out1[0..3] and
 * out2[0..3].
 *
 * In the generic fallback every input byte ends up in the lower byte of a
 * 16-bit lane (bits 0..7 or 16..23) and the upper byte of the lane is zero.
 * The vendor branches use hc_byte_perm_S with fixed selectors and are
 * presumably equivalent - hc_byte_perm_S is defined elsewhere.
 *
 * Input words are consumed from in[3] down to in[0], and each one fills two
 * output words, the higher-indexed one first.
 */
DECLSPEC void make_utf16le_S (const u32 *in, u32 *out1, u32 *out2)
{
  for (int i = 3; i >= 0; i--)
  {
    // in[2], in[3] expand into out2; in[0], in[1] expand into out1

    u32 *dst = (i >= 2) ? out2 : out1;

    const int lo = (i & 1) * 2;
    const int hi = lo + 1;

    #if defined IS_NV

    dst[hi] = hc_byte_perm_S (in[i], 0, 0x7372);
    dst[lo] = hc_byte_perm_S (in[i], 0, 0x7170);

    #elif defined IS_AMD && AMD_GCN >= 3

    dst[hi] = hc_byte_perm_S (in[i], 0, 0x07030702);
    dst[lo] = hc_byte_perm_S (in[i], 0, 0x07010700);

    #else

    // upper two input bytes
    dst[hi] = ((in[i] & 0xFF000000) >> 8) | ((in[i] & 0x00FF0000) >> 16);

    // lower two input bytes
    dst[lo] = ((in[i] & 0x0000FF00) << 8) | (in[i] & 0x000000FF);

    #endif
  }
}

/**
 * Pack eight words of 16-bit units (in1[0..3], in2[0..3]) back into four
 * words of bytes in out[0..3].
 *
 * In the generic fallback each output word gathers bytes 1 and 3 (bits 8..15
 * and 24..31) of two consecutive input words; bytes 0 and 2 are dropped.
 *
 * out[0] and out[1] are built from in1, out[2] and out[3] from in2, written
 * in ascending order.
 */
DECLSPEC void undo_utf16be_S (const u32 *in1, const u32 *in2, u32 *out)
{
  for (int i = 0; i < 4; i++)
  {
    const u32 *src = (i < 2) ? in1 : in2;

    const int a = (i & 1) * 2;
    const int b = a + 1;

    // NOTE(review): the generic fallback below gathers bytes 1 and 3 of each
    // input word. Whether the selectors 0x4602 / 0x04060002 pick the same
    // bytes depends on hc_byte_perm_S, which is defined elsewhere - TODO
    // confirm that all three branches agree.

    #if defined IS_NV

    out[i] = hc_byte_perm_S (src[a], src[b], 0x4602);

    #elif defined IS_AMD && AMD_GCN >= 3

    out[i] = hc_byte_perm_S (src[a], src[b], 0x04060002);

    #else

    out[i] = ((src[a] >>  8) & 0x000000ff) | ((src[a] >> 16) & 0x0000ff00)
           | ((src[b] <<  8) & 0x00ff0000) | ((src[b] >>  0) & 0xff000000);

    #endif
  }
}

/**
 * Pack eight words of 16-bit units (in1[0..3], in2[0..3]) back into four
 * words of bytes in out[0..3].
 *
 * In the generic fallback each output word gathers bytes 0 and 2 (bits 0..7
 * and 16..23) of two consecutive input words; bytes 1 and 3 are dropped.
 * The vendor branches use hc_byte_perm_S with fixed selectors and are
 * presumably equivalent - hc_byte_perm_S is defined elsewhere.
 *
 * out[0] and out[1] are built from in1, out[2] and out[3] from in2, written
 * in ascending order.
 */
DECLSPEC void undo_utf16le_S (const u32 *in1, const u32 *in2, u32 *out)
{
  for (int i = 0; i < 4; i++)
  {
    const u32 *src = (i < 2) ? in1 : in2;

    const int a = (i & 1) * 2;
    const int b = a + 1;

    #if defined IS_NV

    out[i] = hc_byte_perm_S (src[a], src[b], 0x6420);

    #elif defined IS_AMD && AMD_GCN >= 3

    out[i] = hc_byte_perm_S (src[a], src[b], 0x06040200);

    #else

    out[i] = ((src[a] >>  0) & 0x000000ff) | ((src[a] >>  8) & 0x0000ff00)
           | ((src[b] << 16) & 0x00ff0000) | ((src[b] <<  8) & 0xff000000);

    #endif
  }
}

DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  w0[0] = swap32_S (w0[0]);
  w0[1] = swap32_S (w0[1]);
  w0[2] = swap32_S (w0[2]);
  w0[3] = swap32_S (w0[3]);
  w1[0] = swap32_S (w1[0]);
  w1[1] = swap32_S (w1[1]);
  w1[2] = swap32_S (w1[2]);
  w1[3] = swap32_S (w1[3]);
  w2[0] = swap32_S (w2[0]);
  w2[1] = swap32_S (w2[1]);
  w2[2] = swap32_S (w2[2]);
  w2[3] = swap32_S (w2[3]);
  w3[0] = swap32_S (w3[0]);
  w3[1] = swap32_S (w3[1]);
  w3[2] = swap32_S (w3[2]);
  w3[3] = swap32_S (w3[3]);

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_S (    0, w0[0], offset);

      break;

    case  1:
      w3[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_S (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_S (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_S (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_S (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_S (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_S (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_S (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_S (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_S (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_S (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_S (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_S (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_S (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_S (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_bytealign_S (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  w0[0] = swap32_S (w0[0]);
  w0[1] = swap32_S (w0[1]);
  w0[2] = swap32_S (w0[2]);
  w0[3] = swap32_S (w0[3]);
  w1[0] = swap32_S (w1[0]);
  w1[1] = swap32_S (w1[1]);
  w1[2] = swap32_S (w1[2]);
  w1[3] = swap32_S (w1[3]);
  w2[0] = swap32_S (w2[0]);
  w2[1] = swap32_S (w2[1]);
  w2[2] = swap32_S (w2[2]);
  w2[3] = swap32_S (w2[3]);
  w3[0] = swap32_S (w3[0]);
  w3[1] = swap32_S (w3[1]);
  w3[2] = swap32_S (w3[2]);
  w3[3] = swap32_S (w3[3]);
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_byte_perm_S (w3[2], w3[3], selector);
      w3[2] = hc_byte_perm_S (w3[1], w3[2], selector);
      w3[1] = hc_byte_perm_S (w3[0], w3[1], selector);
      w3[0] = hc_byte_perm_S (w2[3], w3[0], selector);
      w2[3] = hc_byte_perm_S (w2[2], w2[3], selector);
      w2[2] = hc_byte_perm_S (w2[1], w2[2], selector);
      w2[1] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[0] = hc_byte_perm_S (w1[3], w2[0], selector);
      w1[3] = hc_byte_perm_S (w1[2], w1[3], selector);
      w1[2] = hc_byte_perm_S (w1[1], w1[2], selector);
      w1[1] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[0] = hc_byte_perm_S (w0[3], w1[0], selector);
      w0[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w0[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w0[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[0] = hc_byte_perm_S (    0, w0[0], selector);

      break;

    case  1:
      w3[3] = hc_byte_perm_S (w3[1], w3[2], selector);
      w3[2] = hc_byte_perm_S (w3[0], w3[1], selector);
      w3[1] = hc_byte_perm_S (w2[3], w3[0], selector);
      w3[0] = hc_byte_perm_S (w2[2], w2[3], selector);
      w2[3] = hc_byte_perm_S (w2[1], w2[2], selector);
      w2[2] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[1] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[0] = hc_byte_perm_S (w1[2], w1[3], selector);
      w1[3] = hc_byte_perm_S (w1[1], w1[2], selector);
      w1[2] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[1] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[0] = hc_byte_perm_S (w0[2], w0[3], selector);
      w0[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w0[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[1] = hc_byte_perm_S (    0, w0[0], selector);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_byte_perm_S (w3[0], w3[1], selector);
      w3[2] = hc_byte_perm_S (w2[3], w3[0], selector);
      w3[1] = hc_byte_perm_S (w2[2], w2[3], selector);
      w3[0] = hc_byte_perm_S (w2[1], w2[2], selector);
      w2[3] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[2] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[1] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[0] = hc_byte_perm_S (w1[1], w1[2], selector);
      w1[3] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[2] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[1] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[0] = hc_byte_perm_S (w0[1], w0[2], selector);
      w0[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[2] = hc_byte_perm_S (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_byte_perm_S (w2[3], w3[0], selector);
      w3[2] = hc_byte_perm_S (w2[2], w2[3], selector);
      w3[1] = hc_byte_perm_S (w2[1], w2[2], selector);
      w3[0] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[3] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[2] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[1] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[0] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[3] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[2] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[1] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[0] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[3] = hc_byte_perm_S (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_byte_perm_S (w2[2], w2[3], selector);
      w3[2] = hc_byte_perm_S (w2[1], w2[2], selector);
      w3[1] = hc_byte_perm_S (w2[0], w2[1], selector);
      w3[0] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[3] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[2] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[1] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[0] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[0] = hc_byte_perm_S (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_byte_perm_S (w2[1], w2[2], selector);
      w3[2] = hc_byte_perm_S (w2[0], w2[1], selector);
      w3[1] = hc_byte_perm_S (w1[3], w2[0], selector);
      w3[0] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[3] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[2] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[1] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[0] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[1] = hc_byte_perm_S (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_byte_perm_S (w2[0], w2[1], selector);
      w3[2] = hc_byte_perm_S (w1[3], w2[0], selector);
      w3[1] = hc_byte_perm_S (w1[2], w1[3], selector);
      w3[0] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[3] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[2] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[1] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[0] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[2] = hc_byte_perm_S (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_byte_perm_S (w1[3], w2[0], selector);
      w3[2] = hc_byte_perm_S (w1[2], w1[3], selector);
      w3[1] = hc_byte_perm_S (w1[1], w1[2], selector);
      w3[0] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[3] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[2] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[1] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[0] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[3] = hc_byte_perm_S (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_byte_perm_S (w1[2], w1[3], selector);
      w3[2] = hc_byte_perm_S (w1[1], w1[2], selector);
      w3[1] = hc_byte_perm_S (w1[0], w1[1], selector);
      w3[0] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[0] = hc_byte_perm_S (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_byte_perm_S (w1[1], w1[2], selector);
      w3[2] = hc_byte_perm_S (w1[0], w1[1], selector);
      w3[1] = hc_byte_perm_S (w0[3], w1[0], selector);
      w3[0] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[1] = hc_byte_perm_S (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_byte_perm_S (w1[0], w1[1], selector);
      w3[2] = hc_byte_perm_S (w0[3], w1[0], selector);
      w3[1] = hc_byte_perm_S (w0[2], w0[3], selector);
      w3[0] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[2] = hc_byte_perm_S (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_byte_perm_S (w0[3], w1[0], selector);
      w3[2] = hc_byte_perm_S (w0[2], w0[3], selector);
      w3[1] = hc_byte_perm_S (w0[1], w0[2], selector);
      w3[0] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[3] = hc_byte_perm_S (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w3[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w3[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w3[0] = hc_byte_perm_S (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w3[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w3[1] = hc_byte_perm_S (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w3[2] = hc_byte_perm_S (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_byte_perm_S (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if defined IS_AMD || defined IS_GENERIC
  w0[0] = swap32_S (w0[0]);
  w0[1] = swap32_S (w0[1]);
  w0[2] = swap32_S (w0[2]);
  w0[3] = swap32_S (w0[3]);
  w1[0] = swap32_S (w1[0]);
  w1[1] = swap32_S (w1[1]);
  w1[2] = swap32_S (w1[2]);
  w1[3] = swap32_S (w1[3]);
  w2[0] = swap32_S (w2[0]);
  w2[1] = swap32_S (w2[1]);
  w2[2] = swap32_S (w2[2]);
  w2[3] = swap32_S (w2[3]);
  w3[0] = swap32_S (w3[0]);
  w3[1] = swap32_S (w3[1]);
  w3[2] = swap32_S (w3[2]);
  w3[3] = swap32_S (w3[3]);

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign_S (w3[3],     0, offset);
      w3[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_S (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign_S (w3[3],     0, offset);
      c0[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_S (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign_S (w3[3],     0, offset);
      c0[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_S (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign_S (w3[3],     0, offset);
      c0[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_S (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign_S (w3[3],     0, offset);
      c0[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_S (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign_S (w3[3],     0, offset);
      c1[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_S (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign_S (w3[3],     0, offset);
      c1[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_S (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign_S (w3[3],     0, offset);
      c1[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_S (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign_S (w3[3],     0, offset);
      c1[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[0] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[3] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[2] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[1] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_S (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign_S (w3[3],     0, offset);
      c2[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[0] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[3] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[2] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[1] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_S (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign_S (w3[3],     0, offset);
      c2[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[0] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[3] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[2] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[1] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_S (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign_S (w3[3],     0, offset);
      c2[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[0] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[3] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[2] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[1] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_S (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign_S (w3[3],     0, offset);
      c2[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[0] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[3] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[2] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[1] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[0] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[3] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[2] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[1] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_S (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign_S (w3[3],     0, offset);
      c3[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[0] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[3] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[2] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[1] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[0] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[3] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[2] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[1] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_S (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign_S (w3[3],     0, offset);
      c3[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c3[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c2[0] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[3] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[2] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[1] = hc_bytealign_S (w1[2], w1[3], offset);
      c1[0] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[3] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[2] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[1] = hc_bytealign_S (w0[2], w0[3], offset);
      c0[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_S (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign_S (w3[3],     0, offset);
      c3[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c3[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c3[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c2[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c2[0] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[3] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[2] = hc_bytealign_S (w1[2], w1[3], offset);
      c1[1] = hc_bytealign_S (w1[1], w1[2], offset);
      c1[0] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[3] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[2] = hc_bytealign_S (w0[2], w0[3], offset);
      c0[1] = hc_bytealign_S (w0[1], w0[2], offset);
      c0[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_S (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  w0[0] = swap32_S (w0[0]);
  w0[1] = swap32_S (w0[1]);
  w0[2] = swap32_S (w0[2]);
  w0[3] = swap32_S (w0[3]);
  w1[0] = swap32_S (w1[0]);
  w1[1] = swap32_S (w1[1]);
  w1[2] = swap32_S (w1[2]);
  w1[3] = swap32_S (w1[3]);
  w2[0] = swap32_S (w2[0]);
  w2[1] = swap32_S (w2[1]);
  w2[2] = swap32_S (w2[2]);
  w2[3] = swap32_S (w2[3]);
  w3[0] = swap32_S (w3[0]);
  w3[1] = swap32_S (w3[1]);
  w3[2] = swap32_S (w3[2]);
  w3[3] = swap32_S (w3[3]);
  c0[0] = swap32_S (c0[0]);
  c0[1] = swap32_S (c0[1]);
  c0[2] = swap32_S (c0[2]);
  c0[3] = swap32_S (c0[3]);
  c1[0] = swap32_S (c1[0]);
  c1[1] = swap32_S (c1[1]);
  c1[2] = swap32_S (c1[2]);
  c1[3] = swap32_S (c1[3]);
  c2[0] = swap32_S (c2[0]);
  c2[1] = swap32_S (c2[1]);
  c2[2] = swap32_S (c2[2]);
  c2[3] = swap32_S (c2[3]);
  c3[0] = swap32_S (c3[0]);
  c3[1] = swap32_S (c3[1]);
  c3[2] = swap32_S (c3[2]);
  c3[3] = swap32_S (c3[3]);
  #endif

  #ifdef IS_NV
  // todo
  switch (offset_switch)
  {
    case 0:
      c0[0] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      w3[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      w3[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      w3[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      w3[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      w2[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      w2[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      w2[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w2[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w1[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w1[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w1[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w1[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w0[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w0[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w0[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w0[0] = hc_bytealign_S (w0[0],     0, offset_minus_4);

      if (offset_mod_4 == 0)
      {
        w0[0] = w0[1];
        w0[1] = w0[2];
        w0[2] = w0[3];
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = 0;
      }

      break;

    case 1:
      c0[1] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c0[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      w3[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      w3[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      w3[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      w3[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      w2[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      w2[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w2[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w2[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w1[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w1[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w1[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w1[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w0[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w0[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w0[1] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w0[1] = w0[2];
        w0[2] = w0[3];
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = 0;
      }

      break;

    case 2:
      c0[2] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c0[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c0[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      w3[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      w3[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      w3[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      w3[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      w2[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w2[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w2[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w2[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w1[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w1[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w1[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w1[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w0[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w0[2] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w0[2] = w0[3];
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = 0;
      }

      break;

    case 3:
      c0[3] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c0[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c0[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c0[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      w3[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      w3[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      w3[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      w3[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w2[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w2[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w2[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w2[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w1[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w1[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w1[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w1[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w0[3] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w0[3] = w1[0];
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = 0;
      }

      break;

    case 4:
      c1[0] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c0[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c0[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c0[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c0[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      w3[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      w3[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      w3[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w3[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w2[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w2[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w2[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w2[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w1[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w1[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w1[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w1[0] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[0] = w1[1];
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = 0;
      }

      break;

    case 5:
      c1[1] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c1[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c0[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c0[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c0[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c0[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      w3[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      w3[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w3[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w3[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w2[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w2[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w2[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w2[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w1[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w1[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w1[1] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[1] = w1[2];
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = 0;
      }

      break;

    case 6:
      c1[2] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c1[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c1[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c0[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c0[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c0[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c0[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      w3[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w3[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w3[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w3[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w2[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w2[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w2[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w2[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w1[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w1[2] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[2] = w1[3];
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = 0;
      }

      break;

    case 7:
      c1[3] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c1[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c1[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c1[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c0[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c0[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c0[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c0[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      w3[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w3[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w3[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w3[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w2[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w2[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w2[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w2[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w1[3] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w1[3] = w2[0];
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = 0;
      }

      break;

    case 8:
      c2[0] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c1[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c1[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c1[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c1[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c0[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c0[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c0[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c0[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      w3[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w3[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w3[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w3[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w2[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w2[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w2[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w2[0] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[0] = w2[1];
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = 0;
      }

      break;

    case 9:
      c2[1] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c2[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c1[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c1[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c1[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c1[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c0[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c0[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c0[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      c0[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      w3[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w3[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w3[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w3[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w2[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w2[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w2[1] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[1] = w2[2];
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = 0;
      }

      break;

    case 10:
      c2[2] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c2[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c2[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c1[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c1[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c1[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c1[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c0[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c0[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      c0[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      c0[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      w3[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w3[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w3[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w3[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w2[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w2[2] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[2] = w2[3];
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = 0;
      }

      break;

    case 11:
      c2[3] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c2[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c2[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c2[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c1[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c1[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c1[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c1[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c0[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      c0[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      c0[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      c0[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      w3[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w3[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w3[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w3[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w2[3] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w2[3] = w3[0];
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = 0;
      }

      break;

    case 12:
      c3[0] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c2[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c2[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c2[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c2[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c1[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c1[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c1[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c1[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      c0[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      c0[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      c0[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      c0[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      w3[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w3[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w3[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w3[0] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[0] = w3[1];
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = 0;
      }

      break;

    case 13:
      c3[1] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c3[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c2[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c2[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c2[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c2[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c1[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c1[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c1[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      c1[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      c0[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      c0[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      c0[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      c0[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      w3[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w3[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w3[1] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[1] = w3[2];
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = c3[1];
        c3[1] = 0;
      }

      break;

    case 14:
      c3[2] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c3[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c3[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c2[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c2[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c2[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c2[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c1[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c1[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      c1[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      c1[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      c0[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      c0[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      c0[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      c0[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      w3[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w3[2] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[2] = w3[3];
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = c3[1];
        c3[1] = c3[2];
        c3[2] = 0;
      }

      break;

    case 15:
      c3[3] = hc_bytealign_S (    0, w3[3], offset_minus_4);
      c3[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4);
      c3[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4);
      c3[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4);
      c2[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4);
      c2[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4);
      c2[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4);
      c2[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4);
      c1[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4);
      c1[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4);
      c1[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4);
      c1[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4);
      c0[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4);
      c0[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4);
      c0[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4);
      c0[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4);
      w3[3] = hc_bytealign_S (w0[0],     0, offset_minus_4);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      if (offset_mod_4 == 0)
      {
        w3[3] = c0[0];
        c0[0] = c0[1];
        c0[1] = c0[2];
        c0[2] = c0[3];
        c0[3] = c1[0];
        c1[0] = c1[1];
        c1[1] = c1[2];
        c1[2] = c1[3];
        c1[3] = c2[0];
        c2[0] = c2[1];
        c2[1] = c2[2];
        c2[2] = c2[3];
        c2[3] = c3[0];
        c3[0] = c3[1];
        c3[1] = c3[2];
        c3[2] = c3[3];
        c3[3] = 0;
      }

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_S (    0, w0[0], offset);

      break;

    case  1:
      w3[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_S (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_S (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_S (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_S (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_S (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_S (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_S (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_S (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_S (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_S (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_S (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_S (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_S (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_S (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_bytealign_S (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm_S (w0[0],     0, selector);

      break;

    case  1:
      w3[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm_S (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm_S (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm_S (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm_S (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm_S (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm_S (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm_S (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm_S (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm_S (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm_S (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm_S (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm_S (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm_S (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm_S (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_byte_perm_S (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign_S (w3[3],     0, offset);
      w3[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_S (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign_S (w3[3],     0, offset);
      c0[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_S (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign_S (w3[3],     0, offset);
      c0[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_S (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign_S (w3[3],     0, offset);
      c0[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_S (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign_S (w3[3],     0, offset);
      c0[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_S (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign_S (w3[3],     0, offset);
      c1[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_S (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign_S (w3[3],     0, offset);
      c1[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_S (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign_S (w3[3],     0, offset);
      c1[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_S (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign_S (w3[3],     0, offset);
      c1[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[0] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[3] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[2] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[1] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_S (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign_S (w3[3],     0, offset);
      c2[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[0] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[3] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[2] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[1] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_S (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign_S (w3[3],     0, offset);
      c2[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[0] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[3] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[2] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[1] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_S (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign_S (w3[3],     0, offset);
      c2[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[0] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[3] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[2] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[1] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_S (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign_S (w3[3],     0, offset);
      c2[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[0] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[3] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[2] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[1] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[0] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[3] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[2] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[1] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_S (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign_S (w3[3],     0, offset);
      c3[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[0] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[3] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[2] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[1] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[0] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[3] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[2] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[1] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_S (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign_S (w3[3],     0, offset);
      c3[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c3[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c2[0] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[3] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[2] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[1] = hc_bytealign_S (w1[2], w1[3], offset);
      c1[0] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[3] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[2] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[1] = hc_bytealign_S (w0[2], w0[3], offset);
      c0[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_S (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign_S (w3[3],     0, offset);
      c3[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c3[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c3[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c2[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c2[0] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[3] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[2] = hc_bytealign_S (w1[2], w1[3], offset);
      c1[1] = hc_bytealign_S (w1[1], w1[2], offset);
      c1[0] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[3] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[2] = hc_bytealign_S (w0[2], w0[3], offset);
      c0[1] = hc_bytealign_S (w0[1], w0[2], offset);
      c0[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_S (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_byte_perm_S (    0, w3[3], selector);
      w3[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm_S (w0[0],     0, selector);

      break;

    case  1:
      c0[1] = hc_byte_perm_S (    0, w3[3], selector);
      c0[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm_S (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_byte_perm_S (    0, w3[3], selector);
      c0[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm_S (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_byte_perm_S (    0, w3[3], selector);
      c0[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm_S (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_byte_perm_S (    0, w3[3], selector);
      c0[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm_S (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_byte_perm_S (    0, w3[3], selector);
      c1[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm_S (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_byte_perm_S (    0, w3[3], selector);
      c1[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm_S (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_byte_perm_S (    0, w3[3], selector);
      c1[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm_S (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_byte_perm_S (    0, w3[3], selector);
      c1[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm_S (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_byte_perm_S (    0, w3[3], selector);
      c2[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm_S (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_byte_perm_S (    0, w3[3], selector);
      c2[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm_S (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_byte_perm_S (    0, w3[3], selector);
      c2[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm_S (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_byte_perm_S (    0, w3[3], selector);
      c2[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm_S (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_byte_perm_S (    0, w3[3], selector);
      c3[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      c2[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      c1[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      c0[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm_S (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_byte_perm_S (    0, w3[3], selector);
      c3[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c3[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      c2[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      c2[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      c1[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      c1[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      c0[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      c0[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm_S (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_byte_perm_S (    0, w3[3], selector);
      c3[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c3[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c3[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      c2[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      c2[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      c2[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      c1[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      c1[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      c1[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      c0[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      c0[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      c0[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm_S (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  w0[0] = swap32_S (w0[0]);
  w0[1] = swap32_S (w0[1]);
  w0[2] = swap32_S (w0[2]);
  w0[3] = swap32_S (w0[3]);
  w1[0] = swap32_S (w1[0]);
  w1[1] = swap32_S (w1[1]);
  w1[2] = swap32_S (w1[2]);
  w1[3] = swap32_S (w1[3]);
  w2[0] = swap32_S (w2[0]);
  w2[1] = swap32_S (w2[1]);
  w2[2] = swap32_S (w2[2]);
  w2[3] = swap32_S (w2[3]);
  w3[0] = swap32_S (w3[0]);
  w3[1] = swap32_S (w3[1]);
  w3[2] = swap32_S (w3[2]);
  w3[3] = swap32_S (w3[3]);
  w4[0] = swap32_S (w4[0]);
  w4[1] = swap32_S (w4[1]);
  w4[2] = swap32_S (w4[2]);
  w4[3] = swap32_S (w4[3]);
  w5[0] = swap32_S (w5[0]);
  w5[1] = swap32_S (w5[1]);
  w5[2] = swap32_S (w5[2]);
  w5[3] = swap32_S (w5[3]);
  w6[0] = swap32_S (w6[0]);
  w6[1] = swap32_S (w6[1]);
  w6[2] = swap32_S (w6[2]);
  w6[3] = swap32_S (w6[3]);
  w7[0] = swap32_S (w7[0]);
  w7[1] = swap32_S (w7[1]);
  w7[2] = swap32_S (w7[2]);
  w7[3] = swap32_S (w7[3]);

  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_bytealign_S (w7[2], w7[3], offset);
      w7[2] = hc_bytealign_S (w7[1], w7[2], offset);
      w7[1] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[0] = hc_bytealign_S (w6[3], w7[0], offset);
      w6[3] = hc_bytealign_S (w6[2], w6[3], offset);
      w6[2] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[1] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[0] = hc_bytealign_S (w5[3], w6[0], offset);
      w5[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w5[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w4[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w4[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w3[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_S (    0, w0[0], offset);

      break;

    case  1:
      w7[3] = hc_bytealign_S (w7[1], w7[2], offset);
      w7[2] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[1] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[0] = hc_bytealign_S (w6[2], w6[3], offset);
      w6[3] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[2] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[1] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[0] = hc_bytealign_S (w5[2], w5[3], offset);
      w5[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w4[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_S (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[2] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[1] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[0] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[3] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[2] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[1] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[0] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_S (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[2] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[1] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[0] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[3] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[2] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[1] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[0] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_S (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[2] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[1] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[0] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_S (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[2] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[1] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[0] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_S (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[2] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[1] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[0] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_S (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[2] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[1] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[0] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_S (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_S (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_S (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_S (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_S (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_S (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_S (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_S (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_S (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[0] = hc_bytealign_S (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[1] = hc_bytealign_S (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[2] = hc_bytealign_S (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[3] = hc_bytealign_S (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[0] = hc_bytealign_S (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[1] = hc_bytealign_S (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[2] = hc_bytealign_S (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[3] = hc_bytealign_S (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[0] = hc_bytealign_S (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[1] = hc_bytealign_S (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[2] = hc_bytealign_S (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[3] = hc_bytealign_S (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[0] = hc_bytealign_S (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[1] = hc_bytealign_S (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[2] = hc_bytealign_S (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_bytealign_S (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  w0[0] = swap32_S (w0[0]);
  w0[1] = swap32_S (w0[1]);
  w0[2] = swap32_S (w0[2]);
  w0[3] = swap32_S (w0[3]);
  w1[0] = swap32_S (w1[0]);
  w1[1] = swap32_S (w1[1]);
  w1[2] = swap32_S (w1[2]);
  w1[3] = swap32_S (w1[3]);
  w2[0] = swap32_S (w2[0]);
  w2[1] = swap32_S (w2[1]);
  w2[2] = swap32_S (w2[2]);
  w2[3] = swap32_S (w2[3]);
  w3[0] = swap32_S (w3[0]);
  w3[1] = swap32_S (w3[1]);
  w3[2] = swap32_S (w3[2]);
  w3[3] = swap32_S (w3[3]);
  w4[0] = swap32_S (w4[0]);
  w4[1] = swap32_S (w4[1]);
  w4[2] = swap32_S (w4[2]);
  w4[3] = swap32_S (w4[3]);
  w5[0] = swap32_S (w5[0]);
  w5[1] = swap32_S (w5[1]);
  w5[2] = swap32_S (w5[2]);
  w5[3] = swap32_S (w5[3]);
  w6[0] = swap32_S (w6[0]);
  w6[1] = swap32_S (w6[1]);
  w6[2] = swap32_S (w6[2]);
  w6[3] = swap32_S (w6[3]);
  w7[0] = swap32_S (w7[0]);
  w7[1] = swap32_S (w7[1]);
  w7[2] = swap32_S (w7[2]);
  w7[3] = swap32_S (w7[3]);
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
  #endif

  switch (offset_switch)
  {
    case 0:
      w7[3] = hc_byte_perm_S (w7[2], w7[3], selector);
      w7[2] = hc_byte_perm_S (w7[1], w7[2], selector);
      w7[1] = hc_byte_perm_S (w7[0], w7[1], selector);
      w7[0] = hc_byte_perm_S (w6[3], w7[0], selector);
      w6[3] = hc_byte_perm_S (w6[2], w6[3], selector);
      w6[2] = hc_byte_perm_S (w6[1], w6[2], selector);
      w6[1] = hc_byte_perm_S (w6[0], w6[1], selector);
      w6[0] = hc_byte_perm_S (w5[3], w6[0], selector);
      w5[3] = hc_byte_perm_S (w5[2], w5[3], selector);
      w5[2] = hc_byte_perm_S (w5[1], w5[2], selector);
      w5[1] = hc_byte_perm_S (w5[0], w5[1], selector);
      w5[0] = hc_byte_perm_S (w4[3], w5[0], selector);
      w4[3] = hc_byte_perm_S (w4[2], w4[3], selector);
      w4[2] = hc_byte_perm_S (w4[1], w4[2], selector);
      w4[1] = hc_byte_perm_S (w4[0], w4[1], selector);
      w4[0] = hc_byte_perm_S (w3[3], w4[0], selector);
      w3[3] = hc_byte_perm_S (w3[2], w3[3], selector);
      w3[2] = hc_byte_perm_S (w3[1], w3[2], selector);
      w3[1] = hc_byte_perm_S (w3[0], w3[1], selector);
      w3[0] = hc_byte_perm_S (w2[3], w3[0], selector);
      w2[3] = hc_byte_perm_S (w2[2], w2[3], selector);
      w2[2] = hc_byte_perm_S (w2[1], w2[2], selector);
      w2[1] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[0] = hc_byte_perm_S (w1[3], w2[0], selector);
      w1[3] = hc_byte_perm_S (w1[2], w1[3], selector);
      w1[2] = hc_byte_perm_S (w1[1], w1[2], selector);
      w1[1] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[0] = hc_byte_perm_S (w0[3], w1[0], selector);
      w0[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w0[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w0[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[0] = hc_byte_perm_S (    0, w0[0], selector);
      break;

    case 1:
      w7[3] = hc_byte_perm_S (w7[1], w7[2], selector);
      w7[2] = hc_byte_perm_S (w7[0], w7[1], selector);
      w7[1] = hc_byte_perm_S (w6[3], w7[0], selector);
      w7[0] = hc_byte_perm_S (w6[2], w6[3], selector);
      w6[3] = hc_byte_perm_S (w6[1], w6[2], selector);
      w6[2] = hc_byte_perm_S (w6[0], w6[1], selector);
      w6[1] = hc_byte_perm_S (w5[3], w6[0], selector);
      w6[0] = hc_byte_perm_S (w5[2], w5[3], selector);
      w5[3] = hc_byte_perm_S (w5[1], w5[2], selector);
      w5[2] = hc_byte_perm_S (w5[0], w5[1], selector);
      w5[1] = hc_byte_perm_S (w4[3], w5[0], selector);
      w5[0] = hc_byte_perm_S (w4[2], w4[3], selector);
      w4[3] = hc_byte_perm_S (w4[1], w4[2], selector);
      w4[2] = hc_byte_perm_S (w4[0], w4[1], selector);
      w4[1] = hc_byte_perm_S (w3[3], w4[0], selector);
      w4[0] = hc_byte_perm_S (w3[2], w3[3], selector);
      w3[3] = hc_byte_perm_S (w3[1], w3[2], selector);
      w3[2] = hc_byte_perm_S (w3[0], w3[1], selector);
      w3[1] = hc_byte_perm_S (w2[3], w3[0], selector);
      w3[0] = hc_byte_perm_S (w2[2], w2[3], selector);
      w2[3] = hc_byte_perm_S (w2[1], w2[2], selector);
      w2[2] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[1] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[0] = hc_byte_perm_S (w1[2], w1[3], selector);
      w1[3] = hc_byte_perm_S (w1[1], w1[2], selector);
      w1[2] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[1] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[0] = hc_byte_perm_S (w0[2], w0[3], selector);
      w0[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w0[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[1] = hc_byte_perm_S (    0, w0[0], selector);
      w0[0] = 0;
      break;

    case 2:
      w7[3] = hc_byte_perm_S (w7[0], w7[1], selector);
      w7[2] = hc_byte_perm_S (w6[3], w7[0], selector);
      w7[1] = hc_byte_perm_S (w6[2], w6[3], selector);
      w7[0] = hc_byte_perm_S (w6[1], w6[2], selector);
      w6[3] = hc_byte_perm_S (w6[0], w6[1], selector);
      w6[2] = hc_byte_perm_S (w5[3], w6[0], selector);
      w6[1] = hc_byte_perm_S (w5[2], w5[3], selector);
      w6[0] = hc_byte_perm_S (w5[1], w5[2], selector);
      w5[3] = hc_byte_perm_S (w5[0], w5[1], selector);
      w5[2] = hc_byte_perm_S (w4[3], w5[0], selector);
      w5[1] = hc_byte_perm_S (w4[2], w4[3], selector);
      w5[0] = hc_byte_perm_S (w4[1], w4[2], selector);
      w4[3] = hc_byte_perm_S (w4[0], w4[1], selector);
      w4[2] = hc_byte_perm_S (w3[3], w4[0], selector);
      w4[1] = hc_byte_perm_S (w3[2], w3[3], selector);
      w4[0] = hc_byte_perm_S (w3[1], w3[2], selector);
      w3[3] = hc_byte_perm_S (w3[0], w3[1], selector);
      w3[2] = hc_byte_perm_S (w2[3], w3[0], selector);
      w3[1] = hc_byte_perm_S (w2[2], w2[3], selector);
      w3[0] = hc_byte_perm_S (w2[1], w2[2], selector);
      w2[3] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[2] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[1] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[0] = hc_byte_perm_S (w1[1], w1[2], selector);
      w1[3] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[2] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[1] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[0] = hc_byte_perm_S (w0[1], w0[2], selector);
      w0[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[2] = hc_byte_perm_S (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 3:
      w7[3] = hc_byte_perm_S (w6[3], w7[0], selector);
      w7[2] = hc_byte_perm_S (w6[2], w6[3], selector);
      w7[1] = hc_byte_perm_S (w6[1], w6[2], selector);
      w7[0] = hc_byte_perm_S (w6[0], w6[1], selector);
      w6[3] = hc_byte_perm_S (w5[3], w6[0], selector);
      w6[2] = hc_byte_perm_S (w5[2], w5[3], selector);
      w6[1] = hc_byte_perm_S (w5[1], w5[2], selector);
      w6[0] = hc_byte_perm_S (w5[0], w5[1], selector);
      w5[3] = hc_byte_perm_S (w4[3], w5[0], selector);
      w5[2] = hc_byte_perm_S (w4[2], w4[3], selector);
      w5[1] = hc_byte_perm_S (w4[1], w4[2], selector);
      w5[0] = hc_byte_perm_S (w4[0], w4[1], selector);
      w4[3] = hc_byte_perm_S (w3[3], w4[0], selector);
      w4[2] = hc_byte_perm_S (w3[2], w3[3], selector);
      w4[1] = hc_byte_perm_S (w3[1], w3[2], selector);
      w4[0] = hc_byte_perm_S (w3[0], w3[1], selector);
      w3[3] = hc_byte_perm_S (w2[3], w3[0], selector);
      w3[2] = hc_byte_perm_S (w2[2], w2[3], selector);
      w3[1] = hc_byte_perm_S (w2[1], w2[2], selector);
      w3[0] = hc_byte_perm_S (w2[0], w2[1], selector);
      w2[3] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[2] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[1] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[0] = hc_byte_perm_S (w1[0], w1[1], selector);
      w1[3] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[2] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[1] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[0] = hc_byte_perm_S (w0[0], w0[1], selector);
      w0[3] = hc_byte_perm_S (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 4:
      w7[3] = hc_byte_perm_S (w6[2], w6[3], selector);
      w7[2] = hc_byte_perm_S (w6[1], w6[2], selector);
      w7[1] = hc_byte_perm_S (w6[0], w6[1], selector);
      w7[0] = hc_byte_perm_S (w5[3], w6[0], selector);
      w6[3] = hc_byte_perm_S (w5[2], w5[3], selector);
      w6[2] = hc_byte_perm_S (w5[1], w5[2], selector);
      w6[1] = hc_byte_perm_S (w5[0], w5[1], selector);
      w6[0] = hc_byte_perm_S (w4[3], w5[0], selector);
      w5[3] = hc_byte_perm_S (w4[2], w4[3], selector);
      w5[2] = hc_byte_perm_S (w4[1], w4[2], selector);
      w5[1] = hc_byte_perm_S (w4[0], w4[1], selector);
      w5[0] = hc_byte_perm_S (w3[3], w4[0], selector);
      w4[3] = hc_byte_perm_S (w3[2], w3[3], selector);
      w4[2] = hc_byte_perm_S (w3[1], w3[2], selector);
      w4[1] = hc_byte_perm_S (w3[0], w3[1], selector);
      w4[0] = hc_byte_perm_S (w2[3], w3[0], selector);
      w3[3] = hc_byte_perm_S (w2[2], w2[3], selector);
      w3[2] = hc_byte_perm_S (w2[1], w2[2], selector);
      w3[1] = hc_byte_perm_S (w2[0], w2[1], selector);
      w3[0] = hc_byte_perm_S (w1[3], w2[0], selector);
      w2[3] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[2] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[1] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[0] = hc_byte_perm_S (w0[3], w1[0], selector);
      w1[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[0] = hc_byte_perm_S (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 5:
      w7[3] = hc_byte_perm_S (w6[1], w6[2], selector);
      w7[2] = hc_byte_perm_S (w6[0], w6[1], selector);
      w7[1] = hc_byte_perm_S (w5[3], w6[0], selector);
      w7[0] = hc_byte_perm_S (w5[2], w5[3], selector);
      w6[3] = hc_byte_perm_S (w5[1], w5[2], selector);
      w6[2] = hc_byte_perm_S (w5[0], w5[1], selector);
      w6[1] = hc_byte_perm_S (w4[3], w5[0], selector);
      w6[0] = hc_byte_perm_S (w4[2], w4[3], selector);
      w5[3] = hc_byte_perm_S (w4[1], w4[2], selector);
      w5[2] = hc_byte_perm_S (w4[0], w4[1], selector);
      w5[1] = hc_byte_perm_S (w3[3], w4[0], selector);
      w5[0] = hc_byte_perm_S (w3[2], w3[3], selector);
      w4[3] = hc_byte_perm_S (w3[1], w3[2], selector);
      w4[2] = hc_byte_perm_S (w3[0], w3[1], selector);
      w4[1] = hc_byte_perm_S (w2[3], w3[0], selector);
      w4[0] = hc_byte_perm_S (w2[2], w2[3], selector);
      w3[3] = hc_byte_perm_S (w2[1], w2[2], selector);
      w3[2] = hc_byte_perm_S (w2[0], w2[1], selector);
      w3[1] = hc_byte_perm_S (w1[3], w2[0], selector);
      w3[0] = hc_byte_perm_S (w1[2], w1[3], selector);
      w2[3] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[2] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[1] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[0] = hc_byte_perm_S (w0[2], w0[3], selector);
      w1[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[1] = hc_byte_perm_S (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 6:
      w7[3] = hc_byte_perm_S (w6[0], w6[1], selector);
      w7[2] = hc_byte_perm_S (w5[3], w6[0], selector);
      w7[1] = hc_byte_perm_S (w5[2], w5[3], selector);
      w7[0] = hc_byte_perm_S (w5[1], w5[2], selector);
      w6[3] = hc_byte_perm_S (w5[0], w5[1], selector);
      w6[2] = hc_byte_perm_S (w4[3], w5[0], selector);
      w6[1] = hc_byte_perm_S (w4[2], w4[3], selector);
      w6[0] = hc_byte_perm_S (w4[1], w4[2], selector);
      w5[3] = hc_byte_perm_S (w4[0], w4[1], selector);
      w5[2] = hc_byte_perm_S (w3[3], w4[0], selector);
      w5[1] = hc_byte_perm_S (w3[2], w3[3], selector);
      w5[0] = hc_byte_perm_S (w3[1], w3[2], selector);
      w4[3] = hc_byte_perm_S (w3[0], w3[1], selector);
      w4[2] = hc_byte_perm_S (w2[3], w3[0], selector);
      w4[1] = hc_byte_perm_S (w2[2], w2[3], selector);
      w4[0] = hc_byte_perm_S (w2[1], w2[2], selector);
      w3[3] = hc_byte_perm_S (w2[0], w2[1], selector);
      w3[2] = hc_byte_perm_S (w1[3], w2[0], selector);
      w3[1] = hc_byte_perm_S (w1[2], w1[3], selector);
      w3[0] = hc_byte_perm_S (w1[1], w1[2], selector);
      w2[3] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[2] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[1] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[0] = hc_byte_perm_S (w0[1], w0[2], selector);
      w1[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[2] = hc_byte_perm_S (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 7:
      w7[3] = hc_byte_perm_S (w5[3], w6[0], selector);
      w7[2] = hc_byte_perm_S (w5[2], w5[3], selector);
      w7[1] = hc_byte_perm_S (w5[1], w5[2], selector);
      w7[0] = hc_byte_perm_S (w5[0], w5[1], selector);
      w6[3] = hc_byte_perm_S (w4[3], w5[0], selector);
      w6[2] = hc_byte_perm_S (w4[2], w4[3], selector);
      w6[1] = hc_byte_perm_S (w4[1], w4[2], selector);
      w6[0] = hc_byte_perm_S (w4[0], w4[1], selector);
      w5[3] = hc_byte_perm_S (w3[3], w4[0], selector);
      w5[2] = hc_byte_perm_S (w3[2], w3[3], selector);
      w5[1] = hc_byte_perm_S (w3[1], w3[2], selector);
      w5[0] = hc_byte_perm_S (w3[0], w3[1], selector);
      w4[3] = hc_byte_perm_S (w2[3], w3[0], selector);
      w4[2] = hc_byte_perm_S (w2[2], w2[3], selector);
      w4[1] = hc_byte_perm_S (w2[1], w2[2], selector);
      w4[0] = hc_byte_perm_S (w2[0], w2[1], selector);
      w3[3] = hc_byte_perm_S (w1[3], w2[0], selector);
      w3[2] = hc_byte_perm_S (w1[2], w1[3], selector);
      w3[1] = hc_byte_perm_S (w1[1], w1[2], selector);
      w3[0] = hc_byte_perm_S (w1[0], w1[1], selector);
      w2[3] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[2] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[1] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[0] = hc_byte_perm_S (w0[0], w0[1], selector);
      w1[3] = hc_byte_perm_S (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 8:
      w7[3] = hc_byte_perm_S (w5[2], w5[3], selector);
      w7[2] = hc_byte_perm_S (w5[1], w5[2], selector);
      w7[1] = hc_byte_perm_S (w5[0], w5[1], selector);
      w7[0] = hc_byte_perm_S (w4[3], w5[0], selector);
      w6[3] = hc_byte_perm_S (w4[2], w4[3], selector);
      w6[2] = hc_byte_perm_S (w4[1], w4[2], selector);
      w6[1] = hc_byte_perm_S (w4[0], w4[1], selector);
      w6[0] = hc_byte_perm_S (w3[3], w4[0], selector);
      w5[3] = hc_byte_perm_S (w3[2], w3[3], selector);
      w5[2] = hc_byte_perm_S (w3[1], w3[2], selector);
      w5[1] = hc_byte_perm_S (w3[0], w3[1], selector);
      w5[0] = hc_byte_perm_S (w2[3], w3[0], selector);
      w4[3] = hc_byte_perm_S (w2[2], w2[3], selector);
      w4[2] = hc_byte_perm_S (w2[1], w2[2], selector);
      w4[1] = hc_byte_perm_S (w2[0], w2[1], selector);
      w4[0] = hc_byte_perm_S (w1[3], w2[0], selector);
      w3[3] = hc_byte_perm_S (w1[2], w1[3], selector);
      w3[2] = hc_byte_perm_S (w1[1], w1[2], selector);
      w3[1] = hc_byte_perm_S (w1[0], w1[1], selector);
      w3[0] = hc_byte_perm_S (w0[3], w1[0], selector);
      w2[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[0] = hc_byte_perm_S (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 9:
      w7[3] = hc_byte_perm_S (w5[1], w5[2], selector);
      w7[2] = hc_byte_perm_S (w5[0], w5[1], selector);
      w7[1] = hc_byte_perm_S (w4[3], w5[0], selector);
      w7[0] = hc_byte_perm_S (w4[2], w4[3], selector);
      w6[3] = hc_byte_perm_S (w4[1], w4[2], selector);
      w6[2] = hc_byte_perm_S (w4[0], w4[1], selector);
      w6[1] = hc_byte_perm_S (w3[3], w4[0], selector);
      w6[0] = hc_byte_perm_S (w3[2], w3[3], selector);
      w5[3] = hc_byte_perm_S (w3[1], w3[2], selector);
      w5[2] = hc_byte_perm_S (w3[0], w3[1], selector);
      w5[1] = hc_byte_perm_S (w2[3], w3[0], selector);
      w5[0] = hc_byte_perm_S (w2[2], w2[3], selector);
      w4[3] = hc_byte_perm_S (w2[1], w2[2], selector);
      w4[2] = hc_byte_perm_S (w2[0], w2[1], selector);
      w4[1] = hc_byte_perm_S (w1[3], w2[0], selector);
      w4[0] = hc_byte_perm_S (w1[2], w1[3], selector);
      w3[3] = hc_byte_perm_S (w1[1], w1[2], selector);
      w3[2] = hc_byte_perm_S (w1[0], w1[1], selector);
      w3[1] = hc_byte_perm_S (w0[3], w1[0], selector);
      w3[0] = hc_byte_perm_S (w0[2], w0[3], selector);
      w2[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[1] = hc_byte_perm_S (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 10:
      w7[3] = hc_byte_perm_S (w5[0], w5[1], selector);
      w7[2] = hc_byte_perm_S (w4[3], w5[0], selector);
      w7[1] = hc_byte_perm_S (w4[2], w4[3], selector);
      w7[0] = hc_byte_perm_S (w4[1], w4[2], selector);
      w6[3] = hc_byte_perm_S (w4[0], w4[1], selector);
      w6[2] = hc_byte_perm_S (w3[3], w4[0], selector);
      w6[1] = hc_byte_perm_S (w3[2], w3[3], selector);
      w6[0] = hc_byte_perm_S (w3[1], w3[2], selector);
      w5[3] = hc_byte_perm_S (w3[0], w3[1], selector);
      w5[2] = hc_byte_perm_S (w2[3], w3[0], selector);
      w5[1] = hc_byte_perm_S (w2[2], w2[3], selector);
      w5[0] = hc_byte_perm_S (w2[1], w2[2], selector);
      w4[3] = hc_byte_perm_S (w2[0], w2[1], selector);
      w4[2] = hc_byte_perm_S (w1[3], w2[0], selector);
      w4[1] = hc_byte_perm_S (w1[2], w1[3], selector);
      w4[0] = hc_byte_perm_S (w1[1], w1[2], selector);
      w3[3] = hc_byte_perm_S (w1[0], w1[1], selector);
      w3[2] = hc_byte_perm_S (w0[3], w1[0], selector);
      w3[1] = hc_byte_perm_S (w0[2], w0[3], selector);
      w3[0] = hc_byte_perm_S (w0[1], w0[2], selector);
      w2[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[2] = hc_byte_perm_S (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 11:
      w7[3] = hc_byte_perm_S (w4[3], w5[0], selector);
      w7[2] = hc_byte_perm_S (w4[2], w4[3], selector);
      w7[1] = hc_byte_perm_S (w4[1], w4[2], selector);
      w7[0] = hc_byte_perm_S (w4[0], w4[1], selector);
      w6[3] = hc_byte_perm_S (w3[3], w4[0], selector);
      w6[2] = hc_byte_perm_S (w3[2], w3[3], selector);
      w6[1] = hc_byte_perm_S (w3[1], w3[2], selector);
      w6[0] = hc_byte_perm_S (w3[0], w3[1], selector);
      w5[3] = hc_byte_perm_S (w2[3], w3[0], selector);
      w5[2] = hc_byte_perm_S (w2[2], w2[3], selector);
      w5[1] = hc_byte_perm_S (w2[1], w2[2], selector);
      w5[0] = hc_byte_perm_S (w2[0], w2[1], selector);
      w4[3] = hc_byte_perm_S (w1[3], w2[0], selector);
      w4[2] = hc_byte_perm_S (w1[2], w1[3], selector);
      w4[1] = hc_byte_perm_S (w1[1], w1[2], selector);
      w4[0] = hc_byte_perm_S (w1[0], w1[1], selector);
      w3[3] = hc_byte_perm_S (w0[3], w1[0], selector);
      w3[2] = hc_byte_perm_S (w0[2], w0[3], selector);
      w3[1] = hc_byte_perm_S (w0[1], w0[2], selector);
      w3[0] = hc_byte_perm_S (w0[0], w0[1], selector);
      w2[3] = hc_byte_perm_S (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 12:
      w7[3] = hc_byte_perm_S (w4[2], w4[3], selector);
      w7[2] = hc_byte_perm_S (w4[1], w4[2], selector);
      w7[1] = hc_byte_perm_S (w4[0], w4[1], selector);
      w7[0] = hc_byte_perm_S (w3[3], w4[0], selector);
      w6[3] = hc_byte_perm_S (w3[2], w3[3], selector);
      w6[2] = hc_byte_perm_S (w3[1], w3[2], selector);
      w6[1] = hc_byte_perm_S (w3[0], w3[1], selector);
      w6[0] = hc_byte_perm_S (w2[3], w3[0], selector);
      w5[3] = hc_byte_perm_S (w2[2], w2[3], selector);
      w5[2] = hc_byte_perm_S (w2[1], w2[2], selector);
      w5[1] = hc_byte_perm_S (w2[0], w2[1], selector);
      w5[0] = hc_byte_perm_S (w1[3], w2[0], selector);
      w4[3] = hc_byte_perm_S (w1[2], w1[3], selector);
      w4[2] = hc_byte_perm_S (w1[1], w1[2], selector);
      w4[1] = hc_byte_perm_S (w1[0], w1[1], selector);
      w4[0] = hc_byte_perm_S (w0[3], w1[0], selector);
      w3[3] = hc_byte_perm_S (w0[2], w0[3], selector);
      w3[2] = hc_byte_perm_S (w0[1], w0[2], selector);
      w3[1] = hc_byte_perm_S (w0[0], w0[1], selector);
      w3[0] = hc_byte_perm_S (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 13:
      w7[3] = hc_byte_perm_S (w4[1], w4[2], selector);
      w7[2] = hc_byte_perm_S (w4[0], w4[1], selector);
      w7[1] = hc_byte_perm_S (w3[3], w4[0], selector);
      w7[0] = hc_byte_perm_S (w3[2], w3[3], selector);
      w6[3] = hc_byte_perm_S (w3[1], w3[2], selector);
      w6[2] = hc_byte_perm_S (w3[0], w3[1], selector);
      w6[1] = hc_byte_perm_S (w2[3], w3[0], selector);
      w6[0] = hc_byte_perm_S (w2[2], w2[3], selector);
      w5[3] = hc_byte_perm_S (w2[1], w2[2], selector);
      w5[2] = hc_byte_perm_S (w2[0], w2[1], selector);
      w5[1] = hc_byte_perm_S (w1[3], w2[0], selector);
      w5[0] = hc_byte_perm_S (w1[2], w1[3], selector);
      w4[3] = hc_byte_perm_S (w1[1], w1[2], selector);
      w4[2] = hc_byte_perm_S (w1[0], w1[1], selector);
      w4[1] = hc_byte_perm_S (w0[3], w1[0], selector);
      w4[0] = hc_byte_perm_S (w0[2], w0[3], selector);
      w3[3] = hc_byte_perm_S (w0[1], w0[2], selector);
      w3[2] = hc_byte_perm_S (w0[0], w0[1], selector);
      w3[1] = hc_byte_perm_S (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 14:
      w7[3] = hc_byte_perm_S (w4[0], w4[1], selector);
      w7[2] = hc_byte_perm_S (w3[3], w4[0], selector);
      w7[1] = hc_byte_perm_S (w3[2], w3[3], selector);
      w7[0] = hc_byte_perm_S (w3[1], w3[2], selector);
      w6[3] = hc_byte_perm_S (w3[0], w3[1], selector);
      w6[2] = hc_byte_perm_S (w2[3], w3[0], selector);
      w6[1] = hc_byte_perm_S (w2[2], w2[3], selector);
      w6[0] = hc_byte_perm_S (w2[1], w2[2], selector);
      w5[3] = hc_byte_perm_S (w2[0], w2[1], selector);
      w5[2] = hc_byte_perm_S (w1[3], w2[0], selector);
      w5[1] = hc_byte_perm_S (w1[2], w1[3], selector);
      w5[0] = hc_byte_perm_S (w1[1], w1[2], selector);
      w4[3] = hc_byte_perm_S (w1[0], w1[1], selector);
      w4[2] = hc_byte_perm_S (w0[3], w1[0], selector);
      w4[1] = hc_byte_perm_S (w0[2], w0[3], selector);
      w4[0] = hc_byte_perm_S (w0[1], w0[2], selector);
      w3[3] = hc_byte_perm_S (w0[0], w0[1], selector);
      w3[2] = hc_byte_perm_S (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 15:
      w7[3] = hc_byte_perm_S (w3[3], w4[0], selector);
      w7[2] = hc_byte_perm_S (w3[2], w3[3], selector);
      w7[1] = hc_byte_perm_S (w3[1], w3[2], selector);
      w7[0] = hc_byte_perm_S (w3[0], w3[1], selector);
      w6[3] = hc_byte_perm_S (w2[3], w3[0], selector);
      w6[2] = hc_byte_perm_S (w2[2], w2[3], selector);
      w6[1] = hc_byte_perm_S (w2[1], w2[2], selector);
      w6[0] = hc_byte_perm_S (w2[0], w2[1], selector);
      w5[3] = hc_byte_perm_S (w1[3], w2[0], selector);
      w5[2] = hc_byte_perm_S (w1[2], w1[3], selector);
      w5[1] = hc_byte_perm_S (w1[1], w1[2], selector);
      w5[0] = hc_byte_perm_S (w1[0], w1[1], selector);
      w4[3] = hc_byte_perm_S (w0[3], w1[0], selector);
      w4[2] = hc_byte_perm_S (w0[2], w0[3], selector);
      w4[1] = hc_byte_perm_S (w0[1], w0[2], selector);
      w4[0] = hc_byte_perm_S (w0[0], w0[1], selector);
      w3[3] = hc_byte_perm_S (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_bytealign_S (w7[2], w7[3], offset);
      w7[2] = hc_bytealign_S (w7[1], w7[2], offset);
      w7[1] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[0] = hc_bytealign_S (w6[3], w7[0], offset);
      w6[3] = hc_bytealign_S (w6[2], w6[3], offset);
      w6[2] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[1] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[0] = hc_bytealign_S (w5[3], w6[0], offset);
      w5[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w5[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w4[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w4[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w3[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_S (    0, w0[0], offset);

      break;

    case  1:
      w7[3] = hc_bytealign_S (w7[1], w7[2], offset);
      w7[2] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[1] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[0] = hc_bytealign_S (w6[2], w6[3], offset);
      w6[3] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[2] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[1] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[0] = hc_bytealign_S (w5[2], w5[3], offset);
      w5[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w4[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_S (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[2] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[1] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[0] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[3] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[2] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[1] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[0] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_S (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[2] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[1] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[0] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[3] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[2] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[1] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[0] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_S (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[2] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[1] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[0] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_S (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[2] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[1] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[0] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_S (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[2] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[1] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[0] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_S (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[2] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[1] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[0] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_S (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_S (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_S (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_S (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_S (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_S (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_S (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_S (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_S (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[0] = hc_bytealign_S (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[1] = hc_bytealign_S (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[2] = hc_bytealign_S (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[3] = hc_bytealign_S (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[0] = hc_bytealign_S (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[1] = hc_bytealign_S (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[2] = hc_bytealign_S (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[3] = hc_bytealign_S (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[0] = hc_bytealign_S (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[1] = hc_bytealign_S (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[2] = hc_bytealign_S (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[3] = hc_bytealign_S (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[0] = hc_bytealign_S (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[1] = hc_bytealign_S (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[2] = hc_bytealign_S (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_bytealign_S (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      w7[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      w7[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      w7[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      w6[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      w6[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      w6[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      w5[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      w5[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      w5[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      w4[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w4[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w4[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w3[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm_S (w0[0],     0, selector);

      break;

    case  1:
      w7[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      w7[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      w7[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      w7[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      w6[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      w6[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      w5[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      w5[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      w4[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w4[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm_S (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      w7[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      w7[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      w7[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      w6[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      w5[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      w4[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm_S (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      w7[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      w7[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      w7[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm_S (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      w7[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      w7[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      w7[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm_S (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      w7[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      w7[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      w7[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm_S (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      w7[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      w7[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      w7[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm_S (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      w7[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      w7[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      w7[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm_S (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      w7[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      w7[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      w7[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm_S (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      w7[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      w7[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      w7[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm_S (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      w7[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      w7[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      w7[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm_S (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      w7[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      w7[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      w7[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm_S (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w7[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w7[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w7[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm_S (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w7[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w7[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w7[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm_S (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w7[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w7[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w7[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm_S (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w7[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w7[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w7[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm_S (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w7[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w7[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w7[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[0] = hc_byte_perm_S (w0[0],     0, selector);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w7[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w7[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w7[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[1] = hc_byte_perm_S (w0[0],     0, selector);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w7[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w7[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w7[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[2] = hc_byte_perm_S (w0[0],     0, selector);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w7[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w7[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w7[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[3] = hc_byte_perm_S (w0[0],     0, selector);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w7[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w7[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w7[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[0] = hc_byte_perm_S (w0[0],     0, selector);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w7[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w7[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w7[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[1] = hc_byte_perm_S (w0[0],     0, selector);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w7[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w7[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w7[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[2] = hc_byte_perm_S (w0[0],     0, selector);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w7[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w7[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w7[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[3] = hc_byte_perm_S (w0[0],     0, selector);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w7[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w7[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w7[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[0] = hc_byte_perm_S (w0[0],     0, selector);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w7[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w7[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w7[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[1] = hc_byte_perm_S (w0[0],     0, selector);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w7[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w7[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w7[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[2] = hc_byte_perm_S (w0[0],     0, selector);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w7[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w7[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w7[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[3] = hc_byte_perm_S (w0[0],     0, selector);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w7[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w7[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w7[0] = hc_byte_perm_S (w0[0],     0, selector);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w7[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w7[1] = hc_byte_perm_S (w0[0],     0, selector);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w7[2] = hc_byte_perm_S (w0[0],     0, selector);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_byte_perm_S (w0[0],     0, selector);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, u32 *c0, u32 *c1, u32 *c2, u32 *c3, u32 *c4, u32 *c5, u32 *c6, u32 *c7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign_S (w7[3],     0, offset);
      w7[3] = hc_bytealign_S (w7[2], w7[3], offset);
      w7[2] = hc_bytealign_S (w7[1], w7[2], offset);
      w7[1] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[0] = hc_bytealign_S (w6[3], w7[0], offset);
      w6[3] = hc_bytealign_S (w6[2], w6[3], offset);
      w6[2] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[1] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[0] = hc_bytealign_S (w5[3], w6[0], offset);
      w5[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w5[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w4[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w4[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w3[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_S (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign_S (w7[3],     0, offset);
      c0[0] = hc_bytealign_S (w7[2], w7[3], offset);
      w7[3] = hc_bytealign_S (w7[1], w7[2], offset);
      w7[2] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[1] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[0] = hc_bytealign_S (w6[2], w6[3], offset);
      w6[3] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[2] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[1] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[0] = hc_bytealign_S (w5[2], w5[3], offset);
      w5[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w4[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_S (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign_S (w7[3],     0, offset);
      c0[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c0[0] = hc_bytealign_S (w7[1], w7[2], offset);
      w7[3] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[2] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[1] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[0] = hc_bytealign_S (w6[1], w6[2], offset);
      w6[3] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[2] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[1] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[0] = hc_bytealign_S (w5[1], w5[2], offset);
      w5[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w4[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_S (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign_S (w7[3],     0, offset);
      c0[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c0[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c0[0] = hc_bytealign_S (w7[0], w7[1], offset);
      w7[3] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[2] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[1] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[0] = hc_bytealign_S (w6[0], w6[1], offset);
      w6[3] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[2] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[1] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[0] = hc_bytealign_S (w5[0], w5[1], offset);
      w5[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w4[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_S (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign_S (w7[3],     0, offset);
      c0[3] = hc_bytealign_S (w7[2], w7[3], offset);
      c0[2] = hc_bytealign_S (w7[1], w7[2], offset);
      c0[1] = hc_bytealign_S (w7[0], w7[1], offset);
      c0[0] = hc_bytealign_S (w6[3], w7[0], offset);
      w7[3] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[2] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[1] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[0] = hc_bytealign_S (w5[3], w6[0], offset);
      w6[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w5[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w4[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_S (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign_S (w7[3],     0, offset);
      c1[0] = hc_bytealign_S (w7[2], w7[3], offset);
      c0[3] = hc_bytealign_S (w7[1], w7[2], offset);
      c0[2] = hc_bytealign_S (w7[0], w7[1], offset);
      c0[1] = hc_bytealign_S (w6[3], w7[0], offset);
      c0[0] = hc_bytealign_S (w6[2], w6[3], offset);
      w7[3] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[2] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[1] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[0] = hc_bytealign_S (w5[2], w5[3], offset);
      w6[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w5[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w4[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_S (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign_S (w7[3],     0, offset);
      c1[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c1[0] = hc_bytealign_S (w7[1], w7[2], offset);
      c0[3] = hc_bytealign_S (w7[0], w7[1], offset);
      c0[2] = hc_bytealign_S (w6[3], w7[0], offset);
      c0[1] = hc_bytealign_S (w6[2], w6[3], offset);
      c0[0] = hc_bytealign_S (w6[1], w6[2], offset);
      w7[3] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[2] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[1] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[0] = hc_bytealign_S (w5[1], w5[2], offset);
      w6[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w5[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w4[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_S (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign_S (w7[3],     0, offset);
      c1[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c1[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c1[0] = hc_bytealign_S (w7[0], w7[1], offset);
      c0[3] = hc_bytealign_S (w6[3], w7[0], offset);
      c0[2] = hc_bytealign_S (w6[2], w6[3], offset);
      c0[1] = hc_bytealign_S (w6[1], w6[2], offset);
      c0[0] = hc_bytealign_S (w6[0], w6[1], offset);
      w7[3] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[2] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[1] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[0] = hc_bytealign_S (w5[0], w5[1], offset);
      w6[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w5[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w4[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_S (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign_S (w7[3],     0, offset);
      c1[3] = hc_bytealign_S (w7[2], w7[3], offset);
      c1[2] = hc_bytealign_S (w7[1], w7[2], offset);
      c1[1] = hc_bytealign_S (w7[0], w7[1], offset);
      c1[0] = hc_bytealign_S (w6[3], w7[0], offset);
      c0[3] = hc_bytealign_S (w6[2], w6[3], offset);
      c0[2] = hc_bytealign_S (w6[1], w6[2], offset);
      c0[1] = hc_bytealign_S (w6[0], w6[1], offset);
      c0[0] = hc_bytealign_S (w5[3], w6[0], offset);
      w7[3] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[2] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[1] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w6[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w5[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w4[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_S (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign_S (w7[3],     0, offset);
      c2[0] = hc_bytealign_S (w7[2], w7[3], offset);
      c1[3] = hc_bytealign_S (w7[1], w7[2], offset);
      c1[2] = hc_bytealign_S (w7[0], w7[1], offset);
      c1[1] = hc_bytealign_S (w6[3], w7[0], offset);
      c1[0] = hc_bytealign_S (w6[2], w6[3], offset);
      c0[3] = hc_bytealign_S (w6[1], w6[2], offset);
      c0[2] = hc_bytealign_S (w6[0], w6[1], offset);
      c0[1] = hc_bytealign_S (w5[3], w6[0], offset);
      c0[0] = hc_bytealign_S (w5[2], w5[3], offset);
      w7[3] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[2] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[1] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w6[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w5[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w4[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_S (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign_S (w7[3],     0, offset);
      c2[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c2[0] = hc_bytealign_S (w7[1], w7[2], offset);
      c1[3] = hc_bytealign_S (w7[0], w7[1], offset);
      c1[2] = hc_bytealign_S (w6[3], w7[0], offset);
      c1[1] = hc_bytealign_S (w6[2], w6[3], offset);
      c1[0] = hc_bytealign_S (w6[1], w6[2], offset);
      c0[3] = hc_bytealign_S (w6[0], w6[1], offset);
      c0[2] = hc_bytealign_S (w5[3], w6[0], offset);
      c0[1] = hc_bytealign_S (w5[2], w5[3], offset);
      c0[0] = hc_bytealign_S (w5[1], w5[2], offset);
      w7[3] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[2] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[1] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w6[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w5[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w4[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_S (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign_S (w7[3],     0, offset);
      c2[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c2[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c2[0] = hc_bytealign_S (w7[0], w7[1], offset);
      c1[3] = hc_bytealign_S (w6[3], w7[0], offset);
      c1[2] = hc_bytealign_S (w6[2], w6[3], offset);
      c1[1] = hc_bytealign_S (w6[1], w6[2], offset);
      c1[0] = hc_bytealign_S (w6[0], w6[1], offset);
      c0[3] = hc_bytealign_S (w5[3], w6[0], offset);
      c0[2] = hc_bytealign_S (w5[2], w5[3], offset);
      c0[1] = hc_bytealign_S (w5[1], w5[2], offset);
      c0[0] = hc_bytealign_S (w5[0], w5[1], offset);
      w7[3] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[2] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[1] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w6[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w5[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w4[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_S (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign_S (w7[3],     0, offset);
      c2[3] = hc_bytealign_S (w7[2], w7[3], offset);
      c2[2] = hc_bytealign_S (w7[1], w7[2], offset);
      c2[1] = hc_bytealign_S (w7[0], w7[1], offset);
      c2[0] = hc_bytealign_S (w6[3], w7[0], offset);
      c1[3] = hc_bytealign_S (w6[2], w6[3], offset);
      c1[2] = hc_bytealign_S (w6[1], w6[2], offset);
      c1[1] = hc_bytealign_S (w6[0], w6[1], offset);
      c1[0] = hc_bytealign_S (w5[3], w6[0], offset);
      c0[3] = hc_bytealign_S (w5[2], w5[3], offset);
      c0[2] = hc_bytealign_S (w5[1], w5[2], offset);
      c0[1] = hc_bytealign_S (w5[0], w5[1], offset);
      c0[0] = hc_bytealign_S (w4[3], w5[0], offset);
      w7[3] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[2] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[1] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w6[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w5[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w4[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_S (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign_S (w7[3],     0, offset);
      c3[0] = hc_bytealign_S (w7[2], w7[3], offset);
      c2[3] = hc_bytealign_S (w7[1], w7[2], offset);
      c2[2] = hc_bytealign_S (w7[0], w7[1], offset);
      c2[1] = hc_bytealign_S (w6[3], w7[0], offset);
      c2[0] = hc_bytealign_S (w6[2], w6[3], offset);
      c1[3] = hc_bytealign_S (w6[1], w6[2], offset);
      c1[2] = hc_bytealign_S (w6[0], w6[1], offset);
      c1[1] = hc_bytealign_S (w5[3], w6[0], offset);
      c1[0] = hc_bytealign_S (w5[2], w5[3], offset);
      c0[3] = hc_bytealign_S (w5[1], w5[2], offset);
      c0[2] = hc_bytealign_S (w5[0], w5[1], offset);
      c0[1] = hc_bytealign_S (w4[3], w5[0], offset);
      c0[0] = hc_bytealign_S (w4[2], w4[3], offset);
      w7[3] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[2] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[1] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w6[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w5[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w4[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_S (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign_S (w7[3],     0, offset);
      c3[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c3[0] = hc_bytealign_S (w7[1], w7[2], offset);
      c2[3] = hc_bytealign_S (w7[0], w7[1], offset);
      c2[2] = hc_bytealign_S (w6[3], w7[0], offset);
      c2[1] = hc_bytealign_S (w6[2], w6[3], offset);
      c2[0] = hc_bytealign_S (w6[1], w6[2], offset);
      c1[3] = hc_bytealign_S (w6[0], w6[1], offset);
      c1[2] = hc_bytealign_S (w5[3], w6[0], offset);
      c1[1] = hc_bytealign_S (w5[2], w5[3], offset);
      c1[0] = hc_bytealign_S (w5[1], w5[2], offset);
      c0[3] = hc_bytealign_S (w5[0], w5[1], offset);
      c0[2] = hc_bytealign_S (w4[3], w5[0], offset);
      c0[1] = hc_bytealign_S (w4[2], w4[3], offset);
      c0[0] = hc_bytealign_S (w4[1], w4[2], offset);
      w7[3] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[2] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[1] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w6[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w5[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w4[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_S (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign_S (w7[3],     0, offset);
      c3[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c3[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c3[0] = hc_bytealign_S (w7[0], w7[1], offset);
      c2[3] = hc_bytealign_S (w6[3], w7[0], offset);
      c2[2] = hc_bytealign_S (w6[2], w6[3], offset);
      c2[1] = hc_bytealign_S (w6[1], w6[2], offset);
      c2[0] = hc_bytealign_S (w6[0], w6[1], offset);
      c1[3] = hc_bytealign_S (w5[3], w6[0], offset);
      c1[2] = hc_bytealign_S (w5[2], w5[3], offset);
      c1[1] = hc_bytealign_S (w5[1], w5[2], offset);
      c1[0] = hc_bytealign_S (w5[0], w5[1], offset);
      c0[3] = hc_bytealign_S (w4[3], w5[0], offset);
      c0[2] = hc_bytealign_S (w4[2], w4[3], offset);
      c0[1] = hc_bytealign_S (w4[1], w4[2], offset);
      c0[0] = hc_bytealign_S (w4[0], w4[1], offset);
      w7[3] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[2] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[1] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w6[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w5[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w4[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_S (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_bytealign_S (w7[3],     0, offset);
      c3[3] = hc_bytealign_S (w7[2], w7[3], offset);
      c3[2] = hc_bytealign_S (w7[1], w7[2], offset);
      c3[1] = hc_bytealign_S (w7[0], w7[1], offset);
      c3[0] = hc_bytealign_S (w6[3], w7[0], offset);
      c2[3] = hc_bytealign_S (w6[2], w6[3], offset);
      c2[2] = hc_bytealign_S (w6[1], w6[2], offset);
      c2[1] = hc_bytealign_S (w6[0], w6[1], offset);
      c2[0] = hc_bytealign_S (w5[3], w6[0], offset);
      c1[3] = hc_bytealign_S (w5[2], w5[3], offset);
      c1[2] = hc_bytealign_S (w5[1], w5[2], offset);
      c1[1] = hc_bytealign_S (w5[0], w5[1], offset);
      c1[0] = hc_bytealign_S (w4[3], w5[0], offset);
      c0[3] = hc_bytealign_S (w4[2], w4[3], offset);
      c0[2] = hc_bytealign_S (w4[1], w4[2], offset);
      c0[1] = hc_bytealign_S (w4[0], w4[1], offset);
      c0[0] = hc_bytealign_S (w3[3], w4[0], offset);
      w7[3] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[2] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[1] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w6[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w5[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w4[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[0] = hc_bytealign_S (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_bytealign_S (w7[3],     0, offset);
      c4[0] = hc_bytealign_S (w7[2], w7[3], offset);
      c3[3] = hc_bytealign_S (w7[1], w7[2], offset);
      c3[2] = hc_bytealign_S (w7[0], w7[1], offset);
      c3[1] = hc_bytealign_S (w6[3], w7[0], offset);
      c3[0] = hc_bytealign_S (w6[2], w6[3], offset);
      c2[3] = hc_bytealign_S (w6[1], w6[2], offset);
      c2[2] = hc_bytealign_S (w6[0], w6[1], offset);
      c2[1] = hc_bytealign_S (w5[3], w6[0], offset);
      c2[0] = hc_bytealign_S (w5[2], w5[3], offset);
      c1[3] = hc_bytealign_S (w5[1], w5[2], offset);
      c1[2] = hc_bytealign_S (w5[0], w5[1], offset);
      c1[1] = hc_bytealign_S (w4[3], w5[0], offset);
      c1[0] = hc_bytealign_S (w4[2], w4[3], offset);
      c0[3] = hc_bytealign_S (w4[1], w4[2], offset);
      c0[2] = hc_bytealign_S (w4[0], w4[1], offset);
      c0[1] = hc_bytealign_S (w3[3], w4[0], offset);
      c0[0] = hc_bytealign_S (w3[2], w3[3], offset);
      w7[3] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[2] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[1] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w6[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w5[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w4[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[1] = hc_bytealign_S (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_bytealign_S (w7[3],     0, offset);
      c4[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c4[0] = hc_bytealign_S (w7[1], w7[2], offset);
      c3[3] = hc_bytealign_S (w7[0], w7[1], offset);
      c3[2] = hc_bytealign_S (w6[3], w7[0], offset);
      c3[1] = hc_bytealign_S (w6[2], w6[3], offset);
      c3[0] = hc_bytealign_S (w6[1], w6[2], offset);
      c2[3] = hc_bytealign_S (w6[0], w6[1], offset);
      c2[2] = hc_bytealign_S (w5[3], w6[0], offset);
      c2[1] = hc_bytealign_S (w5[2], w5[3], offset);
      c2[0] = hc_bytealign_S (w5[1], w5[2], offset);
      c1[3] = hc_bytealign_S (w5[0], w5[1], offset);
      c1[2] = hc_bytealign_S (w4[3], w5[0], offset);
      c1[1] = hc_bytealign_S (w4[2], w4[3], offset);
      c1[0] = hc_bytealign_S (w4[1], w4[2], offset);
      c0[3] = hc_bytealign_S (w4[0], w4[1], offset);
      c0[2] = hc_bytealign_S (w3[3], w4[0], offset);
      c0[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[0] = hc_bytealign_S (w3[1], w3[2], offset);
      w7[3] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[2] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[1] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w6[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w5[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w4[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[2] = hc_bytealign_S (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_bytealign_S (w7[3],     0, offset);
      c4[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c4[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c4[0] = hc_bytealign_S (w7[0], w7[1], offset);
      c3[3] = hc_bytealign_S (w6[3], w7[0], offset);
      c3[2] = hc_bytealign_S (w6[2], w6[3], offset);
      c3[1] = hc_bytealign_S (w6[1], w6[2], offset);
      c3[0] = hc_bytealign_S (w6[0], w6[1], offset);
      c2[3] = hc_bytealign_S (w5[3], w6[0], offset);
      c2[2] = hc_bytealign_S (w5[2], w5[3], offset);
      c2[1] = hc_bytealign_S (w5[1], w5[2], offset);
      c2[0] = hc_bytealign_S (w5[0], w5[1], offset);
      c1[3] = hc_bytealign_S (w4[3], w5[0], offset);
      c1[2] = hc_bytealign_S (w4[2], w4[3], offset);
      c1[1] = hc_bytealign_S (w4[1], w4[2], offset);
      c1[0] = hc_bytealign_S (w4[0], w4[1], offset);
      c0[3] = hc_bytealign_S (w3[3], w4[0], offset);
      c0[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[0] = hc_bytealign_S (w3[0], w3[1], offset);
      w7[3] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[2] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[1] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w6[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w5[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w4[3] = hc_bytealign_S (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_bytealign_S (w7[3],     0, offset);
      c4[3] = hc_bytealign_S (w7[2], w7[3], offset);
      c4[2] = hc_bytealign_S (w7[1], w7[2], offset);
      c4[1] = hc_bytealign_S (w7[0], w7[1], offset);
      c4[0] = hc_bytealign_S (w6[3], w7[0], offset);
      c3[3] = hc_bytealign_S (w6[2], w6[3], offset);
      c3[2] = hc_bytealign_S (w6[1], w6[2], offset);
      c3[1] = hc_bytealign_S (w6[0], w6[1], offset);
      c3[0] = hc_bytealign_S (w5[3], w6[0], offset);
      c2[3] = hc_bytealign_S (w5[2], w5[3], offset);
      c2[2] = hc_bytealign_S (w5[1], w5[2], offset);
      c2[1] = hc_bytealign_S (w5[0], w5[1], offset);
      c2[0] = hc_bytealign_S (w4[3], w5[0], offset);
      c1[3] = hc_bytealign_S (w4[2], w4[3], offset);
      c1[2] = hc_bytealign_S (w4[1], w4[2], offset);
      c1[1] = hc_bytealign_S (w4[0], w4[1], offset);
      c1[0] = hc_bytealign_S (w3[3], w4[0], offset);
      c0[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[0] = hc_bytealign_S (w2[3], w3[0], offset);
      w7[3] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[2] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[1] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w6[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w5[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[0] = hc_bytealign_S (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_bytealign_S (w7[3],     0, offset);
      c5[0] = hc_bytealign_S (w7[2], w7[3], offset);
      c4[3] = hc_bytealign_S (w7[1], w7[2], offset);
      c4[2] = hc_bytealign_S (w7[0], w7[1], offset);
      c4[1] = hc_bytealign_S (w6[3], w7[0], offset);
      c4[0] = hc_bytealign_S (w6[2], w6[3], offset);
      c3[3] = hc_bytealign_S (w6[1], w6[2], offset);
      c3[2] = hc_bytealign_S (w6[0], w6[1], offset);
      c3[1] = hc_bytealign_S (w5[3], w6[0], offset);
      c3[0] = hc_bytealign_S (w5[2], w5[3], offset);
      c2[3] = hc_bytealign_S (w5[1], w5[2], offset);
      c2[2] = hc_bytealign_S (w5[0], w5[1], offset);
      c2[1] = hc_bytealign_S (w4[3], w5[0], offset);
      c2[0] = hc_bytealign_S (w4[2], w4[3], offset);
      c1[3] = hc_bytealign_S (w4[1], w4[2], offset);
      c1[2] = hc_bytealign_S (w4[0], w4[1], offset);
      c1[1] = hc_bytealign_S (w3[3], w4[0], offset);
      c1[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c0[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[0] = hc_bytealign_S (w2[2], w2[3], offset);
      w7[3] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[2] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[1] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w6[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w5[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[1] = hc_bytealign_S (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_bytealign_S (w7[3],     0, offset);
      c5[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c5[0] = hc_bytealign_S (w7[1], w7[2], offset);
      c4[3] = hc_bytealign_S (w7[0], w7[1], offset);
      c4[2] = hc_bytealign_S (w6[3], w7[0], offset);
      c4[1] = hc_bytealign_S (w6[2], w6[3], offset);
      c4[0] = hc_bytealign_S (w6[1], w6[2], offset);
      c3[3] = hc_bytealign_S (w6[0], w6[1], offset);
      c3[2] = hc_bytealign_S (w5[3], w6[0], offset);
      c3[1] = hc_bytealign_S (w5[2], w5[3], offset);
      c3[0] = hc_bytealign_S (w5[1], w5[2], offset);
      c2[3] = hc_bytealign_S (w5[0], w5[1], offset);
      c2[2] = hc_bytealign_S (w4[3], w5[0], offset);
      c2[1] = hc_bytealign_S (w4[2], w4[3], offset);
      c2[0] = hc_bytealign_S (w4[1], w4[2], offset);
      c1[3] = hc_bytealign_S (w4[0], w4[1], offset);
      c1[2] = hc_bytealign_S (w3[3], w4[0], offset);
      c1[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c0[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[0] = hc_bytealign_S (w2[1], w2[2], offset);
      w7[3] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[2] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[1] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w6[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w5[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[2] = hc_bytealign_S (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_bytealign_S (w7[3],     0, offset);
      c5[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c5[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c5[0] = hc_bytealign_S (w7[0], w7[1], offset);
      c4[3] = hc_bytealign_S (w6[3], w7[0], offset);
      c4[2] = hc_bytealign_S (w6[2], w6[3], offset);
      c4[1] = hc_bytealign_S (w6[1], w6[2], offset);
      c4[0] = hc_bytealign_S (w6[0], w6[1], offset);
      c3[3] = hc_bytealign_S (w5[3], w6[0], offset);
      c3[2] = hc_bytealign_S (w5[2], w5[3], offset);
      c3[1] = hc_bytealign_S (w5[1], w5[2], offset);
      c3[0] = hc_bytealign_S (w5[0], w5[1], offset);
      c2[3] = hc_bytealign_S (w4[3], w5[0], offset);
      c2[2] = hc_bytealign_S (w4[2], w4[3], offset);
      c2[1] = hc_bytealign_S (w4[1], w4[2], offset);
      c2[0] = hc_bytealign_S (w4[0], w4[1], offset);
      c1[3] = hc_bytealign_S (w3[3], w4[0], offset);
      c1[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c0[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[0] = hc_bytealign_S (w2[0], w2[1], offset);
      w7[3] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[2] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[1] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w6[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w5[3] = hc_bytealign_S (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_bytealign_S (w7[3],     0, offset);
      c5[3] = hc_bytealign_S (w7[2], w7[3], offset);
      c5[2] = hc_bytealign_S (w7[1], w7[2], offset);
      c5[1] = hc_bytealign_S (w7[0], w7[1], offset);
      c5[0] = hc_bytealign_S (w6[3], w7[0], offset);
      c4[3] = hc_bytealign_S (w6[2], w6[3], offset);
      c4[2] = hc_bytealign_S (w6[1], w6[2], offset);
      c4[1] = hc_bytealign_S (w6[0], w6[1], offset);
      c4[0] = hc_bytealign_S (w5[3], w6[0], offset);
      c3[3] = hc_bytealign_S (w5[2], w5[3], offset);
      c3[2] = hc_bytealign_S (w5[1], w5[2], offset);
      c3[1] = hc_bytealign_S (w5[0], w5[1], offset);
      c3[0] = hc_bytealign_S (w4[3], w5[0], offset);
      c2[3] = hc_bytealign_S (w4[2], w4[3], offset);
      c2[2] = hc_bytealign_S (w4[1], w4[2], offset);
      c2[1] = hc_bytealign_S (w4[0], w4[1], offset);
      c2[0] = hc_bytealign_S (w3[3], w4[0], offset);
      c1[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[0] = hc_bytealign_S (w2[3], w3[0], offset);
      c0[3] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[2] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[1] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[0] = hc_bytealign_S (w1[3], w2[0], offset);
      w7[3] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[2] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[1] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w6[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[0] = hc_bytealign_S (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_bytealign_S (w7[3],     0, offset);
      c6[0] = hc_bytealign_S (w7[2], w7[3], offset);
      c5[3] = hc_bytealign_S (w7[1], w7[2], offset);
      c5[2] = hc_bytealign_S (w7[0], w7[1], offset);
      c5[1] = hc_bytealign_S (w6[3], w7[0], offset);
      c5[0] = hc_bytealign_S (w6[2], w6[3], offset);
      c4[3] = hc_bytealign_S (w6[1], w6[2], offset);
      c4[2] = hc_bytealign_S (w6[0], w6[1], offset);
      c4[1] = hc_bytealign_S (w5[3], w6[0], offset);
      c4[0] = hc_bytealign_S (w5[2], w5[3], offset);
      c3[3] = hc_bytealign_S (w5[1], w5[2], offset);
      c3[2] = hc_bytealign_S (w5[0], w5[1], offset);
      c3[1] = hc_bytealign_S (w4[3], w5[0], offset);
      c3[0] = hc_bytealign_S (w4[2], w4[3], offset);
      c2[3] = hc_bytealign_S (w4[1], w4[2], offset);
      c2[2] = hc_bytealign_S (w4[0], w4[1], offset);
      c2[1] = hc_bytealign_S (w3[3], w4[0], offset);
      c2[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c1[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[0] = hc_bytealign_S (w2[2], w2[3], offset);
      c0[3] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[2] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[1] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[0] = hc_bytealign_S (w1[2], w1[3], offset);
      w7[3] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[2] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[1] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w6[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[1] = hc_bytealign_S (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_bytealign_S (w7[3],     0, offset);
      c6[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c6[0] = hc_bytealign_S (w7[1], w7[2], offset);
      c5[3] = hc_bytealign_S (w7[0], w7[1], offset);
      c5[2] = hc_bytealign_S (w6[3], w7[0], offset);
      c5[1] = hc_bytealign_S (w6[2], w6[3], offset);
      c5[0] = hc_bytealign_S (w6[1], w6[2], offset);
      c4[3] = hc_bytealign_S (w6[0], w6[1], offset);
      c4[2] = hc_bytealign_S (w5[3], w6[0], offset);
      c4[1] = hc_bytealign_S (w5[2], w5[3], offset);
      c4[0] = hc_bytealign_S (w5[1], w5[2], offset);
      c3[3] = hc_bytealign_S (w5[0], w5[1], offset);
      c3[2] = hc_bytealign_S (w4[3], w5[0], offset);
      c3[1] = hc_bytealign_S (w4[2], w4[3], offset);
      c3[0] = hc_bytealign_S (w4[1], w4[2], offset);
      c2[3] = hc_bytealign_S (w4[0], w4[1], offset);
      c2[2] = hc_bytealign_S (w3[3], w4[0], offset);
      c2[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c1[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[0] = hc_bytealign_S (w2[1], w2[2], offset);
      c0[3] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[2] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[1] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[0] = hc_bytealign_S (w1[1], w1[2], offset);
      w7[3] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[2] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[1] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w6[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[2] = hc_bytealign_S (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_bytealign_S (w7[3],     0, offset);
      c6[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c6[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c6[0] = hc_bytealign_S (w7[0], w7[1], offset);
      c5[3] = hc_bytealign_S (w6[3], w7[0], offset);
      c5[2] = hc_bytealign_S (w6[2], w6[3], offset);
      c5[1] = hc_bytealign_S (w6[1], w6[2], offset);
      c5[0] = hc_bytealign_S (w6[0], w6[1], offset);
      c4[3] = hc_bytealign_S (w5[3], w6[0], offset);
      c4[2] = hc_bytealign_S (w5[2], w5[3], offset);
      c4[1] = hc_bytealign_S (w5[1], w5[2], offset);
      c4[0] = hc_bytealign_S (w5[0], w5[1], offset);
      c3[3] = hc_bytealign_S (w4[3], w5[0], offset);
      c3[2] = hc_bytealign_S (w4[2], w4[3], offset);
      c3[1] = hc_bytealign_S (w4[1], w4[2], offset);
      c3[0] = hc_bytealign_S (w4[0], w4[1], offset);
      c2[3] = hc_bytealign_S (w3[3], w4[0], offset);
      c2[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c1[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[0] = hc_bytealign_S (w2[0], w2[1], offset);
      c0[3] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[2] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[1] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[0] = hc_bytealign_S (w1[0], w1[1], offset);
      w7[3] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[2] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[1] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w6[3] = hc_bytealign_S (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_bytealign_S (w7[3],     0, offset);
      c6[3] = hc_bytealign_S (w7[2], w7[3], offset);
      c6[2] = hc_bytealign_S (w7[1], w7[2], offset);
      c6[1] = hc_bytealign_S (w7[0], w7[1], offset);
      c6[0] = hc_bytealign_S (w6[3], w7[0], offset);
      c5[3] = hc_bytealign_S (w6[2], w6[3], offset);
      c5[2] = hc_bytealign_S (w6[1], w6[2], offset);
      c5[1] = hc_bytealign_S (w6[0], w6[1], offset);
      c5[0] = hc_bytealign_S (w5[3], w6[0], offset);
      c4[3] = hc_bytealign_S (w5[2], w5[3], offset);
      c4[2] = hc_bytealign_S (w5[1], w5[2], offset);
      c4[1] = hc_bytealign_S (w5[0], w5[1], offset);
      c4[0] = hc_bytealign_S (w4[3], w5[0], offset);
      c3[3] = hc_bytealign_S (w4[2], w4[3], offset);
      c3[2] = hc_bytealign_S (w4[1], w4[2], offset);
      c3[1] = hc_bytealign_S (w4[0], w4[1], offset);
      c3[0] = hc_bytealign_S (w3[3], w4[0], offset);
      c2[3] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[2] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[1] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[0] = hc_bytealign_S (w2[3], w3[0], offset);
      c1[3] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[2] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[1] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[0] = hc_bytealign_S (w1[3], w2[0], offset);
      c0[3] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[2] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[1] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[0] = hc_bytealign_S (w0[3], w1[0], offset);
      w7[3] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[2] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[1] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[0] = hc_bytealign_S (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_bytealign_S (w7[3],     0, offset);
      c7[0] = hc_bytealign_S (w7[2], w7[3], offset);
      c6[3] = hc_bytealign_S (w7[1], w7[2], offset);
      c6[2] = hc_bytealign_S (w7[0], w7[1], offset);
      c6[1] = hc_bytealign_S (w6[3], w7[0], offset);
      c6[0] = hc_bytealign_S (w6[2], w6[3], offset);
      c5[3] = hc_bytealign_S (w6[1], w6[2], offset);
      c5[2] = hc_bytealign_S (w6[0], w6[1], offset);
      c5[1] = hc_bytealign_S (w5[3], w6[0], offset);
      c5[0] = hc_bytealign_S (w5[2], w5[3], offset);
      c4[3] = hc_bytealign_S (w5[1], w5[2], offset);
      c4[2] = hc_bytealign_S (w5[0], w5[1], offset);
      c4[1] = hc_bytealign_S (w4[3], w5[0], offset);
      c4[0] = hc_bytealign_S (w4[2], w4[3], offset);
      c3[3] = hc_bytealign_S (w4[1], w4[2], offset);
      c3[2] = hc_bytealign_S (w4[0], w4[1], offset);
      c3[1] = hc_bytealign_S (w3[3], w4[0], offset);
      c3[0] = hc_bytealign_S (w3[2], w3[3], offset);
      c2[3] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[2] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[1] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[0] = hc_bytealign_S (w2[2], w2[3], offset);
      c1[3] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[2] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[1] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[0] = hc_bytealign_S (w1[2], w1[3], offset);
      c0[3] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[2] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[1] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[0] = hc_bytealign_S (w0[2], w0[3], offset);
      w7[3] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[2] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[1] = hc_bytealign_S (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_bytealign_S (w7[3],     0, offset);
      c7[1] = hc_bytealign_S (w7[2], w7[3], offset);
      c7[0] = hc_bytealign_S (w7[1], w7[2], offset);
      c6[3] = hc_bytealign_S (w7[0], w7[1], offset);
      c6[2] = hc_bytealign_S (w6[3], w7[0], offset);
      c6[1] = hc_bytealign_S (w6[2], w6[3], offset);
      c6[0] = hc_bytealign_S (w6[1], w6[2], offset);
      c5[3] = hc_bytealign_S (w6[0], w6[1], offset);
      c5[2] = hc_bytealign_S (w5[3], w6[0], offset);
      c5[1] = hc_bytealign_S (w5[2], w5[3], offset);
      c5[0] = hc_bytealign_S (w5[1], w5[2], offset);
      c4[3] = hc_bytealign_S (w5[0], w5[1], offset);
      c4[2] = hc_bytealign_S (w4[3], w5[0], offset);
      c4[1] = hc_bytealign_S (w4[2], w4[3], offset);
      c4[0] = hc_bytealign_S (w4[1], w4[2], offset);
      c3[3] = hc_bytealign_S (w4[0], w4[1], offset);
      c3[2] = hc_bytealign_S (w3[3], w4[0], offset);
      c3[1] = hc_bytealign_S (w3[2], w3[3], offset);
      c3[0] = hc_bytealign_S (w3[1], w3[2], offset);
      c2[3] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[2] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[1] = hc_bytealign_S (w2[2], w2[3], offset);
      c2[0] = hc_bytealign_S (w2[1], w2[2], offset);
      c1[3] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[2] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[1] = hc_bytealign_S (w1[2], w1[3], offset);
      c1[0] = hc_bytealign_S (w1[1], w1[2], offset);
      c0[3] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[2] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[1] = hc_bytealign_S (w0[2], w0[3], offset);
      c0[0] = hc_bytealign_S (w0[1], w0[2], offset);
      w7[3] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[2] = hc_bytealign_S (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_bytealign_S (w7[3],     0, offset);
      c7[2] = hc_bytealign_S (w7[2], w7[3], offset);
      c7[1] = hc_bytealign_S (w7[1], w7[2], offset);
      c7[0] = hc_bytealign_S (w7[0], w7[1], offset);
      c6[3] = hc_bytealign_S (w6[3], w7[0], offset);
      c6[2] = hc_bytealign_S (w6[2], w6[3], offset);
      c6[1] = hc_bytealign_S (w6[1], w6[2], offset);
      c6[0] = hc_bytealign_S (w6[0], w6[1], offset);
      c5[3] = hc_bytealign_S (w5[3], w6[0], offset);
      c5[2] = hc_bytealign_S (w5[2], w5[3], offset);
      c5[1] = hc_bytealign_S (w5[1], w5[2], offset);
      c5[0] = hc_bytealign_S (w5[0], w5[1], offset);
      c4[3] = hc_bytealign_S (w4[3], w5[0], offset);
      c4[2] = hc_bytealign_S (w4[2], w4[3], offset);
      c4[1] = hc_bytealign_S (w4[1], w4[2], offset);
      c4[0] = hc_bytealign_S (w4[0], w4[1], offset);
      c3[3] = hc_bytealign_S (w3[3], w4[0], offset);
      c3[2] = hc_bytealign_S (w3[2], w3[3], offset);
      c3[1] = hc_bytealign_S (w3[1], w3[2], offset);
      c3[0] = hc_bytealign_S (w3[0], w3[1], offset);
      c2[3] = hc_bytealign_S (w2[3], w3[0], offset);
      c2[2] = hc_bytealign_S (w2[2], w2[3], offset);
      c2[1] = hc_bytealign_S (w2[1], w2[2], offset);
      c2[0] = hc_bytealign_S (w2[0], w2[1], offset);
      c1[3] = hc_bytealign_S (w1[3], w2[0], offset);
      c1[2] = hc_bytealign_S (w1[2], w1[3], offset);
      c1[1] = hc_bytealign_S (w1[1], w1[2], offset);
      c1[0] = hc_bytealign_S (w1[0], w1[1], offset);
      c0[3] = hc_bytealign_S (w0[3], w1[0], offset);
      c0[2] = hc_bytealign_S (w0[2], w0[3], offset);
      c0[1] = hc_bytealign_S (w0[1], w0[2], offset);
      c0[0] = hc_bytealign_S (w0[0], w0[1], offset);
      w7[3] = hc_bytealign_S (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_byte_perm_S (    0, w7[3], selector);
      w7[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      w7[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      w7[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      w7[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      w6[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      w6[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      w6[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      w5[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      w5[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      w5[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      w4[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w4[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w4[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w3[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm_S (w0[0],     0, selector);

      break;

    case  1:
      c0[1] = hc_byte_perm_S (    0, w7[3], selector);
      c0[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      w7[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      w7[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      w7[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      w7[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      w6[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      w6[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      w5[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      w5[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      w4[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w4[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm_S (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_byte_perm_S (    0, w7[3], selector);
      c0[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c0[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      w7[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      w7[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      w7[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      w7[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      w6[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      w5[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      w4[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm_S (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_byte_perm_S (    0, w7[3], selector);
      c0[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c0[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c0[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      w7[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      w7[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      w7[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      w7[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      w6[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      w5[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      w4[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm_S (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_byte_perm_S (    0, w7[3], selector);
      c0[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      c0[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      c0[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      c0[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      w7[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      w7[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      w7[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      w7[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      w6[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      w5[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w4[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm_S (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_byte_perm_S (    0, w7[3], selector);
      c1[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      c0[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      c0[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      c0[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      c0[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      w7[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      w7[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      w7[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      w7[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      w6[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      w5[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w4[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm_S (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_byte_perm_S (    0, w7[3], selector);
      c1[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c1[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      c0[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      c0[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      c0[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      c0[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      w7[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      w7[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      w7[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      w7[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      w6[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      w5[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w4[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm_S (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_byte_perm_S (    0, w7[3], selector);
      c1[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c1[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c1[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      c0[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      c0[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      c0[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      c0[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      w7[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      w7[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      w7[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      w7[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      w6[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      w5[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w4[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm_S (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_byte_perm_S (    0, w7[3], selector);
      c1[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      c1[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      c1[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      c1[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      c0[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      c0[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      c0[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      c0[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      w7[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      w7[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      w7[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      w7[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      w6[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w5[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w4[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm_S (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_byte_perm_S (    0, w7[3], selector);
      c2[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      c1[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      c1[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      c1[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      c1[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      c0[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      c0[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      c0[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      c0[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      w7[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      w7[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      w7[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      w7[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      w6[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w5[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w4[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm_S (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_byte_perm_S (    0, w7[3], selector);
      c2[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c2[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      c1[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      c1[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      c1[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      c1[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      c0[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      c0[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      c0[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      c0[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      w7[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      w7[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      w7[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      w7[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      w6[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w5[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w4[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm_S (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_byte_perm_S (    0, w7[3], selector);
      c2[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c2[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c2[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      c1[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      c1[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      c1[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      c1[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      c0[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      c0[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      c0[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      c0[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      w7[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      w7[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      w7[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      w7[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      w6[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w5[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w4[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm_S (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_byte_perm_S (    0, w7[3], selector);
      c2[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      c2[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      c2[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      c2[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      c1[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      c1[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      c1[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      c1[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      c0[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      c0[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      c0[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      c0[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      w7[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      w7[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      w7[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      w7[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w6[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w5[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w4[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm_S (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_byte_perm_S (    0, w7[3], selector);
      c3[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      c2[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      c2[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      c2[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      c2[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      c1[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      c1[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      c1[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      c1[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      c0[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      c0[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      c0[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      c0[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      w7[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      w7[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      w7[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      w7[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w6[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w5[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w4[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm_S (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_byte_perm_S (    0, w7[3], selector);
      c3[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c3[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      c2[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      c2[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      c2[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      c2[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      c1[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      c1[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      c1[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      c1[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      c0[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      c0[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      c0[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      c0[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      w7[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      w7[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      w7[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      w7[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w6[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w5[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w4[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm_S (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_byte_perm_S (    0, w7[3], selector);
      c3[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c3[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c3[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      c2[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      c2[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      c2[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      c2[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      c1[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      c1[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      c1[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      c1[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      c0[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      c0[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      c0[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      c0[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      w7[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      w7[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      w7[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      w7[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w6[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w5[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w4[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm_S (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_byte_perm_S (    0, w7[3], selector);
      c3[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      c3[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      c3[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      c3[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      c2[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      c2[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      c2[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      c2[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      c1[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      c1[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      c1[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      c1[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      c0[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      c0[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      c0[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      c0[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      w7[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      w7[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      w7[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      w7[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w6[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w5[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w4[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[0] = hc_byte_perm_S (w0[0],     0, selector);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_byte_perm_S (    0, w7[3], selector);
      c4[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      c3[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      c3[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      c3[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      c3[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      c2[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      c2[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      c2[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      c2[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      c1[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      c1[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      c1[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      c1[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      c0[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      c0[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      c0[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      c0[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      w7[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      w7[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      w7[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      w7[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w6[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w5[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w4[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[1] = hc_byte_perm_S (w0[0],     0, selector);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_byte_perm_S (    0, w7[3], selector);
      c4[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c4[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      c3[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      c3[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      c3[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      c3[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      c2[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      c2[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      c2[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      c2[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      c1[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      c1[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      c1[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      c1[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      c0[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      c0[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      c0[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      w7[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      w7[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      w7[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      w7[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w6[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w5[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w4[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[2] = hc_byte_perm_S (w0[0],     0, selector);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_byte_perm_S (    0, w7[3], selector);
      c4[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c4[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c4[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      c3[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      c3[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      c3[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      c3[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      c2[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      c2[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      c2[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      c2[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      c1[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      c1[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      c1[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      c1[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      c0[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      c0[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      w7[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      w7[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      w7[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      w7[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w6[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w5[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w4[3] = hc_byte_perm_S (w0[0],     0, selector);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_byte_perm_S (    0, w7[3], selector);
      c4[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      c4[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      c4[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      c4[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      c3[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      c3[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      c3[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      c3[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      c2[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      c2[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      c2[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      c2[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      c1[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      c1[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      c1[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      c1[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      c0[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      w7[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      w7[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      w7[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      w7[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w6[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w5[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[0] = hc_byte_perm_S (w0[0],     0, selector);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_byte_perm_S (    0, w7[3], selector);
      c5[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      c4[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      c4[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      c4[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      c4[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      c3[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      c3[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      c3[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      c3[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      c2[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      c2[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      c2[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      c2[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      c1[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      c1[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      c1[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      c1[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      c0[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      w7[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      w7[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      w7[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      w7[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w6[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w5[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[1] = hc_byte_perm_S (w0[0],     0, selector);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_byte_perm_S (    0, w7[3], selector);
      c5[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c5[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      c4[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      c4[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      c4[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      c4[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      c3[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      c3[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      c3[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      c3[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      c2[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      c2[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      c2[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      c2[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      c1[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      c1[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      c1[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      c0[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      w7[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      w7[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      w7[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      w7[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w6[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w5[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[2] = hc_byte_perm_S (w0[0],     0, selector);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_byte_perm_S (    0, w7[3], selector);
      c5[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c5[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c5[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      c4[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      c4[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      c4[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      c4[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      c3[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      c3[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      c3[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      c3[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      c2[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      c2[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      c2[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      c2[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      c1[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      c1[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      c0[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      w7[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      w7[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      w7[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      w7[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w6[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w5[3] = hc_byte_perm_S (w0[0],     0, selector);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_byte_perm_S (    0, w7[3], selector);
      c5[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      c5[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      c5[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      c5[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      c4[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      c4[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      c4[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      c4[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      c3[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      c3[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      c3[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      c3[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      c2[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      c2[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      c2[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      c2[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      c1[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      c0[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      w7[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      w7[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      w7[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      w7[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w6[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[0] = hc_byte_perm_S (w0[0],     0, selector);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_byte_perm_S (    0, w7[3], selector);
      c6[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      c5[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      c5[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      c5[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      c5[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      c4[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      c4[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      c4[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      c4[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      c3[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      c3[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      c3[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      c3[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      c2[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      c2[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      c2[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      c2[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      c1[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      c0[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      w7[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      w7[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      w7[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      w7[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w6[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[1] = hc_byte_perm_S (w0[0],     0, selector);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_byte_perm_S (    0, w7[3], selector);
      c6[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c6[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      c5[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      c5[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      c5[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      c5[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      c4[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      c4[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      c4[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      c4[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      c3[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      c3[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      c3[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      c3[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      c2[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      c2[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      c2[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      c1[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      c0[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      w7[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      w7[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      w7[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      w7[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w6[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[2] = hc_byte_perm_S (w0[0],     0, selector);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_byte_perm_S (    0, w7[3], selector);
      c6[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c6[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c6[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      c5[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      c5[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      c5[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      c5[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      c4[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      c4[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      c4[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      c4[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      c3[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      c3[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      c3[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      c3[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      c2[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      c2[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      c1[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      c0[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      w7[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      w7[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      w7[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      w7[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w6[3] = hc_byte_perm_S (w0[0],     0, selector);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_byte_perm_S (    0, w7[3], selector);
      c6[3] = hc_byte_perm_S (w7[3], w7[2], selector);
      c6[2] = hc_byte_perm_S (w7[2], w7[1], selector);
      c6[1] = hc_byte_perm_S (w7[1], w7[0], selector);
      c6[0] = hc_byte_perm_S (w7[0], w6[3], selector);
      c5[3] = hc_byte_perm_S (w6[3], w6[2], selector);
      c5[2] = hc_byte_perm_S (w6[2], w6[1], selector);
      c5[1] = hc_byte_perm_S (w6[1], w6[0], selector);
      c5[0] = hc_byte_perm_S (w6[0], w5[3], selector);
      c4[3] = hc_byte_perm_S (w5[3], w5[2], selector);
      c4[2] = hc_byte_perm_S (w5[2], w5[1], selector);
      c4[1] = hc_byte_perm_S (w5[1], w5[0], selector);
      c4[0] = hc_byte_perm_S (w5[0], w4[3], selector);
      c3[3] = hc_byte_perm_S (w4[3], w4[2], selector);
      c3[2] = hc_byte_perm_S (w4[2], w4[1], selector);
      c3[1] = hc_byte_perm_S (w4[1], w4[0], selector);
      c3[0] = hc_byte_perm_S (w4[0], w3[3], selector);
      c2[3] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[2] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[1] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[0] = hc_byte_perm_S (w3[0], w2[3], selector);
      c1[3] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[2] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[1] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[0] = hc_byte_perm_S (w2[0], w1[3], selector);
      c0[3] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[2] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[1] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[0] = hc_byte_perm_S (w1[0], w0[3], selector);
      w7[3] = hc_byte_perm_S (w0[3], w0[2], selector);
      w7[2] = hc_byte_perm_S (w0[2], w0[1], selector);
      w7[1] = hc_byte_perm_S (w0[1], w0[0], selector);
      w7[0] = hc_byte_perm_S (w0[0],     0, selector);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_byte_perm_S (    0, w7[3], selector);
      c7[0] = hc_byte_perm_S (w7[3], w7[2], selector);
      c6[3] = hc_byte_perm_S (w7[2], w7[1], selector);
      c6[2] = hc_byte_perm_S (w7[1], w7[0], selector);
      c6[1] = hc_byte_perm_S (w7[0], w6[3], selector);
      c6[0] = hc_byte_perm_S (w6[3], w6[2], selector);
      c5[3] = hc_byte_perm_S (w6[2], w6[1], selector);
      c5[2] = hc_byte_perm_S (w6[1], w6[0], selector);
      c5[1] = hc_byte_perm_S (w6[0], w5[3], selector);
      c5[0] = hc_byte_perm_S (w5[3], w5[2], selector);
      c4[3] = hc_byte_perm_S (w5[2], w5[1], selector);
      c4[2] = hc_byte_perm_S (w5[1], w5[0], selector);
      c4[1] = hc_byte_perm_S (w5[0], w4[3], selector);
      c4[0] = hc_byte_perm_S (w4[3], w4[2], selector);
      c3[3] = hc_byte_perm_S (w4[2], w4[1], selector);
      c3[2] = hc_byte_perm_S (w4[1], w4[0], selector);
      c3[1] = hc_byte_perm_S (w4[0], w3[3], selector);
      c3[0] = hc_byte_perm_S (w3[3], w3[2], selector);
      c2[3] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[2] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[1] = hc_byte_perm_S (w3[0], w2[3], selector);
      c2[0] = hc_byte_perm_S (w2[3], w2[2], selector);
      c1[3] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[2] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[1] = hc_byte_perm_S (w2[0], w1[3], selector);
      c1[0] = hc_byte_perm_S (w1[3], w1[2], selector);
      c0[3] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[2] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[1] = hc_byte_perm_S (w1[0], w0[3], selector);
      c0[0] = hc_byte_perm_S (w0[3], w0[2], selector);
      w7[3] = hc_byte_perm_S (w0[2], w0[1], selector);
      w7[2] = hc_byte_perm_S (w0[1], w0[0], selector);
      w7[1] = hc_byte_perm_S (w0[0],     0, selector);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_byte_perm_S (    0, w7[3], selector);
      c7[1] = hc_byte_perm_S (w7[3], w7[2], selector);
      c7[0] = hc_byte_perm_S (w7[2], w7[1], selector);
      c6[3] = hc_byte_perm_S (w7[1], w7[0], selector);
      c6[2] = hc_byte_perm_S (w7[0], w6[3], selector);
      c6[1] = hc_byte_perm_S (w6[3], w6[2], selector);
      c6[0] = hc_byte_perm_S (w6[2], w6[1], selector);
      c5[3] = hc_byte_perm_S (w6[1], w6[0], selector);
      c5[2] = hc_byte_perm_S (w6[0], w5[3], selector);
      c5[1] = hc_byte_perm_S (w5[3], w5[2], selector);
      c5[0] = hc_byte_perm_S (w5[2], w5[1], selector);
      c4[3] = hc_byte_perm_S (w5[1], w5[0], selector);
      c4[2] = hc_byte_perm_S (w5[0], w4[3], selector);
      c4[1] = hc_byte_perm_S (w4[3], w4[2], selector);
      c4[0] = hc_byte_perm_S (w4[2], w4[1], selector);
      c3[3] = hc_byte_perm_S (w4[1], w4[0], selector);
      c3[2] = hc_byte_perm_S (w4[0], w3[3], selector);
      c3[1] = hc_byte_perm_S (w3[3], w3[2], selector);
      c3[0] = hc_byte_perm_S (w3[2], w3[1], selector);
      c2[3] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[2] = hc_byte_perm_S (w3[0], w2[3], selector);
      c2[1] = hc_byte_perm_S (w2[3], w2[2], selector);
      c2[0] = hc_byte_perm_S (w2[2], w2[1], selector);
      c1[3] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[2] = hc_byte_perm_S (w2[0], w1[3], selector);
      c1[1] = hc_byte_perm_S (w1[3], w1[2], selector);
      c1[0] = hc_byte_perm_S (w1[2], w1[1], selector);
      c0[3] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[2] = hc_byte_perm_S (w1[0], w0[3], selector);
      c0[1] = hc_byte_perm_S (w0[3], w0[2], selector);
      c0[0] = hc_byte_perm_S (w0[2], w0[1], selector);
      w7[3] = hc_byte_perm_S (w0[1], w0[0], selector);
      w7[2] = hc_byte_perm_S (w0[0],     0, selector);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_byte_perm_S (    0, w7[3], selector);
      c7[2] = hc_byte_perm_S (w7[3], w7[2], selector);
      c7[1] = hc_byte_perm_S (w7[2], w7[1], selector);
      c7[0] = hc_byte_perm_S (w7[1], w7[0], selector);
      c6[3] = hc_byte_perm_S (w7[0], w6[3], selector);
      c6[2] = hc_byte_perm_S (w6[3], w6[2], selector);
      c6[1] = hc_byte_perm_S (w6[2], w6[1], selector);
      c6[0] = hc_byte_perm_S (w6[1], w6[0], selector);
      c5[3] = hc_byte_perm_S (w6[0], w5[3], selector);
      c5[2] = hc_byte_perm_S (w5[3], w5[2], selector);
      c5[1] = hc_byte_perm_S (w5[2], w5[1], selector);
      c5[0] = hc_byte_perm_S (w5[1], w5[0], selector);
      c4[3] = hc_byte_perm_S (w5[0], w4[3], selector);
      c4[2] = hc_byte_perm_S (w4[3], w4[2], selector);
      c4[1] = hc_byte_perm_S (w4[2], w4[1], selector);
      c4[0] = hc_byte_perm_S (w4[1], w4[0], selector);
      c3[3] = hc_byte_perm_S (w4[0], w3[3], selector);
      c3[2] = hc_byte_perm_S (w3[3], w3[2], selector);
      c3[1] = hc_byte_perm_S (w3[2], w3[1], selector);
      c3[0] = hc_byte_perm_S (w3[1], w3[0], selector);
      c2[3] = hc_byte_perm_S (w3[0], w2[3], selector);
      c2[2] = hc_byte_perm_S (w2[3], w2[2], selector);
      c2[1] = hc_byte_perm_S (w2[2], w2[1], selector);
      c2[0] = hc_byte_perm_S (w2[1], w2[0], selector);
      c1[3] = hc_byte_perm_S (w2[0], w1[3], selector);
      c1[2] = hc_byte_perm_S (w1[3], w1[2], selector);
      c1[1] = hc_byte_perm_S (w1[2], w1[1], selector);
      c1[0] = hc_byte_perm_S (w1[1], w1[0], selector);
      c0[3] = hc_byte_perm_S (w1[0], w0[3], selector);
      c0[2] = hc_byte_perm_S (w0[3], w0[2], selector);
      c0[1] = hc_byte_perm_S (w0[2], w0[1], selector);
      c0[0] = hc_byte_perm_S (w0[1], w0[0], selector);
      w7[3] = hc_byte_perm_S (w0[0],     0, selector);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC

  #pragma unroll
  for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]);

  switch (offset_switch)
  {
    case  0:
      w[63] = hc_bytealign_S (w[62], w[63], offset);
      w[62] = hc_bytealign_S (w[61], w[62], offset);
      w[61] = hc_bytealign_S (w[60], w[61], offset);
      w[60] = hc_bytealign_S (w[59], w[60], offset);
      w[59] = hc_bytealign_S (w[58], w[59], offset);
      w[58] = hc_bytealign_S (w[57], w[58], offset);
      w[57] = hc_bytealign_S (w[56], w[57], offset);
      w[56] = hc_bytealign_S (w[55], w[56], offset);
      w[55] = hc_bytealign_S (w[54], w[55], offset);
      w[54] = hc_bytealign_S (w[53], w[54], offset);
      w[53] = hc_bytealign_S (w[52], w[53], offset);
      w[52] = hc_bytealign_S (w[51], w[52], offset);
      w[51] = hc_bytealign_S (w[50], w[51], offset);
      w[50] = hc_bytealign_S (w[49], w[50], offset);
      w[49] = hc_bytealign_S (w[48], w[49], offset);
      w[48] = hc_bytealign_S (w[47], w[48], offset);
      w[47] = hc_bytealign_S (w[46], w[47], offset);
      w[46] = hc_bytealign_S (w[45], w[46], offset);
      w[45] = hc_bytealign_S (w[44], w[45], offset);
      w[44] = hc_bytealign_S (w[43], w[44], offset);
      w[43] = hc_bytealign_S (w[42], w[43], offset);
      w[42] = hc_bytealign_S (w[41], w[42], offset);
      w[41] = hc_bytealign_S (w[40], w[41], offset);
      w[40] = hc_bytealign_S (w[39], w[40], offset);
      w[39] = hc_bytealign_S (w[38], w[39], offset);
      w[38] = hc_bytealign_S (w[37], w[38], offset);
      w[37] = hc_bytealign_S (w[36], w[37], offset);
      w[36] = hc_bytealign_S (w[35], w[36], offset);
      w[35] = hc_bytealign_S (w[34], w[35], offset);
      w[34] = hc_bytealign_S (w[33], w[34], offset);
      w[33] = hc_bytealign_S (w[32], w[33], offset);
      w[32] = hc_bytealign_S (w[31], w[32], offset);
      w[31] = hc_bytealign_S (w[30], w[31], offset);
      w[30] = hc_bytealign_S (w[29], w[30], offset);
      w[29] = hc_bytealign_S (w[28], w[29], offset);
      w[28] = hc_bytealign_S (w[27], w[28], offset);
      w[27] = hc_bytealign_S (w[26], w[27], offset);
      w[26] = hc_bytealign_S (w[25], w[26], offset);
      w[25] = hc_bytealign_S (w[24], w[25], offset);
      w[24] = hc_bytealign_S (w[23], w[24], offset);
      w[23] = hc_bytealign_S (w[22], w[23], offset);
      w[22] = hc_bytealign_S (w[21], w[22], offset);
      w[21] = hc_bytealign_S (w[20], w[21], offset);
      w[20] = hc_bytealign_S (w[19], w[20], offset);
      w[19] = hc_bytealign_S (w[18], w[19], offset);
      w[18] = hc_bytealign_S (w[17], w[18], offset);
      w[17] = hc_bytealign_S (w[16], w[17], offset);
      w[16] = hc_bytealign_S (w[15], w[16], offset);
      w[15] = hc_bytealign_S (w[14], w[15], offset);
      w[14] = hc_bytealign_S (w[13], w[14], offset);
      w[13] = hc_bytealign_S (w[12], w[13], offset);
      w[12] = hc_bytealign_S (w[11], w[12], offset);
      w[11] = hc_bytealign_S (w[10], w[11], offset);
      w[10] = hc_bytealign_S (w[ 9], w[10], offset);
      w[ 9] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[ 8] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[ 7] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 6] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 5] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 4] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 3] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 2] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 1] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 0] = hc_bytealign_S (    0, w[ 0], offset);

      break;

    case  1:
      w[63] = hc_bytealign_S (w[61], w[62], offset);
      w[62] = hc_bytealign_S (w[60], w[61], offset);
      w[61] = hc_bytealign_S (w[59], w[60], offset);
      w[60] = hc_bytealign_S (w[58], w[59], offset);
      w[59] = hc_bytealign_S (w[57], w[58], offset);
      w[58] = hc_bytealign_S (w[56], w[57], offset);
      w[57] = hc_bytealign_S (w[55], w[56], offset);
      w[56] = hc_bytealign_S (w[54], w[55], offset);
      w[55] = hc_bytealign_S (w[53], w[54], offset);
      w[54] = hc_bytealign_S (w[52], w[53], offset);
      w[53] = hc_bytealign_S (w[51], w[52], offset);
      w[52] = hc_bytealign_S (w[50], w[51], offset);
      w[51] = hc_bytealign_S (w[49], w[50], offset);
      w[50] = hc_bytealign_S (w[48], w[49], offset);
      w[49] = hc_bytealign_S (w[47], w[48], offset);
      w[48] = hc_bytealign_S (w[46], w[47], offset);
      w[47] = hc_bytealign_S (w[45], w[46], offset);
      w[46] = hc_bytealign_S (w[44], w[45], offset);
      w[45] = hc_bytealign_S (w[43], w[44], offset);
      w[44] = hc_bytealign_S (w[42], w[43], offset);
      w[43] = hc_bytealign_S (w[41], w[42], offset);
      w[42] = hc_bytealign_S (w[40], w[41], offset);
      w[41] = hc_bytealign_S (w[39], w[40], offset);
      w[40] = hc_bytealign_S (w[38], w[39], offset);
      w[39] = hc_bytealign_S (w[37], w[38], offset);
      w[38] = hc_bytealign_S (w[36], w[37], offset);
      w[37] = hc_bytealign_S (w[35], w[36], offset);
      w[36] = hc_bytealign_S (w[34], w[35], offset);
      w[35] = hc_bytealign_S (w[33], w[34], offset);
      w[34] = hc_bytealign_S (w[32], w[33], offset);
      w[33] = hc_bytealign_S (w[31], w[32], offset);
      w[32] = hc_bytealign_S (w[30], w[31], offset);
      w[31] = hc_bytealign_S (w[29], w[30], offset);
      w[30] = hc_bytealign_S (w[28], w[29], offset);
      w[29] = hc_bytealign_S (w[27], w[28], offset);
      w[28] = hc_bytealign_S (w[26], w[27], offset);
      w[27] = hc_bytealign_S (w[25], w[26], offset);
      w[26] = hc_bytealign_S (w[24], w[25], offset);
      w[25] = hc_bytealign_S (w[23], w[24], offset);
      w[24] = hc_bytealign_S (w[22], w[23], offset);
      w[23] = hc_bytealign_S (w[21], w[22], offset);
      w[22] = hc_bytealign_S (w[20], w[21], offset);
      w[21] = hc_bytealign_S (w[19], w[20], offset);
      w[20] = hc_bytealign_S (w[18], w[19], offset);
      w[19] = hc_bytealign_S (w[17], w[18], offset);
      w[18] = hc_bytealign_S (w[16], w[17], offset);
      w[17] = hc_bytealign_S (w[15], w[16], offset);
      w[16] = hc_bytealign_S (w[14], w[15], offset);
      w[15] = hc_bytealign_S (w[13], w[14], offset);
      w[14] = hc_bytealign_S (w[12], w[13], offset);
      w[13] = hc_bytealign_S (w[11], w[12], offset);
      w[12] = hc_bytealign_S (w[10], w[11], offset);
      w[11] = hc_bytealign_S (w[ 9], w[10], offset);
      w[10] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[ 9] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[ 8] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 7] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 6] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 5] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 4] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 3] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 2] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 1] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_bytealign_S (w[60], w[61], offset);
      w[62] = hc_bytealign_S (w[59], w[60], offset);
      w[61] = hc_bytealign_S (w[58], w[59], offset);
      w[60] = hc_bytealign_S (w[57], w[58], offset);
      w[59] = hc_bytealign_S (w[56], w[57], offset);
      w[58] = hc_bytealign_S (w[55], w[56], offset);
      w[57] = hc_bytealign_S (w[54], w[55], offset);
      w[56] = hc_bytealign_S (w[53], w[54], offset);
      w[55] = hc_bytealign_S (w[52], w[53], offset);
      w[54] = hc_bytealign_S (w[51], w[52], offset);
      w[53] = hc_bytealign_S (w[50], w[51], offset);
      w[52] = hc_bytealign_S (w[49], w[50], offset);
      w[51] = hc_bytealign_S (w[48], w[49], offset);
      w[50] = hc_bytealign_S (w[47], w[48], offset);
      w[49] = hc_bytealign_S (w[46], w[47], offset);
      w[48] = hc_bytealign_S (w[45], w[46], offset);
      w[47] = hc_bytealign_S (w[44], w[45], offset);
      w[46] = hc_bytealign_S (w[43], w[44], offset);
      w[45] = hc_bytealign_S (w[42], w[43], offset);
      w[44] = hc_bytealign_S (w[41], w[42], offset);
      w[43] = hc_bytealign_S (w[40], w[41], offset);
      w[42] = hc_bytealign_S (w[39], w[40], offset);
      w[41] = hc_bytealign_S (w[38], w[39], offset);
      w[40] = hc_bytealign_S (w[37], w[38], offset);
      w[39] = hc_bytealign_S (w[36], w[37], offset);
      w[38] = hc_bytealign_S (w[35], w[36], offset);
      w[37] = hc_bytealign_S (w[34], w[35], offset);
      w[36] = hc_bytealign_S (w[33], w[34], offset);
      w[35] = hc_bytealign_S (w[32], w[33], offset);
      w[34] = hc_bytealign_S (w[31], w[32], offset);
      w[33] = hc_bytealign_S (w[30], w[31], offset);
      w[32] = hc_bytealign_S (w[29], w[30], offset);
      w[31] = hc_bytealign_S (w[28], w[29], offset);
      w[30] = hc_bytealign_S (w[27], w[28], offset);
      w[29] = hc_bytealign_S (w[26], w[27], offset);
      w[28] = hc_bytealign_S (w[25], w[26], offset);
      w[27] = hc_bytealign_S (w[24], w[25], offset);
      w[26] = hc_bytealign_S (w[23], w[24], offset);
      w[25] = hc_bytealign_S (w[22], w[23], offset);
      w[24] = hc_bytealign_S (w[21], w[22], offset);
      w[23] = hc_bytealign_S (w[20], w[21], offset);
      w[22] = hc_bytealign_S (w[19], w[20], offset);
      w[21] = hc_bytealign_S (w[18], w[19], offset);
      w[20] = hc_bytealign_S (w[17], w[18], offset);
      w[19] = hc_bytealign_S (w[16], w[17], offset);
      w[18] = hc_bytealign_S (w[15], w[16], offset);
      w[17] = hc_bytealign_S (w[14], w[15], offset);
      w[16] = hc_bytealign_S (w[13], w[14], offset);
      w[15] = hc_bytealign_S (w[12], w[13], offset);
      w[14] = hc_bytealign_S (w[11], w[12], offset);
      w[13] = hc_bytealign_S (w[10], w[11], offset);
      w[12] = hc_bytealign_S (w[ 9], w[10], offset);
      w[11] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[10] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[ 9] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 8] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 7] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 6] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 5] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 4] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 3] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 2] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_bytealign_S (w[59], w[60], offset);
      w[62] = hc_bytealign_S (w[58], w[59], offset);
      w[61] = hc_bytealign_S (w[57], w[58], offset);
      w[60] = hc_bytealign_S (w[56], w[57], offset);
      w[59] = hc_bytealign_S (w[55], w[56], offset);
      w[58] = hc_bytealign_S (w[54], w[55], offset);
      w[57] = hc_bytealign_S (w[53], w[54], offset);
      w[56] = hc_bytealign_S (w[52], w[53], offset);
      w[55] = hc_bytealign_S (w[51], w[52], offset);
      w[54] = hc_bytealign_S (w[50], w[51], offset);
      w[53] = hc_bytealign_S (w[49], w[50], offset);
      w[52] = hc_bytealign_S (w[48], w[49], offset);
      w[51] = hc_bytealign_S (w[47], w[48], offset);
      w[50] = hc_bytealign_S (w[46], w[47], offset);
      w[49] = hc_bytealign_S (w[45], w[46], offset);
      w[48] = hc_bytealign_S (w[44], w[45], offset);
      w[47] = hc_bytealign_S (w[43], w[44], offset);
      w[46] = hc_bytealign_S (w[42], w[43], offset);
      w[45] = hc_bytealign_S (w[41], w[42], offset);
      w[44] = hc_bytealign_S (w[40], w[41], offset);
      w[43] = hc_bytealign_S (w[39], w[40], offset);
      w[42] = hc_bytealign_S (w[38], w[39], offset);
      w[41] = hc_bytealign_S (w[37], w[38], offset);
      w[40] = hc_bytealign_S (w[36], w[37], offset);
      w[39] = hc_bytealign_S (w[35], w[36], offset);
      w[38] = hc_bytealign_S (w[34], w[35], offset);
      w[37] = hc_bytealign_S (w[33], w[34], offset);
      w[36] = hc_bytealign_S (w[32], w[33], offset);
      w[35] = hc_bytealign_S (w[31], w[32], offset);
      w[34] = hc_bytealign_S (w[30], w[31], offset);
      w[33] = hc_bytealign_S (w[29], w[30], offset);
      w[32] = hc_bytealign_S (w[28], w[29], offset);
      w[31] = hc_bytealign_S (w[27], w[28], offset);
      w[30] = hc_bytealign_S (w[26], w[27], offset);
      w[29] = hc_bytealign_S (w[25], w[26], offset);
      w[28] = hc_bytealign_S (w[24], w[25], offset);
      w[27] = hc_bytealign_S (w[23], w[24], offset);
      w[26] = hc_bytealign_S (w[22], w[23], offset);
      w[25] = hc_bytealign_S (w[21], w[22], offset);
      w[24] = hc_bytealign_S (w[20], w[21], offset);
      w[23] = hc_bytealign_S (w[19], w[20], offset);
      w[22] = hc_bytealign_S (w[18], w[19], offset);
      w[21] = hc_bytealign_S (w[17], w[18], offset);
      w[20] = hc_bytealign_S (w[16], w[17], offset);
      w[19] = hc_bytealign_S (w[15], w[16], offset);
      w[18] = hc_bytealign_S (w[14], w[15], offset);
      w[17] = hc_bytealign_S (w[13], w[14], offset);
      w[16] = hc_bytealign_S (w[12], w[13], offset);
      w[15] = hc_bytealign_S (w[11], w[12], offset);
      w[14] = hc_bytealign_S (w[10], w[11], offset);
      w[13] = hc_bytealign_S (w[ 9], w[10], offset);
      w[12] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[11] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[10] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 9] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 8] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 7] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 6] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 5] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 4] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 3] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_bytealign_S (w[58], w[59], offset);
      w[62] = hc_bytealign_S (w[57], w[58], offset);
      w[61] = hc_bytealign_S (w[56], w[57], offset);
      w[60] = hc_bytealign_S (w[55], w[56], offset);
      w[59] = hc_bytealign_S (w[54], w[55], offset);
      w[58] = hc_bytealign_S (w[53], w[54], offset);
      w[57] = hc_bytealign_S (w[52], w[53], offset);
      w[56] = hc_bytealign_S (w[51], w[52], offset);
      w[55] = hc_bytealign_S (w[50], w[51], offset);
      w[54] = hc_bytealign_S (w[49], w[50], offset);
      w[53] = hc_bytealign_S (w[48], w[49], offset);
      w[52] = hc_bytealign_S (w[47], w[48], offset);
      w[51] = hc_bytealign_S (w[46], w[47], offset);
      w[50] = hc_bytealign_S (w[45], w[46], offset);
      w[49] = hc_bytealign_S (w[44], w[45], offset);
      w[48] = hc_bytealign_S (w[43], w[44], offset);
      w[47] = hc_bytealign_S (w[42], w[43], offset);
      w[46] = hc_bytealign_S (w[41], w[42], offset);
      w[45] = hc_bytealign_S (w[40], w[41], offset);
      w[44] = hc_bytealign_S (w[39], w[40], offset);
      w[43] = hc_bytealign_S (w[38], w[39], offset);
      w[42] = hc_bytealign_S (w[37], w[38], offset);
      w[41] = hc_bytealign_S (w[36], w[37], offset);
      w[40] = hc_bytealign_S (w[35], w[36], offset);
      w[39] = hc_bytealign_S (w[34], w[35], offset);
      w[38] = hc_bytealign_S (w[33], w[34], offset);
      w[37] = hc_bytealign_S (w[32], w[33], offset);
      w[36] = hc_bytealign_S (w[31], w[32], offset);
      w[35] = hc_bytealign_S (w[30], w[31], offset);
      w[34] = hc_bytealign_S (w[29], w[30], offset);
      w[33] = hc_bytealign_S (w[28], w[29], offset);
      w[32] = hc_bytealign_S (w[27], w[28], offset);
      w[31] = hc_bytealign_S (w[26], w[27], offset);
      w[30] = hc_bytealign_S (w[25], w[26], offset);
      w[29] = hc_bytealign_S (w[24], w[25], offset);
      w[28] = hc_bytealign_S (w[23], w[24], offset);
      w[27] = hc_bytealign_S (w[22], w[23], offset);
      w[26] = hc_bytealign_S (w[21], w[22], offset);
      w[25] = hc_bytealign_S (w[20], w[21], offset);
      w[24] = hc_bytealign_S (w[19], w[20], offset);
      w[23] = hc_bytealign_S (w[18], w[19], offset);
      w[22] = hc_bytealign_S (w[17], w[18], offset);
      w[21] = hc_bytealign_S (w[16], w[17], offset);
      w[20] = hc_bytealign_S (w[15], w[16], offset);
      w[19] = hc_bytealign_S (w[14], w[15], offset);
      w[18] = hc_bytealign_S (w[13], w[14], offset);
      w[17] = hc_bytealign_S (w[12], w[13], offset);
      w[16] = hc_bytealign_S (w[11], w[12], offset);
      w[15] = hc_bytealign_S (w[10], w[11], offset);
      w[14] = hc_bytealign_S (w[ 9], w[10], offset);
      w[13] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[12] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[11] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[10] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 9] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 8] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 7] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 6] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 5] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 4] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_bytealign_S (w[57], w[58], offset);
      w[62] = hc_bytealign_S (w[56], w[57], offset);
      w[61] = hc_bytealign_S (w[55], w[56], offset);
      w[60] = hc_bytealign_S (w[54], w[55], offset);
      w[59] = hc_bytealign_S (w[53], w[54], offset);
      w[58] = hc_bytealign_S (w[52], w[53], offset);
      w[57] = hc_bytealign_S (w[51], w[52], offset);
      w[56] = hc_bytealign_S (w[50], w[51], offset);
      w[55] = hc_bytealign_S (w[49], w[50], offset);
      w[54] = hc_bytealign_S (w[48], w[49], offset);
      w[53] = hc_bytealign_S (w[47], w[48], offset);
      w[52] = hc_bytealign_S (w[46], w[47], offset);
      w[51] = hc_bytealign_S (w[45], w[46], offset);
      w[50] = hc_bytealign_S (w[44], w[45], offset);
      w[49] = hc_bytealign_S (w[43], w[44], offset);
      w[48] = hc_bytealign_S (w[42], w[43], offset);
      w[47] = hc_bytealign_S (w[41], w[42], offset);
      w[46] = hc_bytealign_S (w[40], w[41], offset);
      w[45] = hc_bytealign_S (w[39], w[40], offset);
      w[44] = hc_bytealign_S (w[38], w[39], offset);
      w[43] = hc_bytealign_S (w[37], w[38], offset);
      w[42] = hc_bytealign_S (w[36], w[37], offset);
      w[41] = hc_bytealign_S (w[35], w[36], offset);
      w[40] = hc_bytealign_S (w[34], w[35], offset);
      w[39] = hc_bytealign_S (w[33], w[34], offset);
      w[38] = hc_bytealign_S (w[32], w[33], offset);
      w[37] = hc_bytealign_S (w[31], w[32], offset);
      w[36] = hc_bytealign_S (w[30], w[31], offset);
      w[35] = hc_bytealign_S (w[29], w[30], offset);
      w[34] = hc_bytealign_S (w[28], w[29], offset);
      w[33] = hc_bytealign_S (w[27], w[28], offset);
      w[32] = hc_bytealign_S (w[26], w[27], offset);
      w[31] = hc_bytealign_S (w[25], w[26], offset);
      w[30] = hc_bytealign_S (w[24], w[25], offset);
      w[29] = hc_bytealign_S (w[23], w[24], offset);
      w[28] = hc_bytealign_S (w[22], w[23], offset);
      w[27] = hc_bytealign_S (w[21], w[22], offset);
      w[26] = hc_bytealign_S (w[20], w[21], offset);
      w[25] = hc_bytealign_S (w[19], w[20], offset);
      w[24] = hc_bytealign_S (w[18], w[19], offset);
      w[23] = hc_bytealign_S (w[17], w[18], offset);
      w[22] = hc_bytealign_S (w[16], w[17], offset);
      w[21] = hc_bytealign_S (w[15], w[16], offset);
      w[20] = hc_bytealign_S (w[14], w[15], offset);
      w[19] = hc_bytealign_S (w[13], w[14], offset);
      w[18] = hc_bytealign_S (w[12], w[13], offset);
      w[17] = hc_bytealign_S (w[11], w[12], offset);
      w[16] = hc_bytealign_S (w[10], w[11], offset);
      w[15] = hc_bytealign_S (w[ 9], w[10], offset);
      w[14] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[13] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[12] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[11] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[10] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 9] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 8] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 7] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 6] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 5] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_bytealign_S (w[56], w[57], offset);
      w[62] = hc_bytealign_S (w[55], w[56], offset);
      w[61] = hc_bytealign_S (w[54], w[55], offset);
      w[60] = hc_bytealign_S (w[53], w[54], offset);
      w[59] = hc_bytealign_S (w[52], w[53], offset);
      w[58] = hc_bytealign_S (w[51], w[52], offset);
      w[57] = hc_bytealign_S (w[50], w[51], offset);
      w[56] = hc_bytealign_S (w[49], w[50], offset);
      w[55] = hc_bytealign_S (w[48], w[49], offset);
      w[54] = hc_bytealign_S (w[47], w[48], offset);
      w[53] = hc_bytealign_S (w[46], w[47], offset);
      w[52] = hc_bytealign_S (w[45], w[46], offset);
      w[51] = hc_bytealign_S (w[44], w[45], offset);
      w[50] = hc_bytealign_S (w[43], w[44], offset);
      w[49] = hc_bytealign_S (w[42], w[43], offset);
      w[48] = hc_bytealign_S (w[41], w[42], offset);
      w[47] = hc_bytealign_S (w[40], w[41], offset);
      w[46] = hc_bytealign_S (w[39], w[40], offset);
      w[45] = hc_bytealign_S (w[38], w[39], offset);
      w[44] = hc_bytealign_S (w[37], w[38], offset);
      w[43] = hc_bytealign_S (w[36], w[37], offset);
      w[42] = hc_bytealign_S (w[35], w[36], offset);
      w[41] = hc_bytealign_S (w[34], w[35], offset);
      w[40] = hc_bytealign_S (w[33], w[34], offset);
      w[39] = hc_bytealign_S (w[32], w[33], offset);
      w[38] = hc_bytealign_S (w[31], w[32], offset);
      w[37] = hc_bytealign_S (w[30], w[31], offset);
      w[36] = hc_bytealign_S (w[29], w[30], offset);
      w[35] = hc_bytealign_S (w[28], w[29], offset);
      w[34] = hc_bytealign_S (w[27], w[28], offset);
      w[33] = hc_bytealign_S (w[26], w[27], offset);
      w[32] = hc_bytealign_S (w[25], w[26], offset);
      w[31] = hc_bytealign_S (w[24], w[25], offset);
      w[30] = hc_bytealign_S (w[23], w[24], offset);
      w[29] = hc_bytealign_S (w[22], w[23], offset);
      w[28] = hc_bytealign_S (w[21], w[22], offset);
      w[27] = hc_bytealign_S (w[20], w[21], offset);
      w[26] = hc_bytealign_S (w[19], w[20], offset);
      w[25] = hc_bytealign_S (w[18], w[19], offset);
      w[24] = hc_bytealign_S (w[17], w[18], offset);
      w[23] = hc_bytealign_S (w[16], w[17], offset);
      w[22] = hc_bytealign_S (w[15], w[16], offset);
      w[21] = hc_bytealign_S (w[14], w[15], offset);
      w[20] = hc_bytealign_S (w[13], w[14], offset);
      w[19] = hc_bytealign_S (w[12], w[13], offset);
      w[18] = hc_bytealign_S (w[11], w[12], offset);
      w[17] = hc_bytealign_S (w[10], w[11], offset);
      w[16] = hc_bytealign_S (w[ 9], w[10], offset);
      w[15] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[14] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[13] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[12] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[11] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[10] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 9] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 8] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 7] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 6] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_bytealign_S (w[55], w[56], offset);
      w[62] = hc_bytealign_S (w[54], w[55], offset);
      w[61] = hc_bytealign_S (w[53], w[54], offset);
      w[60] = hc_bytealign_S (w[52], w[53], offset);
      w[59] = hc_bytealign_S (w[51], w[52], offset);
      w[58] = hc_bytealign_S (w[50], w[51], offset);
      w[57] = hc_bytealign_S (w[49], w[50], offset);
      w[56] = hc_bytealign_S (w[48], w[49], offset);
      w[55] = hc_bytealign_S (w[47], w[48], offset);
      w[54] = hc_bytealign_S (w[46], w[47], offset);
      w[53] = hc_bytealign_S (w[45], w[46], offset);
      w[52] = hc_bytealign_S (w[44], w[45], offset);
      w[51] = hc_bytealign_S (w[43], w[44], offset);
      w[50] = hc_bytealign_S (w[42], w[43], offset);
      w[49] = hc_bytealign_S (w[41], w[42], offset);
      w[48] = hc_bytealign_S (w[40], w[41], offset);
      w[47] = hc_bytealign_S (w[39], w[40], offset);
      w[46] = hc_bytealign_S (w[38], w[39], offset);
      w[45] = hc_bytealign_S (w[37], w[38], offset);
      w[44] = hc_bytealign_S (w[36], w[37], offset);
      w[43] = hc_bytealign_S (w[35], w[36], offset);
      w[42] = hc_bytealign_S (w[34], w[35], offset);
      w[41] = hc_bytealign_S (w[33], w[34], offset);
      w[40] = hc_bytealign_S (w[32], w[33], offset);
      w[39] = hc_bytealign_S (w[31], w[32], offset);
      w[38] = hc_bytealign_S (w[30], w[31], offset);
      w[37] = hc_bytealign_S (w[29], w[30], offset);
      w[36] = hc_bytealign_S (w[28], w[29], offset);
      w[35] = hc_bytealign_S (w[27], w[28], offset);
      w[34] = hc_bytealign_S (w[26], w[27], offset);
      w[33] = hc_bytealign_S (w[25], w[26], offset);
      w[32] = hc_bytealign_S (w[24], w[25], offset);
      w[31] = hc_bytealign_S (w[23], w[24], offset);
      w[30] = hc_bytealign_S (w[22], w[23], offset);
      w[29] = hc_bytealign_S (w[21], w[22], offset);
      w[28] = hc_bytealign_S (w[20], w[21], offset);
      w[27] = hc_bytealign_S (w[19], w[20], offset);
      w[26] = hc_bytealign_S (w[18], w[19], offset);
      w[25] = hc_bytealign_S (w[17], w[18], offset);
      w[24] = hc_bytealign_S (w[16], w[17], offset);
      w[23] = hc_bytealign_S (w[15], w[16], offset);
      w[22] = hc_bytealign_S (w[14], w[15], offset);
      w[21] = hc_bytealign_S (w[13], w[14], offset);
      w[20] = hc_bytealign_S (w[12], w[13], offset);
      w[19] = hc_bytealign_S (w[11], w[12], offset);
      w[18] = hc_bytealign_S (w[10], w[11], offset);
      w[17] = hc_bytealign_S (w[ 9], w[10], offset);
      w[16] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[15] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[14] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[13] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[12] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[11] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[10] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 9] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 8] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 7] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_bytealign_S (w[54], w[55], offset);
      w[62] = hc_bytealign_S (w[53], w[54], offset);
      w[61] = hc_bytealign_S (w[52], w[53], offset);
      w[60] = hc_bytealign_S (w[51], w[52], offset);
      w[59] = hc_bytealign_S (w[50], w[51], offset);
      w[58] = hc_bytealign_S (w[49], w[50], offset);
      w[57] = hc_bytealign_S (w[48], w[49], offset);
      w[56] = hc_bytealign_S (w[47], w[48], offset);
      w[55] = hc_bytealign_S (w[46], w[47], offset);
      w[54] = hc_bytealign_S (w[45], w[46], offset);
      w[53] = hc_bytealign_S (w[44], w[45], offset);
      w[52] = hc_bytealign_S (w[43], w[44], offset);
      w[51] = hc_bytealign_S (w[42], w[43], offset);
      w[50] = hc_bytealign_S (w[41], w[42], offset);
      w[49] = hc_bytealign_S (w[40], w[41], offset);
      w[48] = hc_bytealign_S (w[39], w[40], offset);
      w[47] = hc_bytealign_S (w[38], w[39], offset);
      w[46] = hc_bytealign_S (w[37], w[38], offset);
      w[45] = hc_bytealign_S (w[36], w[37], offset);
      w[44] = hc_bytealign_S (w[35], w[36], offset);
      w[43] = hc_bytealign_S (w[34], w[35], offset);
      w[42] = hc_bytealign_S (w[33], w[34], offset);
      w[41] = hc_bytealign_S (w[32], w[33], offset);
      w[40] = hc_bytealign_S (w[31], w[32], offset);
      w[39] = hc_bytealign_S (w[30], w[31], offset);
      w[38] = hc_bytealign_S (w[29], w[30], offset);
      w[37] = hc_bytealign_S (w[28], w[29], offset);
      w[36] = hc_bytealign_S (w[27], w[28], offset);
      w[35] = hc_bytealign_S (w[26], w[27], offset);
      w[34] = hc_bytealign_S (w[25], w[26], offset);
      w[33] = hc_bytealign_S (w[24], w[25], offset);
      w[32] = hc_bytealign_S (w[23], w[24], offset);
      w[31] = hc_bytealign_S (w[22], w[23], offset);
      w[30] = hc_bytealign_S (w[21], w[22], offset);
      w[29] = hc_bytealign_S (w[20], w[21], offset);
      w[28] = hc_bytealign_S (w[19], w[20], offset);
      w[27] = hc_bytealign_S (w[18], w[19], offset);
      w[26] = hc_bytealign_S (w[17], w[18], offset);
      w[25] = hc_bytealign_S (w[16], w[17], offset);
      w[24] = hc_bytealign_S (w[15], w[16], offset);
      w[23] = hc_bytealign_S (w[14], w[15], offset);
      w[22] = hc_bytealign_S (w[13], w[14], offset);
      w[21] = hc_bytealign_S (w[12], w[13], offset);
      w[20] = hc_bytealign_S (w[11], w[12], offset);
      w[19] = hc_bytealign_S (w[10], w[11], offset);
      w[18] = hc_bytealign_S (w[ 9], w[10], offset);
      w[17] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[16] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[15] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[14] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[13] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[12] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[11] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[10] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 9] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 8] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_bytealign_S (w[53], w[54], offset);
      w[62] = hc_bytealign_S (w[52], w[53], offset);
      w[61] = hc_bytealign_S (w[51], w[52], offset);
      w[60] = hc_bytealign_S (w[50], w[51], offset);
      w[59] = hc_bytealign_S (w[49], w[50], offset);
      w[58] = hc_bytealign_S (w[48], w[49], offset);
      w[57] = hc_bytealign_S (w[47], w[48], offset);
      w[56] = hc_bytealign_S (w[46], w[47], offset);
      w[55] = hc_bytealign_S (w[45], w[46], offset);
      w[54] = hc_bytealign_S (w[44], w[45], offset);
      w[53] = hc_bytealign_S (w[43], w[44], offset);
      w[52] = hc_bytealign_S (w[42], w[43], offset);
      w[51] = hc_bytealign_S (w[41], w[42], offset);
      w[50] = hc_bytealign_S (w[40], w[41], offset);
      w[49] = hc_bytealign_S (w[39], w[40], offset);
      w[48] = hc_bytealign_S (w[38], w[39], offset);
      w[47] = hc_bytealign_S (w[37], w[38], offset);
      w[46] = hc_bytealign_S (w[36], w[37], offset);
      w[45] = hc_bytealign_S (w[35], w[36], offset);
      w[44] = hc_bytealign_S (w[34], w[35], offset);
      w[43] = hc_bytealign_S (w[33], w[34], offset);
      w[42] = hc_bytealign_S (w[32], w[33], offset);
      w[41] = hc_bytealign_S (w[31], w[32], offset);
      w[40] = hc_bytealign_S (w[30], w[31], offset);
      w[39] = hc_bytealign_S (w[29], w[30], offset);
      w[38] = hc_bytealign_S (w[28], w[29], offset);
      w[37] = hc_bytealign_S (w[27], w[28], offset);
      w[36] = hc_bytealign_S (w[26], w[27], offset);
      w[35] = hc_bytealign_S (w[25], w[26], offset);
      w[34] = hc_bytealign_S (w[24], w[25], offset);
      w[33] = hc_bytealign_S (w[23], w[24], offset);
      w[32] = hc_bytealign_S (w[22], w[23], offset);
      w[31] = hc_bytealign_S (w[21], w[22], offset);
      w[30] = hc_bytealign_S (w[20], w[21], offset);
      w[29] = hc_bytealign_S (w[19], w[20], offset);
      w[28] = hc_bytealign_S (w[18], w[19], offset);
      w[27] = hc_bytealign_S (w[17], w[18], offset);
      w[26] = hc_bytealign_S (w[16], w[17], offset);
      w[25] = hc_bytealign_S (w[15], w[16], offset);
      w[24] = hc_bytealign_S (w[14], w[15], offset);
      w[23] = hc_bytealign_S (w[13], w[14], offset);
      w[22] = hc_bytealign_S (w[12], w[13], offset);
      w[21] = hc_bytealign_S (w[11], w[12], offset);
      w[20] = hc_bytealign_S (w[10], w[11], offset);
      w[19] = hc_bytealign_S (w[ 9], w[10], offset);
      w[18] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[17] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[16] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[15] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[14] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[13] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[12] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[11] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[10] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 9] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_bytealign_S (w[52], w[53], offset);
      w[62] = hc_bytealign_S (w[51], w[52], offset);
      w[61] = hc_bytealign_S (w[50], w[51], offset);
      w[60] = hc_bytealign_S (w[49], w[50], offset);
      w[59] = hc_bytealign_S (w[48], w[49], offset);
      w[58] = hc_bytealign_S (w[47], w[48], offset);
      w[57] = hc_bytealign_S (w[46], w[47], offset);
      w[56] = hc_bytealign_S (w[45], w[46], offset);
      w[55] = hc_bytealign_S (w[44], w[45], offset);
      w[54] = hc_bytealign_S (w[43], w[44], offset);
      w[53] = hc_bytealign_S (w[42], w[43], offset);
      w[52] = hc_bytealign_S (w[41], w[42], offset);
      w[51] = hc_bytealign_S (w[40], w[41], offset);
      w[50] = hc_bytealign_S (w[39], w[40], offset);
      w[49] = hc_bytealign_S (w[38], w[39], offset);
      w[48] = hc_bytealign_S (w[37], w[38], offset);
      w[47] = hc_bytealign_S (w[36], w[37], offset);
      w[46] = hc_bytealign_S (w[35], w[36], offset);
      w[45] = hc_bytealign_S (w[34], w[35], offset);
      w[44] = hc_bytealign_S (w[33], w[34], offset);
      w[43] = hc_bytealign_S (w[32], w[33], offset);
      w[42] = hc_bytealign_S (w[31], w[32], offset);
      w[41] = hc_bytealign_S (w[30], w[31], offset);
      w[40] = hc_bytealign_S (w[29], w[30], offset);
      w[39] = hc_bytealign_S (w[28], w[29], offset);
      w[38] = hc_bytealign_S (w[27], w[28], offset);
      w[37] = hc_bytealign_S (w[26], w[27], offset);
      w[36] = hc_bytealign_S (w[25], w[26], offset);
      w[35] = hc_bytealign_S (w[24], w[25], offset);
      w[34] = hc_bytealign_S (w[23], w[24], offset);
      w[33] = hc_bytealign_S (w[22], w[23], offset);
      w[32] = hc_bytealign_S (w[21], w[22], offset);
      w[31] = hc_bytealign_S (w[20], w[21], offset);
      w[30] = hc_bytealign_S (w[19], w[20], offset);
      w[29] = hc_bytealign_S (w[18], w[19], offset);
      w[28] = hc_bytealign_S (w[17], w[18], offset);
      w[27] = hc_bytealign_S (w[16], w[17], offset);
      w[26] = hc_bytealign_S (w[15], w[16], offset);
      w[25] = hc_bytealign_S (w[14], w[15], offset);
      w[24] = hc_bytealign_S (w[13], w[14], offset);
      w[23] = hc_bytealign_S (w[12], w[13], offset);
      w[22] = hc_bytealign_S (w[11], w[12], offset);
      w[21] = hc_bytealign_S (w[10], w[11], offset);
      w[20] = hc_bytealign_S (w[ 9], w[10], offset);
      w[19] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[18] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[17] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[16] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[15] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[14] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[13] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[12] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[11] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[10] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_bytealign_S (w[51], w[52], offset);
      w[62] = hc_bytealign_S (w[50], w[51], offset);
      w[61] = hc_bytealign_S (w[49], w[50], offset);
      w[60] = hc_bytealign_S (w[48], w[49], offset);
      w[59] = hc_bytealign_S (w[47], w[48], offset);
      w[58] = hc_bytealign_S (w[46], w[47], offset);
      w[57] = hc_bytealign_S (w[45], w[46], offset);
      w[56] = hc_bytealign_S (w[44], w[45], offset);
      w[55] = hc_bytealign_S (w[43], w[44], offset);
      w[54] = hc_bytealign_S (w[42], w[43], offset);
      w[53] = hc_bytealign_S (w[41], w[42], offset);
      w[52] = hc_bytealign_S (w[40], w[41], offset);
      w[51] = hc_bytealign_S (w[39], w[40], offset);
      w[50] = hc_bytealign_S (w[38], w[39], offset);
      w[49] = hc_bytealign_S (w[37], w[38], offset);
      w[48] = hc_bytealign_S (w[36], w[37], offset);
      w[47] = hc_bytealign_S (w[35], w[36], offset);
      w[46] = hc_bytealign_S (w[34], w[35], offset);
      w[45] = hc_bytealign_S (w[33], w[34], offset);
      w[44] = hc_bytealign_S (w[32], w[33], offset);
      w[43] = hc_bytealign_S (w[31], w[32], offset);
      w[42] = hc_bytealign_S (w[30], w[31], offset);
      w[41] = hc_bytealign_S (w[29], w[30], offset);
      w[40] = hc_bytealign_S (w[28], w[29], offset);
      w[39] = hc_bytealign_S (w[27], w[28], offset);
      w[38] = hc_bytealign_S (w[26], w[27], offset);
      w[37] = hc_bytealign_S (w[25], w[26], offset);
      w[36] = hc_bytealign_S (w[24], w[25], offset);
      w[35] = hc_bytealign_S (w[23], w[24], offset);
      w[34] = hc_bytealign_S (w[22], w[23], offset);
      w[33] = hc_bytealign_S (w[21], w[22], offset);
      w[32] = hc_bytealign_S (w[20], w[21], offset);
      w[31] = hc_bytealign_S (w[19], w[20], offset);
      w[30] = hc_bytealign_S (w[18], w[19], offset);
      w[29] = hc_bytealign_S (w[17], w[18], offset);
      w[28] = hc_bytealign_S (w[16], w[17], offset);
      w[27] = hc_bytealign_S (w[15], w[16], offset);
      w[26] = hc_bytealign_S (w[14], w[15], offset);
      w[25] = hc_bytealign_S (w[13], w[14], offset);
      w[24] = hc_bytealign_S (w[12], w[13], offset);
      w[23] = hc_bytealign_S (w[11], w[12], offset);
      w[22] = hc_bytealign_S (w[10], w[11], offset);
      w[21] = hc_bytealign_S (w[ 9], w[10], offset);
      w[20] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[19] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[18] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[17] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[16] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[15] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[14] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[13] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[12] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[11] = hc_bytealign_S (    0, w[ 0], offset);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_bytealign_S (w[50], w[51], offset);
      w[62] = hc_bytealign_S (w[49], w[50], offset);
      w[61] = hc_bytealign_S (w[48], w[49], offset);
      w[60] = hc_bytealign_S (w[47], w[48], offset);
      w[59] = hc_bytealign_S (w[46], w[47], offset);
      w[58] = hc_bytealign_S (w[45], w[46], offset);
      w[57] = hc_bytealign_S (w[44], w[45], offset);
      w[56] = hc_bytealign_S (w[43], w[44], offset);
      w[55] = hc_bytealign_S (w[42], w[43], offset);
      w[54] = hc_bytealign_S (w[41], w[42], offset);
      w[53] = hc_bytealign_S (w[40], w[41], offset);
      w[52] = hc_bytealign_S (w[39], w[40], offset);
      w[51] = hc_bytealign_S (w[38], w[39], offset);
      w[50] = hc_bytealign_S (w[37], w[38], offset);
      w[49] = hc_bytealign_S (w[36], w[37], offset);
      w[48] = hc_bytealign_S (w[35], w[36], offset);
      w[47] = hc_bytealign_S (w[34], w[35], offset);
      w[46] = hc_bytealign_S (w[33], w[34], offset);
      w[45] = hc_bytealign_S (w[32], w[33], offset);
      w[44] = hc_bytealign_S (w[31], w[32], offset);
      w[43] = hc_bytealign_S (w[30], w[31], offset);
      w[42] = hc_bytealign_S (w[29], w[30], offset);
      w[41] = hc_bytealign_S (w[28], w[29], offset);
      w[40] = hc_bytealign_S (w[27], w[28], offset);
      w[39] = hc_bytealign_S (w[26], w[27], offset);
      w[38] = hc_bytealign_S (w[25], w[26], offset);
      w[37] = hc_bytealign_S (w[24], w[25], offset);
      w[36] = hc_bytealign_S (w[23], w[24], offset);
      w[35] = hc_bytealign_S (w[22], w[23], offset);
      w[34] = hc_bytealign_S (w[21], w[22], offset);
      w[33] = hc_bytealign_S (w[20], w[21], offset);
      w[32] = hc_bytealign_S (w[19], w[20], offset);
      w[31] = hc_bytealign_S (w[18], w[19], offset);
      w[30] = hc_bytealign_S (w[17], w[18], offset);
      w[29] = hc_bytealign_S (w[16], w[17], offset);
      w[28] = hc_bytealign_S (w[15], w[16], offset);
      w[27] = hc_bytealign_S (w[14], w[15], offset);
      w[26] = hc_bytealign_S (w[13], w[14], offset);
      w[25] = hc_bytealign_S (w[12], w[13], offset);
      w[24] = hc_bytealign_S (w[11], w[12], offset);
      w[23] = hc_bytealign_S (w[10], w[11], offset);
      w[22] = hc_bytealign_S (w[ 9], w[10], offset);
      w[21] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[20] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[19] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[18] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[17] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[16] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[15] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[14] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[13] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[12] = hc_bytealign_S (    0, w[ 0], offset);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_bytealign_S (w[49], w[50], offset);
      w[62] = hc_bytealign_S (w[48], w[49], offset);
      w[61] = hc_bytealign_S (w[47], w[48], offset);
      w[60] = hc_bytealign_S (w[46], w[47], offset);
      w[59] = hc_bytealign_S (w[45], w[46], offset);
      w[58] = hc_bytealign_S (w[44], w[45], offset);
      w[57] = hc_bytealign_S (w[43], w[44], offset);
      w[56] = hc_bytealign_S (w[42], w[43], offset);
      w[55] = hc_bytealign_S (w[41], w[42], offset);
      w[54] = hc_bytealign_S (w[40], w[41], offset);
      w[53] = hc_bytealign_S (w[39], w[40], offset);
      w[52] = hc_bytealign_S (w[38], w[39], offset);
      w[51] = hc_bytealign_S (w[37], w[38], offset);
      w[50] = hc_bytealign_S (w[36], w[37], offset);
      w[49] = hc_bytealign_S (w[35], w[36], offset);
      w[48] = hc_bytealign_S (w[34], w[35], offset);
      w[47] = hc_bytealign_S (w[33], w[34], offset);
      w[46] = hc_bytealign_S (w[32], w[33], offset);
      w[45] = hc_bytealign_S (w[31], w[32], offset);
      w[44] = hc_bytealign_S (w[30], w[31], offset);
      w[43] = hc_bytealign_S (w[29], w[30], offset);
      w[42] = hc_bytealign_S (w[28], w[29], offset);
      w[41] = hc_bytealign_S (w[27], w[28], offset);
      w[40] = hc_bytealign_S (w[26], w[27], offset);
      w[39] = hc_bytealign_S (w[25], w[26], offset);
      w[38] = hc_bytealign_S (w[24], w[25], offset);
      w[37] = hc_bytealign_S (w[23], w[24], offset);
      w[36] = hc_bytealign_S (w[22], w[23], offset);
      w[35] = hc_bytealign_S (w[21], w[22], offset);
      w[34] = hc_bytealign_S (w[20], w[21], offset);
      w[33] = hc_bytealign_S (w[19], w[20], offset);
      w[32] = hc_bytealign_S (w[18], w[19], offset);
      w[31] = hc_bytealign_S (w[17], w[18], offset);
      w[30] = hc_bytealign_S (w[16], w[17], offset);
      w[29] = hc_bytealign_S (w[15], w[16], offset);
      w[28] = hc_bytealign_S (w[14], w[15], offset);
      w[27] = hc_bytealign_S (w[13], w[14], offset);
      w[26] = hc_bytealign_S (w[12], w[13], offset);
      w[25] = hc_bytealign_S (w[11], w[12], offset);
      w[24] = hc_bytealign_S (w[10], w[11], offset);
      w[23] = hc_bytealign_S (w[ 9], w[10], offset);
      w[22] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[21] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[20] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[19] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[18] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[17] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[16] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[15] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[14] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[13] = hc_bytealign_S (    0, w[ 0], offset);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_bytealign_S (w[48], w[49], offset);
      w[62] = hc_bytealign_S (w[47], w[48], offset);
      w[61] = hc_bytealign_S (w[46], w[47], offset);
      w[60] = hc_bytealign_S (w[45], w[46], offset);
      w[59] = hc_bytealign_S (w[44], w[45], offset);
      w[58] = hc_bytealign_S (w[43], w[44], offset);
      w[57] = hc_bytealign_S (w[42], w[43], offset);
      w[56] = hc_bytealign_S (w[41], w[42], offset);
      w[55] = hc_bytealign_S (w[40], w[41], offset);
      w[54] = hc_bytealign_S (w[39], w[40], offset);
      w[53] = hc_bytealign_S (w[38], w[39], offset);
      w[52] = hc_bytealign_S (w[37], w[38], offset);
      w[51] = hc_bytealign_S (w[36], w[37], offset);
      w[50] = hc_bytealign_S (w[35], w[36], offset);
      w[49] = hc_bytealign_S (w[34], w[35], offset);
      w[48] = hc_bytealign_S (w[33], w[34], offset);
      w[47] = hc_bytealign_S (w[32], w[33], offset);
      w[46] = hc_bytealign_S (w[31], w[32], offset);
      w[45] = hc_bytealign_S (w[30], w[31], offset);
      w[44] = hc_bytealign_S (w[29], w[30], offset);
      w[43] = hc_bytealign_S (w[28], w[29], offset);
      w[42] = hc_bytealign_S (w[27], w[28], offset);
      w[41] = hc_bytealign_S (w[26], w[27], offset);
      w[40] = hc_bytealign_S (w[25], w[26], offset);
      w[39] = hc_bytealign_S (w[24], w[25], offset);
      w[38] = hc_bytealign_S (w[23], w[24], offset);
      w[37] = hc_bytealign_S (w[22], w[23], offset);
      w[36] = hc_bytealign_S (w[21], w[22], offset);
      w[35] = hc_bytealign_S (w[20], w[21], offset);
      w[34] = hc_bytealign_S (w[19], w[20], offset);
      w[33] = hc_bytealign_S (w[18], w[19], offset);
      w[32] = hc_bytealign_S (w[17], w[18], offset);
      w[31] = hc_bytealign_S (w[16], w[17], offset);
      w[30] = hc_bytealign_S (w[15], w[16], offset);
      w[29] = hc_bytealign_S (w[14], w[15], offset);
      w[28] = hc_bytealign_S (w[13], w[14], offset);
      w[27] = hc_bytealign_S (w[12], w[13], offset);
      w[26] = hc_bytealign_S (w[11], w[12], offset);
      w[25] = hc_bytealign_S (w[10], w[11], offset);
      w[24] = hc_bytealign_S (w[ 9], w[10], offset);
      w[23] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[22] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[21] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[20] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[19] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[18] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[17] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[16] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[15] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[14] = hc_bytealign_S (    0, w[ 0], offset);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_bytealign_S (w[47], w[48], offset);
      w[62] = hc_bytealign_S (w[46], w[47], offset);
      w[61] = hc_bytealign_S (w[45], w[46], offset);
      w[60] = hc_bytealign_S (w[44], w[45], offset);
      w[59] = hc_bytealign_S (w[43], w[44], offset);
      w[58] = hc_bytealign_S (w[42], w[43], offset);
      w[57] = hc_bytealign_S (w[41], w[42], offset);
      w[56] = hc_bytealign_S (w[40], w[41], offset);
      w[55] = hc_bytealign_S (w[39], w[40], offset);
      w[54] = hc_bytealign_S (w[38], w[39], offset);
      w[53] = hc_bytealign_S (w[37], w[38], offset);
      w[52] = hc_bytealign_S (w[36], w[37], offset);
      w[51] = hc_bytealign_S (w[35], w[36], offset);
      w[50] = hc_bytealign_S (w[34], w[35], offset);
      w[49] = hc_bytealign_S (w[33], w[34], offset);
      w[48] = hc_bytealign_S (w[32], w[33], offset);
      w[47] = hc_bytealign_S (w[31], w[32], offset);
      w[46] = hc_bytealign_S (w[30], w[31], offset);
      w[45] = hc_bytealign_S (w[29], w[30], offset);
      w[44] = hc_bytealign_S (w[28], w[29], offset);
      w[43] = hc_bytealign_S (w[27], w[28], offset);
      w[42] = hc_bytealign_S (w[26], w[27], offset);
      w[41] = hc_bytealign_S (w[25], w[26], offset);
      w[40] = hc_bytealign_S (w[24], w[25], offset);
      w[39] = hc_bytealign_S (w[23], w[24], offset);
      w[38] = hc_bytealign_S (w[22], w[23], offset);
      w[37] = hc_bytealign_S (w[21], w[22], offset);
      w[36] = hc_bytealign_S (w[20], w[21], offset);
      w[35] = hc_bytealign_S (w[19], w[20], offset);
      w[34] = hc_bytealign_S (w[18], w[19], offset);
      w[33] = hc_bytealign_S (w[17], w[18], offset);
      w[32] = hc_bytealign_S (w[16], w[17], offset);
      w[31] = hc_bytealign_S (w[15], w[16], offset);
      w[30] = hc_bytealign_S (w[14], w[15], offset);
      w[29] = hc_bytealign_S (w[13], w[14], offset);
      w[28] = hc_bytealign_S (w[12], w[13], offset);
      w[27] = hc_bytealign_S (w[11], w[12], offset);
      w[26] = hc_bytealign_S (w[10], w[11], offset);
      w[25] = hc_bytealign_S (w[ 9], w[10], offset);
      w[24] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[23] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[22] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[21] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[20] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[19] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[18] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[17] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[16] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[15] = hc_bytealign_S (    0, w[ 0], offset);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_bytealign_S (w[46], w[47], offset);
      w[62] = hc_bytealign_S (w[45], w[46], offset);
      w[61] = hc_bytealign_S (w[44], w[45], offset);
      w[60] = hc_bytealign_S (w[43], w[44], offset);
      w[59] = hc_bytealign_S (w[42], w[43], offset);
      w[58] = hc_bytealign_S (w[41], w[42], offset);
      w[57] = hc_bytealign_S (w[40], w[41], offset);
      w[56] = hc_bytealign_S (w[39], w[40], offset);
      w[55] = hc_bytealign_S (w[38], w[39], offset);
      w[54] = hc_bytealign_S (w[37], w[38], offset);
      w[53] = hc_bytealign_S (w[36], w[37], offset);
      w[52] = hc_bytealign_S (w[35], w[36], offset);
      w[51] = hc_bytealign_S (w[34], w[35], offset);
      w[50] = hc_bytealign_S (w[33], w[34], offset);
      w[49] = hc_bytealign_S (w[32], w[33], offset);
      w[48] = hc_bytealign_S (w[31], w[32], offset);
      w[47] = hc_bytealign_S (w[30], w[31], offset);
      w[46] = hc_bytealign_S (w[29], w[30], offset);
      w[45] = hc_bytealign_S (w[28], w[29], offset);
      w[44] = hc_bytealign_S (w[27], w[28], offset);
      w[43] = hc_bytealign_S (w[26], w[27], offset);
      w[42] = hc_bytealign_S (w[25], w[26], offset);
      w[41] = hc_bytealign_S (w[24], w[25], offset);
      w[40] = hc_bytealign_S (w[23], w[24], offset);
      w[39] = hc_bytealign_S (w[22], w[23], offset);
      w[38] = hc_bytealign_S (w[21], w[22], offset);
      w[37] = hc_bytealign_S (w[20], w[21], offset);
      w[36] = hc_bytealign_S (w[19], w[20], offset);
      w[35] = hc_bytealign_S (w[18], w[19], offset);
      w[34] = hc_bytealign_S (w[17], w[18], offset);
      w[33] = hc_bytealign_S (w[16], w[17], offset);
      w[32] = hc_bytealign_S (w[15], w[16], offset);
      w[31] = hc_bytealign_S (w[14], w[15], offset);
      w[30] = hc_bytealign_S (w[13], w[14], offset);
      w[29] = hc_bytealign_S (w[12], w[13], offset);
      w[28] = hc_bytealign_S (w[11], w[12], offset);
      w[27] = hc_bytealign_S (w[10], w[11], offset);
      w[26] = hc_bytealign_S (w[ 9], w[10], offset);
      w[25] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[24] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[23] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[22] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[21] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[20] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[19] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[18] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[17] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[16] = hc_bytealign_S (    0, w[ 0], offset);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_bytealign_S (w[45], w[46], offset);
      w[62] = hc_bytealign_S (w[44], w[45], offset);
      w[61] = hc_bytealign_S (w[43], w[44], offset);
      w[60] = hc_bytealign_S (w[42], w[43], offset);
      w[59] = hc_bytealign_S (w[41], w[42], offset);
      w[58] = hc_bytealign_S (w[40], w[41], offset);
      w[57] = hc_bytealign_S (w[39], w[40], offset);
      w[56] = hc_bytealign_S (w[38], w[39], offset);
      w[55] = hc_bytealign_S (w[37], w[38], offset);
      w[54] = hc_bytealign_S (w[36], w[37], offset);
      w[53] = hc_bytealign_S (w[35], w[36], offset);
      w[52] = hc_bytealign_S (w[34], w[35], offset);
      w[51] = hc_bytealign_S (w[33], w[34], offset);
      w[50] = hc_bytealign_S (w[32], w[33], offset);
      w[49] = hc_bytealign_S (w[31], w[32], offset);
      w[48] = hc_bytealign_S (w[30], w[31], offset);
      w[47] = hc_bytealign_S (w[29], w[30], offset);
      w[46] = hc_bytealign_S (w[28], w[29], offset);
      w[45] = hc_bytealign_S (w[27], w[28], offset);
      w[44] = hc_bytealign_S (w[26], w[27], offset);
      w[43] = hc_bytealign_S (w[25], w[26], offset);
      w[42] = hc_bytealign_S (w[24], w[25], offset);
      w[41] = hc_bytealign_S (w[23], w[24], offset);
      w[40] = hc_bytealign_S (w[22], w[23], offset);
      w[39] = hc_bytealign_S (w[21], w[22], offset);
      w[38] = hc_bytealign_S (w[20], w[21], offset);
      w[37] = hc_bytealign_S (w[19], w[20], offset);
      w[36] = hc_bytealign_S (w[18], w[19], offset);
      w[35] = hc_bytealign_S (w[17], w[18], offset);
      w[34] = hc_bytealign_S (w[16], w[17], offset);
      w[33] = hc_bytealign_S (w[15], w[16], offset);
      w[32] = hc_bytealign_S (w[14], w[15], offset);
      w[31] = hc_bytealign_S (w[13], w[14], offset);
      w[30] = hc_bytealign_S (w[12], w[13], offset);
      w[29] = hc_bytealign_S (w[11], w[12], offset);
      w[28] = hc_bytealign_S (w[10], w[11], offset);
      w[27] = hc_bytealign_S (w[ 9], w[10], offset);
      w[26] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[25] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[24] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[23] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[22] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[21] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[20] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[19] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[18] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[17] = hc_bytealign_S (    0, w[ 0], offset);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_bytealign_S (w[44], w[45], offset);
      w[62] = hc_bytealign_S (w[43], w[44], offset);
      w[61] = hc_bytealign_S (w[42], w[43], offset);
      w[60] = hc_bytealign_S (w[41], w[42], offset);
      w[59] = hc_bytealign_S (w[40], w[41], offset);
      w[58] = hc_bytealign_S (w[39], w[40], offset);
      w[57] = hc_bytealign_S (w[38], w[39], offset);
      w[56] = hc_bytealign_S (w[37], w[38], offset);
      w[55] = hc_bytealign_S (w[36], w[37], offset);
      w[54] = hc_bytealign_S (w[35], w[36], offset);
      w[53] = hc_bytealign_S (w[34], w[35], offset);
      w[52] = hc_bytealign_S (w[33], w[34], offset);
      w[51] = hc_bytealign_S (w[32], w[33], offset);
      w[50] = hc_bytealign_S (w[31], w[32], offset);
      w[49] = hc_bytealign_S (w[30], w[31], offset);
      w[48] = hc_bytealign_S (w[29], w[30], offset);
      w[47] = hc_bytealign_S (w[28], w[29], offset);
      w[46] = hc_bytealign_S (w[27], w[28], offset);
      w[45] = hc_bytealign_S (w[26], w[27], offset);
      w[44] = hc_bytealign_S (w[25], w[26], offset);
      w[43] = hc_bytealign_S (w[24], w[25], offset);
      w[42] = hc_bytealign_S (w[23], w[24], offset);
      w[41] = hc_bytealign_S (w[22], w[23], offset);
      w[40] = hc_bytealign_S (w[21], w[22], offset);
      w[39] = hc_bytealign_S (w[20], w[21], offset);
      w[38] = hc_bytealign_S (w[19], w[20], offset);
      w[37] = hc_bytealign_S (w[18], w[19], offset);
      w[36] = hc_bytealign_S (w[17], w[18], offset);
      w[35] = hc_bytealign_S (w[16], w[17], offset);
      w[34] = hc_bytealign_S (w[15], w[16], offset);
      w[33] = hc_bytealign_S (w[14], w[15], offset);
      w[32] = hc_bytealign_S (w[13], w[14], offset);
      w[31] = hc_bytealign_S (w[12], w[13], offset);
      w[30] = hc_bytealign_S (w[11], w[12], offset);
      w[29] = hc_bytealign_S (w[10], w[11], offset);
      w[28] = hc_bytealign_S (w[ 9], w[10], offset);
      w[27] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[26] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[25] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[24] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[23] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[22] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[21] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[20] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[19] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[18] = hc_bytealign_S (    0, w[ 0], offset);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_bytealign_S (w[43], w[44], offset);
      w[62] = hc_bytealign_S (w[42], w[43], offset);
      w[61] = hc_bytealign_S (w[41], w[42], offset);
      w[60] = hc_bytealign_S (w[40], w[41], offset);
      w[59] = hc_bytealign_S (w[39], w[40], offset);
      w[58] = hc_bytealign_S (w[38], w[39], offset);
      w[57] = hc_bytealign_S (w[37], w[38], offset);
      w[56] = hc_bytealign_S (w[36], w[37], offset);
      w[55] = hc_bytealign_S (w[35], w[36], offset);
      w[54] = hc_bytealign_S (w[34], w[35], offset);
      w[53] = hc_bytealign_S (w[33], w[34], offset);
      w[52] = hc_bytealign_S (w[32], w[33], offset);
      w[51] = hc_bytealign_S (w[31], w[32], offset);
      w[50] = hc_bytealign_S (w[30], w[31], offset);
      w[49] = hc_bytealign_S (w[29], w[30], offset);
      w[48] = hc_bytealign_S (w[28], w[29], offset);
      w[47] = hc_bytealign_S (w[27], w[28], offset);
      w[46] = hc_bytealign_S (w[26], w[27], offset);
      w[45] = hc_bytealign_S (w[25], w[26], offset);
      w[44] = hc_bytealign_S (w[24], w[25], offset);
      w[43] = hc_bytealign_S (w[23], w[24], offset);
      w[42] = hc_bytealign_S (w[22], w[23], offset);
      w[41] = hc_bytealign_S (w[21], w[22], offset);
      w[40] = hc_bytealign_S (w[20], w[21], offset);
      w[39] = hc_bytealign_S (w[19], w[20], offset);
      w[38] = hc_bytealign_S (w[18], w[19], offset);
      w[37] = hc_bytealign_S (w[17], w[18], offset);
      w[36] = hc_bytealign_S (w[16], w[17], offset);
      w[35] = hc_bytealign_S (w[15], w[16], offset);
      w[34] = hc_bytealign_S (w[14], w[15], offset);
      w[33] = hc_bytealign_S (w[13], w[14], offset);
      w[32] = hc_bytealign_S (w[12], w[13], offset);
      w[31] = hc_bytealign_S (w[11], w[12], offset);
      w[30] = hc_bytealign_S (w[10], w[11], offset);
      w[29] = hc_bytealign_S (w[ 9], w[10], offset);
      w[28] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[27] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[26] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[25] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[24] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[23] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[22] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[21] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[20] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[19] = hc_bytealign_S (    0, w[ 0], offset);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_bytealign_S (w[42], w[43], offset);
      w[62] = hc_bytealign_S (w[41], w[42], offset);
      w[61] = hc_bytealign_S (w[40], w[41], offset);
      w[60] = hc_bytealign_S (w[39], w[40], offset);
      w[59] = hc_bytealign_S (w[38], w[39], offset);
      w[58] = hc_bytealign_S (w[37], w[38], offset);
      w[57] = hc_bytealign_S (w[36], w[37], offset);
      w[56] = hc_bytealign_S (w[35], w[36], offset);
      w[55] = hc_bytealign_S (w[34], w[35], offset);
      w[54] = hc_bytealign_S (w[33], w[34], offset);
      w[53] = hc_bytealign_S (w[32], w[33], offset);
      w[52] = hc_bytealign_S (w[31], w[32], offset);
      w[51] = hc_bytealign_S (w[30], w[31], offset);
      w[50] = hc_bytealign_S (w[29], w[30], offset);
      w[49] = hc_bytealign_S (w[28], w[29], offset);
      w[48] = hc_bytealign_S (w[27], w[28], offset);
      w[47] = hc_bytealign_S (w[26], w[27], offset);
      w[46] = hc_bytealign_S (w[25], w[26], offset);
      w[45] = hc_bytealign_S (w[24], w[25], offset);
      w[44] = hc_bytealign_S (w[23], w[24], offset);
      w[43] = hc_bytealign_S (w[22], w[23], offset);
      w[42] = hc_bytealign_S (w[21], w[22], offset);
      w[41] = hc_bytealign_S (w[20], w[21], offset);
      w[40] = hc_bytealign_S (w[19], w[20], offset);
      w[39] = hc_bytealign_S (w[18], w[19], offset);
      w[38] = hc_bytealign_S (w[17], w[18], offset);
      w[37] = hc_bytealign_S (w[16], w[17], offset);
      w[36] = hc_bytealign_S (w[15], w[16], offset);
      w[35] = hc_bytealign_S (w[14], w[15], offset);
      w[34] = hc_bytealign_S (w[13], w[14], offset);
      w[33] = hc_bytealign_S (w[12], w[13], offset);
      w[32] = hc_bytealign_S (w[11], w[12], offset);
      w[31] = hc_bytealign_S (w[10], w[11], offset);
      w[30] = hc_bytealign_S (w[ 9], w[10], offset);
      w[29] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[28] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[27] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[26] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[25] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[24] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[23] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[22] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[21] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[20] = hc_bytealign_S (    0, w[ 0], offset);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_bytealign_S (w[41], w[42], offset);
      w[62] = hc_bytealign_S (w[40], w[41], offset);
      w[61] = hc_bytealign_S (w[39], w[40], offset);
      w[60] = hc_bytealign_S (w[38], w[39], offset);
      w[59] = hc_bytealign_S (w[37], w[38], offset);
      w[58] = hc_bytealign_S (w[36], w[37], offset);
      w[57] = hc_bytealign_S (w[35], w[36], offset);
      w[56] = hc_bytealign_S (w[34], w[35], offset);
      w[55] = hc_bytealign_S (w[33], w[34], offset);
      w[54] = hc_bytealign_S (w[32], w[33], offset);
      w[53] = hc_bytealign_S (w[31], w[32], offset);
      w[52] = hc_bytealign_S (w[30], w[31], offset);
      w[51] = hc_bytealign_S (w[29], w[30], offset);
      w[50] = hc_bytealign_S (w[28], w[29], offset);
      w[49] = hc_bytealign_S (w[27], w[28], offset);
      w[48] = hc_bytealign_S (w[26], w[27], offset);
      w[47] = hc_bytealign_S (w[25], w[26], offset);
      w[46] = hc_bytealign_S (w[24], w[25], offset);
      w[45] = hc_bytealign_S (w[23], w[24], offset);
      w[44] = hc_bytealign_S (w[22], w[23], offset);
      w[43] = hc_bytealign_S (w[21], w[22], offset);
      w[42] = hc_bytealign_S (w[20], w[21], offset);
      w[41] = hc_bytealign_S (w[19], w[20], offset);
      w[40] = hc_bytealign_S (w[18], w[19], offset);
      w[39] = hc_bytealign_S (w[17], w[18], offset);
      w[38] = hc_bytealign_S (w[16], w[17], offset);
      w[37] = hc_bytealign_S (w[15], w[16], offset);
      w[36] = hc_bytealign_S (w[14], w[15], offset);
      w[35] = hc_bytealign_S (w[13], w[14], offset);
      w[34] = hc_bytealign_S (w[12], w[13], offset);
      w[33] = hc_bytealign_S (w[11], w[12], offset);
      w[32] = hc_bytealign_S (w[10], w[11], offset);
      w[31] = hc_bytealign_S (w[ 9], w[10], offset);
      w[30] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[29] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[28] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[27] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[26] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[25] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[24] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[23] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[22] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[21] = hc_bytealign_S (    0, w[ 0], offset);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_bytealign_S (w[40], w[41], offset);
      w[62] = hc_bytealign_S (w[39], w[40], offset);
      w[61] = hc_bytealign_S (w[38], w[39], offset);
      w[60] = hc_bytealign_S (w[37], w[38], offset);
      w[59] = hc_bytealign_S (w[36], w[37], offset);
      w[58] = hc_bytealign_S (w[35], w[36], offset);
      w[57] = hc_bytealign_S (w[34], w[35], offset);
      w[56] = hc_bytealign_S (w[33], w[34], offset);
      w[55] = hc_bytealign_S (w[32], w[33], offset);
      w[54] = hc_bytealign_S (w[31], w[32], offset);
      w[53] = hc_bytealign_S (w[30], w[31], offset);
      w[52] = hc_bytealign_S (w[29], w[30], offset);
      w[51] = hc_bytealign_S (w[28], w[29], offset);
      w[50] = hc_bytealign_S (w[27], w[28], offset);
      w[49] = hc_bytealign_S (w[26], w[27], offset);
      w[48] = hc_bytealign_S (w[25], w[26], offset);
      w[47] = hc_bytealign_S (w[24], w[25], offset);
      w[46] = hc_bytealign_S (w[23], w[24], offset);
      w[45] = hc_bytealign_S (w[22], w[23], offset);
      w[44] = hc_bytealign_S (w[21], w[22], offset);
      w[43] = hc_bytealign_S (w[20], w[21], offset);
      w[42] = hc_bytealign_S (w[19], w[20], offset);
      w[41] = hc_bytealign_S (w[18], w[19], offset);
      w[40] = hc_bytealign_S (w[17], w[18], offset);
      w[39] = hc_bytealign_S (w[16], w[17], offset);
      w[38] = hc_bytealign_S (w[15], w[16], offset);
      w[37] = hc_bytealign_S (w[14], w[15], offset);
      w[36] = hc_bytealign_S (w[13], w[14], offset);
      w[35] = hc_bytealign_S (w[12], w[13], offset);
      w[34] = hc_bytealign_S (w[11], w[12], offset);
      w[33] = hc_bytealign_S (w[10], w[11], offset);
      w[32] = hc_bytealign_S (w[ 9], w[10], offset);
      w[31] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[30] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[29] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[28] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[27] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[26] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[25] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[24] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[23] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[22] = hc_bytealign_S (    0, w[ 0], offset);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_bytealign_S (w[39], w[40], offset);
      w[62] = hc_bytealign_S (w[38], w[39], offset);
      w[61] = hc_bytealign_S (w[37], w[38], offset);
      w[60] = hc_bytealign_S (w[36], w[37], offset);
      w[59] = hc_bytealign_S (w[35], w[36], offset);
      w[58] = hc_bytealign_S (w[34], w[35], offset);
      w[57] = hc_bytealign_S (w[33], w[34], offset);
      w[56] = hc_bytealign_S (w[32], w[33], offset);
      w[55] = hc_bytealign_S (w[31], w[32], offset);
      w[54] = hc_bytealign_S (w[30], w[31], offset);
      w[53] = hc_bytealign_S (w[29], w[30], offset);
      w[52] = hc_bytealign_S (w[28], w[29], offset);
      w[51] = hc_bytealign_S (w[27], w[28], offset);
      w[50] = hc_bytealign_S (w[26], w[27], offset);
      w[49] = hc_bytealign_S (w[25], w[26], offset);
      w[48] = hc_bytealign_S (w[24], w[25], offset);
      w[47] = hc_bytealign_S (w[23], w[24], offset);
      w[46] = hc_bytealign_S (w[22], w[23], offset);
      w[45] = hc_bytealign_S (w[21], w[22], offset);
      w[44] = hc_bytealign_S (w[20], w[21], offset);
      w[43] = hc_bytealign_S (w[19], w[20], offset);
      w[42] = hc_bytealign_S (w[18], w[19], offset);
      w[41] = hc_bytealign_S (w[17], w[18], offset);
      w[40] = hc_bytealign_S (w[16], w[17], offset);
      w[39] = hc_bytealign_S (w[15], w[16], offset);
      w[38] = hc_bytealign_S (w[14], w[15], offset);
      w[37] = hc_bytealign_S (w[13], w[14], offset);
      w[36] = hc_bytealign_S (w[12], w[13], offset);
      w[35] = hc_bytealign_S (w[11], w[12], offset);
      w[34] = hc_bytealign_S (w[10], w[11], offset);
      w[33] = hc_bytealign_S (w[ 9], w[10], offset);
      w[32] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[31] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[30] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[29] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[28] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[27] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[26] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[25] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[24] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[23] = hc_bytealign_S (    0, w[ 0], offset);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_bytealign_S (w[38], w[39], offset);
      w[62] = hc_bytealign_S (w[37], w[38], offset);
      w[61] = hc_bytealign_S (w[36], w[37], offset);
      w[60] = hc_bytealign_S (w[35], w[36], offset);
      w[59] = hc_bytealign_S (w[34], w[35], offset);
      w[58] = hc_bytealign_S (w[33], w[34], offset);
      w[57] = hc_bytealign_S (w[32], w[33], offset);
      w[56] = hc_bytealign_S (w[31], w[32], offset);
      w[55] = hc_bytealign_S (w[30], w[31], offset);
      w[54] = hc_bytealign_S (w[29], w[30], offset);
      w[53] = hc_bytealign_S (w[28], w[29], offset);
      w[52] = hc_bytealign_S (w[27], w[28], offset);
      w[51] = hc_bytealign_S (w[26], w[27], offset);
      w[50] = hc_bytealign_S (w[25], w[26], offset);
      w[49] = hc_bytealign_S (w[24], w[25], offset);
      w[48] = hc_bytealign_S (w[23], w[24], offset);
      w[47] = hc_bytealign_S (w[22], w[23], offset);
      w[46] = hc_bytealign_S (w[21], w[22], offset);
      w[45] = hc_bytealign_S (w[20], w[21], offset);
      w[44] = hc_bytealign_S (w[19], w[20], offset);
      w[43] = hc_bytealign_S (w[18], w[19], offset);
      w[42] = hc_bytealign_S (w[17], w[18], offset);
      w[41] = hc_bytealign_S (w[16], w[17], offset);
      w[40] = hc_bytealign_S (w[15], w[16], offset);
      w[39] = hc_bytealign_S (w[14], w[15], offset);
      w[38] = hc_bytealign_S (w[13], w[14], offset);
      w[37] = hc_bytealign_S (w[12], w[13], offset);
      w[36] = hc_bytealign_S (w[11], w[12], offset);
      w[35] = hc_bytealign_S (w[10], w[11], offset);
      w[34] = hc_bytealign_S (w[ 9], w[10], offset);
      w[33] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[32] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[31] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[30] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[29] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[28] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[27] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[26] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[25] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[24] = hc_bytealign_S (    0, w[ 0], offset);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_bytealign_S (w[37], w[38], offset);
      w[62] = hc_bytealign_S (w[36], w[37], offset);
      w[61] = hc_bytealign_S (w[35], w[36], offset);
      w[60] = hc_bytealign_S (w[34], w[35], offset);
      w[59] = hc_bytealign_S (w[33], w[34], offset);
      w[58] = hc_bytealign_S (w[32], w[33], offset);
      w[57] = hc_bytealign_S (w[31], w[32], offset);
      w[56] = hc_bytealign_S (w[30], w[31], offset);
      w[55] = hc_bytealign_S (w[29], w[30], offset);
      w[54] = hc_bytealign_S (w[28], w[29], offset);
      w[53] = hc_bytealign_S (w[27], w[28], offset);
      w[52] = hc_bytealign_S (w[26], w[27], offset);
      w[51] = hc_bytealign_S (w[25], w[26], offset);
      w[50] = hc_bytealign_S (w[24], w[25], offset);
      w[49] = hc_bytealign_S (w[23], w[24], offset);
      w[48] = hc_bytealign_S (w[22], w[23], offset);
      w[47] = hc_bytealign_S (w[21], w[22], offset);
      w[46] = hc_bytealign_S (w[20], w[21], offset);
      w[45] = hc_bytealign_S (w[19], w[20], offset);
      w[44] = hc_bytealign_S (w[18], w[19], offset);
      w[43] = hc_bytealign_S (w[17], w[18], offset);
      w[42] = hc_bytealign_S (w[16], w[17], offset);
      w[41] = hc_bytealign_S (w[15], w[16], offset);
      w[40] = hc_bytealign_S (w[14], w[15], offset);
      w[39] = hc_bytealign_S (w[13], w[14], offset);
      w[38] = hc_bytealign_S (w[12], w[13], offset);
      w[37] = hc_bytealign_S (w[11], w[12], offset);
      w[36] = hc_bytealign_S (w[10], w[11], offset);
      w[35] = hc_bytealign_S (w[ 9], w[10], offset);
      w[34] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[33] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[32] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[31] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[30] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[29] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[28] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[27] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[26] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[25] = hc_bytealign_S (    0, w[ 0], offset);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_bytealign_S (w[36], w[37], offset);
      w[62] = hc_bytealign_S (w[35], w[36], offset);
      w[61] = hc_bytealign_S (w[34], w[35], offset);
      w[60] = hc_bytealign_S (w[33], w[34], offset);
      w[59] = hc_bytealign_S (w[32], w[33], offset);
      w[58] = hc_bytealign_S (w[31], w[32], offset);
      w[57] = hc_bytealign_S (w[30], w[31], offset);
      w[56] = hc_bytealign_S (w[29], w[30], offset);
      w[55] = hc_bytealign_S (w[28], w[29], offset);
      w[54] = hc_bytealign_S (w[27], w[28], offset);
      w[53] = hc_bytealign_S (w[26], w[27], offset);
      w[52] = hc_bytealign_S (w[25], w[26], offset);
      w[51] = hc_bytealign_S (w[24], w[25], offset);
      w[50] = hc_bytealign_S (w[23], w[24], offset);
      w[49] = hc_bytealign_S (w[22], w[23], offset);
      w[48] = hc_bytealign_S (w[21], w[22], offset);
      w[47] = hc_bytealign_S (w[20], w[21], offset);
      w[46] = hc_bytealign_S (w[19], w[20], offset);
      w[45] = hc_bytealign_S (w[18], w[19], offset);
      w[44] = hc_bytealign_S (w[17], w[18], offset);
      w[43] = hc_bytealign_S (w[16], w[17], offset);
      w[42] = hc_bytealign_S (w[15], w[16], offset);
      w[41] = hc_bytealign_S (w[14], w[15], offset);
      w[40] = hc_bytealign_S (w[13], w[14], offset);
      w[39] = hc_bytealign_S (w[12], w[13], offset);
      w[38] = hc_bytealign_S (w[11], w[12], offset);
      w[37] = hc_bytealign_S (w[10], w[11], offset);
      w[36] = hc_bytealign_S (w[ 9], w[10], offset);
      w[35] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[34] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[33] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[32] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[31] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[30] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[29] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[28] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[27] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[26] = hc_bytealign_S (    0, w[ 0], offset);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_bytealign_S (w[35], w[36], offset);
      w[62] = hc_bytealign_S (w[34], w[35], offset);
      w[61] = hc_bytealign_S (w[33], w[34], offset);
      w[60] = hc_bytealign_S (w[32], w[33], offset);
      w[59] = hc_bytealign_S (w[31], w[32], offset);
      w[58] = hc_bytealign_S (w[30], w[31], offset);
      w[57] = hc_bytealign_S (w[29], w[30], offset);
      w[56] = hc_bytealign_S (w[28], w[29], offset);
      w[55] = hc_bytealign_S (w[27], w[28], offset);
      w[54] = hc_bytealign_S (w[26], w[27], offset);
      w[53] = hc_bytealign_S (w[25], w[26], offset);
      w[52] = hc_bytealign_S (w[24], w[25], offset);
      w[51] = hc_bytealign_S (w[23], w[24], offset);
      w[50] = hc_bytealign_S (w[22], w[23], offset);
      w[49] = hc_bytealign_S (w[21], w[22], offset);
      w[48] = hc_bytealign_S (w[20], w[21], offset);
      w[47] = hc_bytealign_S (w[19], w[20], offset);
      w[46] = hc_bytealign_S (w[18], w[19], offset);
      w[45] = hc_bytealign_S (w[17], w[18], offset);
      w[44] = hc_bytealign_S (w[16], w[17], offset);
      w[43] = hc_bytealign_S (w[15], w[16], offset);
      w[42] = hc_bytealign_S (w[14], w[15], offset);
      w[41] = hc_bytealign_S (w[13], w[14], offset);
      w[40] = hc_bytealign_S (w[12], w[13], offset);
      w[39] = hc_bytealign_S (w[11], w[12], offset);
      w[38] = hc_bytealign_S (w[10], w[11], offset);
      w[37] = hc_bytealign_S (w[ 9], w[10], offset);
      w[36] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[35] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[34] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[33] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[32] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[31] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[30] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[29] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[28] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[27] = hc_bytealign_S (    0, w[ 0], offset);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_bytealign_S (w[34], w[35], offset);
      w[62] = hc_bytealign_S (w[33], w[34], offset);
      w[61] = hc_bytealign_S (w[32], w[33], offset);
      w[60] = hc_bytealign_S (w[31], w[32], offset);
      w[59] = hc_bytealign_S (w[30], w[31], offset);
      w[58] = hc_bytealign_S (w[29], w[30], offset);
      w[57] = hc_bytealign_S (w[28], w[29], offset);
      w[56] = hc_bytealign_S (w[27], w[28], offset);
      w[55] = hc_bytealign_S (w[26], w[27], offset);
      w[54] = hc_bytealign_S (w[25], w[26], offset);
      w[53] = hc_bytealign_S (w[24], w[25], offset);
      w[52] = hc_bytealign_S (w[23], w[24], offset);
      w[51] = hc_bytealign_S (w[22], w[23], offset);
      w[50] = hc_bytealign_S (w[21], w[22], offset);
      w[49] = hc_bytealign_S (w[20], w[21], offset);
      w[48] = hc_bytealign_S (w[19], w[20], offset);
      w[47] = hc_bytealign_S (w[18], w[19], offset);
      w[46] = hc_bytealign_S (w[17], w[18], offset);
      w[45] = hc_bytealign_S (w[16], w[17], offset);
      w[44] = hc_bytealign_S (w[15], w[16], offset);
      w[43] = hc_bytealign_S (w[14], w[15], offset);
      w[42] = hc_bytealign_S (w[13], w[14], offset);
      w[41] = hc_bytealign_S (w[12], w[13], offset);
      w[40] = hc_bytealign_S (w[11], w[12], offset);
      w[39] = hc_bytealign_S (w[10], w[11], offset);
      w[38] = hc_bytealign_S (w[ 9], w[10], offset);
      w[37] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[36] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[35] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[34] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[33] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[32] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[31] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[30] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[29] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[28] = hc_bytealign_S (    0, w[ 0], offset);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_bytealign_S (w[33], w[34], offset);
      w[62] = hc_bytealign_S (w[32], w[33], offset);
      w[61] = hc_bytealign_S (w[31], w[32], offset);
      w[60] = hc_bytealign_S (w[30], w[31], offset);
      w[59] = hc_bytealign_S (w[29], w[30], offset);
      w[58] = hc_bytealign_S (w[28], w[29], offset);
      w[57] = hc_bytealign_S (w[27], w[28], offset);
      w[56] = hc_bytealign_S (w[26], w[27], offset);
      w[55] = hc_bytealign_S (w[25], w[26], offset);
      w[54] = hc_bytealign_S (w[24], w[25], offset);
      w[53] = hc_bytealign_S (w[23], w[24], offset);
      w[52] = hc_bytealign_S (w[22], w[23], offset);
      w[51] = hc_bytealign_S (w[21], w[22], offset);
      w[50] = hc_bytealign_S (w[20], w[21], offset);
      w[49] = hc_bytealign_S (w[19], w[20], offset);
      w[48] = hc_bytealign_S (w[18], w[19], offset);
      w[47] = hc_bytealign_S (w[17], w[18], offset);
      w[46] = hc_bytealign_S (w[16], w[17], offset);
      w[45] = hc_bytealign_S (w[15], w[16], offset);
      w[44] = hc_bytealign_S (w[14], w[15], offset);
      w[43] = hc_bytealign_S (w[13], w[14], offset);
      w[42] = hc_bytealign_S (w[12], w[13], offset);
      w[41] = hc_bytealign_S (w[11], w[12], offset);
      w[40] = hc_bytealign_S (w[10], w[11], offset);
      w[39] = hc_bytealign_S (w[ 9], w[10], offset);
      w[38] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[37] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[36] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[35] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[34] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[33] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[32] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[31] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[30] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[29] = hc_bytealign_S (    0, w[ 0], offset);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_bytealign_S (w[32], w[33], offset);
      w[62] = hc_bytealign_S (w[31], w[32], offset);
      w[61] = hc_bytealign_S (w[30], w[31], offset);
      w[60] = hc_bytealign_S (w[29], w[30], offset);
      w[59] = hc_bytealign_S (w[28], w[29], offset);
      w[58] = hc_bytealign_S (w[27], w[28], offset);
      w[57] = hc_bytealign_S (w[26], w[27], offset);
      w[56] = hc_bytealign_S (w[25], w[26], offset);
      w[55] = hc_bytealign_S (w[24], w[25], offset);
      w[54] = hc_bytealign_S (w[23], w[24], offset);
      w[53] = hc_bytealign_S (w[22], w[23], offset);
      w[52] = hc_bytealign_S (w[21], w[22], offset);
      w[51] = hc_bytealign_S (w[20], w[21], offset);
      w[50] = hc_bytealign_S (w[19], w[20], offset);
      w[49] = hc_bytealign_S (w[18], w[19], offset);
      w[48] = hc_bytealign_S (w[17], w[18], offset);
      w[47] = hc_bytealign_S (w[16], w[17], offset);
      w[46] = hc_bytealign_S (w[15], w[16], offset);
      w[45] = hc_bytealign_S (w[14], w[15], offset);
      w[44] = hc_bytealign_S (w[13], w[14], offset);
      w[43] = hc_bytealign_S (w[12], w[13], offset);
      w[42] = hc_bytealign_S (w[11], w[12], offset);
      w[41] = hc_bytealign_S (w[10], w[11], offset);
      w[40] = hc_bytealign_S (w[ 9], w[10], offset);
      w[39] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[38] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[37] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[36] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[35] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[34] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[33] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[32] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[31] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[30] = hc_bytealign_S (    0, w[ 0], offset);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_bytealign_S (w[31], w[32], offset);
      w[62] = hc_bytealign_S (w[30], w[31], offset);
      w[61] = hc_bytealign_S (w[29], w[30], offset);
      w[60] = hc_bytealign_S (w[28], w[29], offset);
      w[59] = hc_bytealign_S (w[27], w[28], offset);
      w[58] = hc_bytealign_S (w[26], w[27], offset);
      w[57] = hc_bytealign_S (w[25], w[26], offset);
      w[56] = hc_bytealign_S (w[24], w[25], offset);
      w[55] = hc_bytealign_S (w[23], w[24], offset);
      w[54] = hc_bytealign_S (w[22], w[23], offset);
      w[53] = hc_bytealign_S (w[21], w[22], offset);
      w[52] = hc_bytealign_S (w[20], w[21], offset);
      w[51] = hc_bytealign_S (w[19], w[20], offset);
      w[50] = hc_bytealign_S (w[18], w[19], offset);
      w[49] = hc_bytealign_S (w[17], w[18], offset);
      w[48] = hc_bytealign_S (w[16], w[17], offset);
      w[47] = hc_bytealign_S (w[15], w[16], offset);
      w[46] = hc_bytealign_S (w[14], w[15], offset);
      w[45] = hc_bytealign_S (w[13], w[14], offset);
      w[44] = hc_bytealign_S (w[12], w[13], offset);
      w[43] = hc_bytealign_S (w[11], w[12], offset);
      w[42] = hc_bytealign_S (w[10], w[11], offset);
      w[41] = hc_bytealign_S (w[ 9], w[10], offset);
      w[40] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[39] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[38] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[37] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[36] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[35] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[34] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[33] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[32] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[31] = hc_bytealign_S (    0, w[ 0], offset);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_bytealign_S (w[30], w[31], offset);
      w[62] = hc_bytealign_S (w[29], w[30], offset);
      w[61] = hc_bytealign_S (w[28], w[29], offset);
      w[60] = hc_bytealign_S (w[27], w[28], offset);
      w[59] = hc_bytealign_S (w[26], w[27], offset);
      w[58] = hc_bytealign_S (w[25], w[26], offset);
      w[57] = hc_bytealign_S (w[24], w[25], offset);
      w[56] = hc_bytealign_S (w[23], w[24], offset);
      w[55] = hc_bytealign_S (w[22], w[23], offset);
      w[54] = hc_bytealign_S (w[21], w[22], offset);
      w[53] = hc_bytealign_S (w[20], w[21], offset);
      w[52] = hc_bytealign_S (w[19], w[20], offset);
      w[51] = hc_bytealign_S (w[18], w[19], offset);
      w[50] = hc_bytealign_S (w[17], w[18], offset);
      w[49] = hc_bytealign_S (w[16], w[17], offset);
      w[48] = hc_bytealign_S (w[15], w[16], offset);
      w[47] = hc_bytealign_S (w[14], w[15], offset);
      w[46] = hc_bytealign_S (w[13], w[14], offset);
      w[45] = hc_bytealign_S (w[12], w[13], offset);
      w[44] = hc_bytealign_S (w[11], w[12], offset);
      w[43] = hc_bytealign_S (w[10], w[11], offset);
      w[42] = hc_bytealign_S (w[ 9], w[10], offset);
      w[41] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[40] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[39] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[38] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[37] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[36] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[35] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[34] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[33] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[32] = hc_bytealign_S (    0, w[ 0], offset);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_bytealign_S (w[29], w[30], offset);
      w[62] = hc_bytealign_S (w[28], w[29], offset);
      w[61] = hc_bytealign_S (w[27], w[28], offset);
      w[60] = hc_bytealign_S (w[26], w[27], offset);
      w[59] = hc_bytealign_S (w[25], w[26], offset);
      w[58] = hc_bytealign_S (w[24], w[25], offset);
      w[57] = hc_bytealign_S (w[23], w[24], offset);
      w[56] = hc_bytealign_S (w[22], w[23], offset);
      w[55] = hc_bytealign_S (w[21], w[22], offset);
      w[54] = hc_bytealign_S (w[20], w[21], offset);
      w[53] = hc_bytealign_S (w[19], w[20], offset);
      w[52] = hc_bytealign_S (w[18], w[19], offset);
      w[51] = hc_bytealign_S (w[17], w[18], offset);
      w[50] = hc_bytealign_S (w[16], w[17], offset);
      w[49] = hc_bytealign_S (w[15], w[16], offset);
      w[48] = hc_bytealign_S (w[14], w[15], offset);
      w[47] = hc_bytealign_S (w[13], w[14], offset);
      w[46] = hc_bytealign_S (w[12], w[13], offset);
      w[45] = hc_bytealign_S (w[11], w[12], offset);
      w[44] = hc_bytealign_S (w[10], w[11], offset);
      w[43] = hc_bytealign_S (w[ 9], w[10], offset);
      w[42] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[41] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[40] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[39] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[38] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[37] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[36] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[35] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[34] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[33] = hc_bytealign_S (    0, w[ 0], offset);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_bytealign_S (w[28], w[29], offset);
      w[62] = hc_bytealign_S (w[27], w[28], offset);
      w[61] = hc_bytealign_S (w[26], w[27], offset);
      w[60] = hc_bytealign_S (w[25], w[26], offset);
      w[59] = hc_bytealign_S (w[24], w[25], offset);
      w[58] = hc_bytealign_S (w[23], w[24], offset);
      w[57] = hc_bytealign_S (w[22], w[23], offset);
      w[56] = hc_bytealign_S (w[21], w[22], offset);
      w[55] = hc_bytealign_S (w[20], w[21], offset);
      w[54] = hc_bytealign_S (w[19], w[20], offset);
      w[53] = hc_bytealign_S (w[18], w[19], offset);
      w[52] = hc_bytealign_S (w[17], w[18], offset);
      w[51] = hc_bytealign_S (w[16], w[17], offset);
      w[50] = hc_bytealign_S (w[15], w[16], offset);
      w[49] = hc_bytealign_S (w[14], w[15], offset);
      w[48] = hc_bytealign_S (w[13], w[14], offset);
      w[47] = hc_bytealign_S (w[12], w[13], offset);
      w[46] = hc_bytealign_S (w[11], w[12], offset);
      w[45] = hc_bytealign_S (w[10], w[11], offset);
      w[44] = hc_bytealign_S (w[ 9], w[10], offset);
      w[43] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[42] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[41] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[40] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[39] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[38] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[37] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[36] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[35] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[34] = hc_bytealign_S (    0, w[ 0], offset);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_bytealign_S (w[27], w[28], offset);
      w[62] = hc_bytealign_S (w[26], w[27], offset);
      w[61] = hc_bytealign_S (w[25], w[26], offset);
      w[60] = hc_bytealign_S (w[24], w[25], offset);
      w[59] = hc_bytealign_S (w[23], w[24], offset);
      w[58] = hc_bytealign_S (w[22], w[23], offset);
      w[57] = hc_bytealign_S (w[21], w[22], offset);
      w[56] = hc_bytealign_S (w[20], w[21], offset);
      w[55] = hc_bytealign_S (w[19], w[20], offset);
      w[54] = hc_bytealign_S (w[18], w[19], offset);
      w[53] = hc_bytealign_S (w[17], w[18], offset);
      w[52] = hc_bytealign_S (w[16], w[17], offset);
      w[51] = hc_bytealign_S (w[15], w[16], offset);
      w[50] = hc_bytealign_S (w[14], w[15], offset);
      w[49] = hc_bytealign_S (w[13], w[14], offset);
      w[48] = hc_bytealign_S (w[12], w[13], offset);
      w[47] = hc_bytealign_S (w[11], w[12], offset);
      w[46] = hc_bytealign_S (w[10], w[11], offset);
      w[45] = hc_bytealign_S (w[ 9], w[10], offset);
      w[44] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[43] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[42] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[41] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[40] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[39] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[38] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[37] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[36] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[35] = hc_bytealign_S (    0, w[ 0], offset);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_bytealign_S (w[26], w[27], offset);
      w[62] = hc_bytealign_S (w[25], w[26], offset);
      w[61] = hc_bytealign_S (w[24], w[25], offset);
      w[60] = hc_bytealign_S (w[23], w[24], offset);
      w[59] = hc_bytealign_S (w[22], w[23], offset);
      w[58] = hc_bytealign_S (w[21], w[22], offset);
      w[57] = hc_bytealign_S (w[20], w[21], offset);
      w[56] = hc_bytealign_S (w[19], w[20], offset);
      w[55] = hc_bytealign_S (w[18], w[19], offset);
      w[54] = hc_bytealign_S (w[17], w[18], offset);
      w[53] = hc_bytealign_S (w[16], w[17], offset);
      w[52] = hc_bytealign_S (w[15], w[16], offset);
      w[51] = hc_bytealign_S (w[14], w[15], offset);
      w[50] = hc_bytealign_S (w[13], w[14], offset);
      w[49] = hc_bytealign_S (w[12], w[13], offset);
      w[48] = hc_bytealign_S (w[11], w[12], offset);
      w[47] = hc_bytealign_S (w[10], w[11], offset);
      w[46] = hc_bytealign_S (w[ 9], w[10], offset);
      w[45] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[44] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[43] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[42] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[41] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[40] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[39] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[38] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[37] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[36] = hc_bytealign_S (    0, w[ 0], offset);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_bytealign_S (w[25], w[26], offset);
      w[62] = hc_bytealign_S (w[24], w[25], offset);
      w[61] = hc_bytealign_S (w[23], w[24], offset);
      w[60] = hc_bytealign_S (w[22], w[23], offset);
      w[59] = hc_bytealign_S (w[21], w[22], offset);
      w[58] = hc_bytealign_S (w[20], w[21], offset);
      w[57] = hc_bytealign_S (w[19], w[20], offset);
      w[56] = hc_bytealign_S (w[18], w[19], offset);
      w[55] = hc_bytealign_S (w[17], w[18], offset);
      w[54] = hc_bytealign_S (w[16], w[17], offset);
      w[53] = hc_bytealign_S (w[15], w[16], offset);
      w[52] = hc_bytealign_S (w[14], w[15], offset);
      w[51] = hc_bytealign_S (w[13], w[14], offset);
      w[50] = hc_bytealign_S (w[12], w[13], offset);
      w[49] = hc_bytealign_S (w[11], w[12], offset);
      w[48] = hc_bytealign_S (w[10], w[11], offset);
      w[47] = hc_bytealign_S (w[ 9], w[10], offset);
      w[46] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[45] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[44] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[43] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[42] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[41] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[40] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[39] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[38] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[37] = hc_bytealign_S (    0, w[ 0], offset);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_bytealign_S (w[24], w[25], offset);
      w[62] = hc_bytealign_S (w[23], w[24], offset);
      w[61] = hc_bytealign_S (w[22], w[23], offset);
      w[60] = hc_bytealign_S (w[21], w[22], offset);
      w[59] = hc_bytealign_S (w[20], w[21], offset);
      w[58] = hc_bytealign_S (w[19], w[20], offset);
      w[57] = hc_bytealign_S (w[18], w[19], offset);
      w[56] = hc_bytealign_S (w[17], w[18], offset);
      w[55] = hc_bytealign_S (w[16], w[17], offset);
      w[54] = hc_bytealign_S (w[15], w[16], offset);
      w[53] = hc_bytealign_S (w[14], w[15], offset);
      w[52] = hc_bytealign_S (w[13], w[14], offset);
      w[51] = hc_bytealign_S (w[12], w[13], offset);
      w[50] = hc_bytealign_S (w[11], w[12], offset);
      w[49] = hc_bytealign_S (w[10], w[11], offset);
      w[48] = hc_bytealign_S (w[ 9], w[10], offset);
      w[47] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[46] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[45] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[44] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[43] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[42] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[41] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[40] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[39] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[38] = hc_bytealign_S (    0, w[ 0], offset);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_bytealign_S (w[23], w[24], offset);
      w[62] = hc_bytealign_S (w[22], w[23], offset);
      w[61] = hc_bytealign_S (w[21], w[22], offset);
      w[60] = hc_bytealign_S (w[20], w[21], offset);
      w[59] = hc_bytealign_S (w[19], w[20], offset);
      w[58] = hc_bytealign_S (w[18], w[19], offset);
      w[57] = hc_bytealign_S (w[17], w[18], offset);
      w[56] = hc_bytealign_S (w[16], w[17], offset);
      w[55] = hc_bytealign_S (w[15], w[16], offset);
      w[54] = hc_bytealign_S (w[14], w[15], offset);
      w[53] = hc_bytealign_S (w[13], w[14], offset);
      w[52] = hc_bytealign_S (w[12], w[13], offset);
      w[51] = hc_bytealign_S (w[11], w[12], offset);
      w[50] = hc_bytealign_S (w[10], w[11], offset);
      w[49] = hc_bytealign_S (w[ 9], w[10], offset);
      w[48] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[47] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[46] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[45] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[44] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[43] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[42] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[41] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[40] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[39] = hc_bytealign_S (    0, w[ 0], offset);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_bytealign_S (w[22], w[23], offset);
      w[62] = hc_bytealign_S (w[21], w[22], offset);
      w[61] = hc_bytealign_S (w[20], w[21], offset);
      w[60] = hc_bytealign_S (w[19], w[20], offset);
      w[59] = hc_bytealign_S (w[18], w[19], offset);
      w[58] = hc_bytealign_S (w[17], w[18], offset);
      w[57] = hc_bytealign_S (w[16], w[17], offset);
      w[56] = hc_bytealign_S (w[15], w[16], offset);
      w[55] = hc_bytealign_S (w[14], w[15], offset);
      w[54] = hc_bytealign_S (w[13], w[14], offset);
      w[53] = hc_bytealign_S (w[12], w[13], offset);
      w[52] = hc_bytealign_S (w[11], w[12], offset);
      w[51] = hc_bytealign_S (w[10], w[11], offset);
      w[50] = hc_bytealign_S (w[ 9], w[10], offset);
      w[49] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[48] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[47] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[46] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[45] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[44] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[43] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[42] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[41] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[40] = hc_bytealign_S (    0, w[ 0], offset);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_bytealign_S (w[21], w[22], offset);
      w[62] = hc_bytealign_S (w[20], w[21], offset);
      w[61] = hc_bytealign_S (w[19], w[20], offset);
      w[60] = hc_bytealign_S (w[18], w[19], offset);
      w[59] = hc_bytealign_S (w[17], w[18], offset);
      w[58] = hc_bytealign_S (w[16], w[17], offset);
      w[57] = hc_bytealign_S (w[15], w[16], offset);
      w[56] = hc_bytealign_S (w[14], w[15], offset);
      w[55] = hc_bytealign_S (w[13], w[14], offset);
      w[54] = hc_bytealign_S (w[12], w[13], offset);
      w[53] = hc_bytealign_S (w[11], w[12], offset);
      w[52] = hc_bytealign_S (w[10], w[11], offset);
      w[51] = hc_bytealign_S (w[ 9], w[10], offset);
      w[50] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[49] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[48] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[47] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[46] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[45] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[44] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[43] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[42] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[41] = hc_bytealign_S (    0, w[ 0], offset);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_bytealign_S (w[20], w[21], offset);
      w[62] = hc_bytealign_S (w[19], w[20], offset);
      w[61] = hc_bytealign_S (w[18], w[19], offset);
      w[60] = hc_bytealign_S (w[17], w[18], offset);
      w[59] = hc_bytealign_S (w[16], w[17], offset);
      w[58] = hc_bytealign_S (w[15], w[16], offset);
      w[57] = hc_bytealign_S (w[14], w[15], offset);
      w[56] = hc_bytealign_S (w[13], w[14], offset);
      w[55] = hc_bytealign_S (w[12], w[13], offset);
      w[54] = hc_bytealign_S (w[11], w[12], offset);
      w[53] = hc_bytealign_S (w[10], w[11], offset);
      w[52] = hc_bytealign_S (w[ 9], w[10], offset);
      w[51] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[50] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[49] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[48] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[47] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[46] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[45] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[44] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[43] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[42] = hc_bytealign_S (    0, w[ 0], offset);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_bytealign_S (w[19], w[20], offset);
      w[62] = hc_bytealign_S (w[18], w[19], offset);
      w[61] = hc_bytealign_S (w[17], w[18], offset);
      w[60] = hc_bytealign_S (w[16], w[17], offset);
      w[59] = hc_bytealign_S (w[15], w[16], offset);
      w[58] = hc_bytealign_S (w[14], w[15], offset);
      w[57] = hc_bytealign_S (w[13], w[14], offset);
      w[56] = hc_bytealign_S (w[12], w[13], offset);
      w[55] = hc_bytealign_S (w[11], w[12], offset);
      w[54] = hc_bytealign_S (w[10], w[11], offset);
      w[53] = hc_bytealign_S (w[ 9], w[10], offset);
      w[52] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[51] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[50] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[49] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[48] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[47] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[46] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[45] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[44] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[43] = hc_bytealign_S (    0, w[ 0], offset);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_bytealign_S (w[18], w[19], offset);
      w[62] = hc_bytealign_S (w[17], w[18], offset);
      w[61] = hc_bytealign_S (w[16], w[17], offset);
      w[60] = hc_bytealign_S (w[15], w[16], offset);
      w[59] = hc_bytealign_S (w[14], w[15], offset);
      w[58] = hc_bytealign_S (w[13], w[14], offset);
      w[57] = hc_bytealign_S (w[12], w[13], offset);
      w[56] = hc_bytealign_S (w[11], w[12], offset);
      w[55] = hc_bytealign_S (w[10], w[11], offset);
      w[54] = hc_bytealign_S (w[ 9], w[10], offset);
      w[53] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[52] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[51] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[50] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[49] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[48] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[47] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[46] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[45] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[44] = hc_bytealign_S (    0, w[ 0], offset);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_bytealign_S (w[17], w[18], offset);
      w[62] = hc_bytealign_S (w[16], w[17], offset);
      w[61] = hc_bytealign_S (w[15], w[16], offset);
      w[60] = hc_bytealign_S (w[14], w[15], offset);
      w[59] = hc_bytealign_S (w[13], w[14], offset);
      w[58] = hc_bytealign_S (w[12], w[13], offset);
      w[57] = hc_bytealign_S (w[11], w[12], offset);
      w[56] = hc_bytealign_S (w[10], w[11], offset);
      w[55] = hc_bytealign_S (w[ 9], w[10], offset);
      w[54] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[53] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[52] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[51] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[50] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[49] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[48] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[47] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[46] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[45] = hc_bytealign_S (    0, w[ 0], offset);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_bytealign_S (w[16], w[17], offset);
      w[62] = hc_bytealign_S (w[15], w[16], offset);
      w[61] = hc_bytealign_S (w[14], w[15], offset);
      w[60] = hc_bytealign_S (w[13], w[14], offset);
      w[59] = hc_bytealign_S (w[12], w[13], offset);
      w[58] = hc_bytealign_S (w[11], w[12], offset);
      w[57] = hc_bytealign_S (w[10], w[11], offset);
      w[56] = hc_bytealign_S (w[ 9], w[10], offset);
      w[55] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[54] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[53] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[52] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[51] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[50] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[49] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[48] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[47] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[46] = hc_bytealign_S (    0, w[ 0], offset);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_bytealign_S (w[15], w[16], offset);
      w[62] = hc_bytealign_S (w[14], w[15], offset);
      w[61] = hc_bytealign_S (w[13], w[14], offset);
      w[60] = hc_bytealign_S (w[12], w[13], offset);
      w[59] = hc_bytealign_S (w[11], w[12], offset);
      w[58] = hc_bytealign_S (w[10], w[11], offset);
      w[57] = hc_bytealign_S (w[ 9], w[10], offset);
      w[56] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[55] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[54] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[53] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[52] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[51] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[50] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[49] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[48] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[47] = hc_bytealign_S (    0, w[ 0], offset);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_bytealign_S (w[14], w[15], offset);
      w[62] = hc_bytealign_S (w[13], w[14], offset);
      w[61] = hc_bytealign_S (w[12], w[13], offset);
      w[60] = hc_bytealign_S (w[11], w[12], offset);
      w[59] = hc_bytealign_S (w[10], w[11], offset);
      w[58] = hc_bytealign_S (w[ 9], w[10], offset);
      w[57] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[56] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[55] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[54] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[53] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[52] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[51] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[50] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[49] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[48] = hc_bytealign_S (    0, w[ 0], offset);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_bytealign_S (w[13], w[14], offset);
      w[62] = hc_bytealign_S (w[12], w[13], offset);
      w[61] = hc_bytealign_S (w[11], w[12], offset);
      w[60] = hc_bytealign_S (w[10], w[11], offset);
      w[59] = hc_bytealign_S (w[ 9], w[10], offset);
      w[58] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[57] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[56] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[55] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[54] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[53] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[52] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[51] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[50] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[49] = hc_bytealign_S (    0, w[ 0], offset);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_bytealign_S (w[12], w[13], offset);
      w[62] = hc_bytealign_S (w[11], w[12], offset);
      w[61] = hc_bytealign_S (w[10], w[11], offset);
      w[60] = hc_bytealign_S (w[ 9], w[10], offset);
      w[59] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[58] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[57] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[56] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[55] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[54] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[53] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[52] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[51] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[50] = hc_bytealign_S (    0, w[ 0], offset);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_bytealign_S (w[11], w[12], offset);
      w[62] = hc_bytealign_S (w[10], w[11], offset);
      w[61] = hc_bytealign_S (w[ 9], w[10], offset);
      w[60] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[59] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[58] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[57] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[56] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[55] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[54] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[53] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[52] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[51] = hc_bytealign_S (    0, w[ 0], offset);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_bytealign_S (w[10], w[11], offset);
      w[62] = hc_bytealign_S (w[ 9], w[10], offset);
      w[61] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[60] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[59] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[58] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[57] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[56] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[55] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[54] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[53] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[52] = hc_bytealign_S (    0, w[ 0], offset);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_bytealign_S (w[ 9], w[10], offset);
      w[62] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[61] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[60] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[59] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[58] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[57] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[56] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[55] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[54] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[53] = hc_bytealign_S (    0, w[ 0], offset);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[62] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[61] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[60] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[59] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[58] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[57] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[56] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[55] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[54] = hc_bytealign_S (    0, w[ 0], offset);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[62] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[61] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[60] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[59] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[58] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[57] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[56] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[55] = hc_bytealign_S (    0, w[ 0], offset);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[62] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[61] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[60] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[59] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[58] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[57] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[56] = hc_bytealign_S (    0, w[ 0], offset);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[62] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[61] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[60] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[59] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[58] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[57] = hc_bytealign_S (    0, w[ 0], offset);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[62] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[61] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[60] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[59] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[58] = hc_bytealign_S (    0, w[ 0], offset);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[62] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[61] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[60] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[59] = hc_bytealign_S (    0, w[ 0], offset);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[62] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[61] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[60] = hc_bytealign_S (    0, w[ 0], offset);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[62] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[61] = hc_bytealign_S (    0, w[ 0], offset);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[62] = hc_bytealign_S (    0, w[ 0], offset);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_bytealign_S (    0, w[ 0], offset);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }

  #pragma unroll
  for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]);

  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w[63] = hc_byte_perm_S (w[62], w[63], selector);
      w[62] = hc_byte_perm_S (w[61], w[62], selector);
      w[61] = hc_byte_perm_S (w[60], w[61], selector);
      w[60] = hc_byte_perm_S (w[59], w[60], selector);
      w[59] = hc_byte_perm_S (w[58], w[59], selector);
      w[58] = hc_byte_perm_S (w[57], w[58], selector);
      w[57] = hc_byte_perm_S (w[56], w[57], selector);
      w[56] = hc_byte_perm_S (w[55], w[56], selector);
      w[55] = hc_byte_perm_S (w[54], w[55], selector);
      w[54] = hc_byte_perm_S (w[53], w[54], selector);
      w[53] = hc_byte_perm_S (w[52], w[53], selector);
      w[52] = hc_byte_perm_S (w[51], w[52], selector);
      w[51] = hc_byte_perm_S (w[50], w[51], selector);
      w[50] = hc_byte_perm_S (w[49], w[50], selector);
      w[49] = hc_byte_perm_S (w[48], w[49], selector);
      w[48] = hc_byte_perm_S (w[47], w[48], selector);
      w[47] = hc_byte_perm_S (w[46], w[47], selector);
      w[46] = hc_byte_perm_S (w[45], w[46], selector);
      w[45] = hc_byte_perm_S (w[44], w[45], selector);
      w[44] = hc_byte_perm_S (w[43], w[44], selector);
      w[43] = hc_byte_perm_S (w[42], w[43], selector);
      w[42] = hc_byte_perm_S (w[41], w[42], selector);
      w[41] = hc_byte_perm_S (w[40], w[41], selector);
      w[40] = hc_byte_perm_S (w[39], w[40], selector);
      w[39] = hc_byte_perm_S (w[38], w[39], selector);
      w[38] = hc_byte_perm_S (w[37], w[38], selector);
      w[37] = hc_byte_perm_S (w[36], w[37], selector);
      w[36] = hc_byte_perm_S (w[35], w[36], selector);
      w[35] = hc_byte_perm_S (w[34], w[35], selector);
      w[34] = hc_byte_perm_S (w[33], w[34], selector);
      w[33] = hc_byte_perm_S (w[32], w[33], selector);
      w[32] = hc_byte_perm_S (w[31], w[32], selector);
      w[31] = hc_byte_perm_S (w[30], w[31], selector);
      w[30] = hc_byte_perm_S (w[29], w[30], selector);
      w[29] = hc_byte_perm_S (w[28], w[29], selector);
      w[28] = hc_byte_perm_S (w[27], w[28], selector);
      w[27] = hc_byte_perm_S (w[26], w[27], selector);
      w[26] = hc_byte_perm_S (w[25], w[26], selector);
      w[25] = hc_byte_perm_S (w[24], w[25], selector);
      w[24] = hc_byte_perm_S (w[23], w[24], selector);
      w[23] = hc_byte_perm_S (w[22], w[23], selector);
      w[22] = hc_byte_perm_S (w[21], w[22], selector);
      w[21] = hc_byte_perm_S (w[20], w[21], selector);
      w[20] = hc_byte_perm_S (w[19], w[20], selector);
      w[19] = hc_byte_perm_S (w[18], w[19], selector);
      w[18] = hc_byte_perm_S (w[17], w[18], selector);
      w[17] = hc_byte_perm_S (w[16], w[17], selector);
      w[16] = hc_byte_perm_S (w[15], w[16], selector);
      w[15] = hc_byte_perm_S (w[14], w[15], selector);
      w[14] = hc_byte_perm_S (w[13], w[14], selector);
      w[13] = hc_byte_perm_S (w[12], w[13], selector);
      w[12] = hc_byte_perm_S (w[11], w[12], selector);
      w[11] = hc_byte_perm_S (w[10], w[11], selector);
      w[10] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[ 9] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[ 8] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[ 7] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[ 6] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[ 5] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[ 4] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[ 3] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 2] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 1] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 0] = hc_byte_perm_S (    0, w[ 0], selector);

      break;

    case  1:
      w[63] = hc_byte_perm_S (w[61], w[62], selector);
      w[62] = hc_byte_perm_S (w[60], w[61], selector);
      w[61] = hc_byte_perm_S (w[59], w[60], selector);
      w[60] = hc_byte_perm_S (w[58], w[59], selector);
      w[59] = hc_byte_perm_S (w[57], w[58], selector);
      w[58] = hc_byte_perm_S (w[56], w[57], selector);
      w[57] = hc_byte_perm_S (w[55], w[56], selector);
      w[56] = hc_byte_perm_S (w[54], w[55], selector);
      w[55] = hc_byte_perm_S (w[53], w[54], selector);
      w[54] = hc_byte_perm_S (w[52], w[53], selector);
      w[53] = hc_byte_perm_S (w[51], w[52], selector);
      w[52] = hc_byte_perm_S (w[50], w[51], selector);
      w[51] = hc_byte_perm_S (w[49], w[50], selector);
      w[50] = hc_byte_perm_S (w[48], w[49], selector);
      w[49] = hc_byte_perm_S (w[47], w[48], selector);
      w[48] = hc_byte_perm_S (w[46], w[47], selector);
      w[47] = hc_byte_perm_S (w[45], w[46], selector);
      w[46] = hc_byte_perm_S (w[44], w[45], selector);
      w[45] = hc_byte_perm_S (w[43], w[44], selector);
      w[44] = hc_byte_perm_S (w[42], w[43], selector);
      w[43] = hc_byte_perm_S (w[41], w[42], selector);
      w[42] = hc_byte_perm_S (w[40], w[41], selector);
      w[41] = hc_byte_perm_S (w[39], w[40], selector);
      w[40] = hc_byte_perm_S (w[38], w[39], selector);
      w[39] = hc_byte_perm_S (w[37], w[38], selector);
      w[38] = hc_byte_perm_S (w[36], w[37], selector);
      w[37] = hc_byte_perm_S (w[35], w[36], selector);
      w[36] = hc_byte_perm_S (w[34], w[35], selector);
      w[35] = hc_byte_perm_S (w[33], w[34], selector);
      w[34] = hc_byte_perm_S (w[32], w[33], selector);
      w[33] = hc_byte_perm_S (w[31], w[32], selector);
      w[32] = hc_byte_perm_S (w[30], w[31], selector);
      w[31] = hc_byte_perm_S (w[29], w[30], selector);
      w[30] = hc_byte_perm_S (w[28], w[29], selector);
      w[29] = hc_byte_perm_S (w[27], w[28], selector);
      w[28] = hc_byte_perm_S (w[26], w[27], selector);
      w[27] = hc_byte_perm_S (w[25], w[26], selector);
      w[26] = hc_byte_perm_S (w[24], w[25], selector);
      w[25] = hc_byte_perm_S (w[23], w[24], selector);
      w[24] = hc_byte_perm_S (w[22], w[23], selector);
      w[23] = hc_byte_perm_S (w[21], w[22], selector);
      w[22] = hc_byte_perm_S (w[20], w[21], selector);
      w[21] = hc_byte_perm_S (w[19], w[20], selector);
      w[20] = hc_byte_perm_S (w[18], w[19], selector);
      w[19] = hc_byte_perm_S (w[17], w[18], selector);
      w[18] = hc_byte_perm_S (w[16], w[17], selector);
      w[17] = hc_byte_perm_S (w[15], w[16], selector);
      w[16] = hc_byte_perm_S (w[14], w[15], selector);
      w[15] = hc_byte_perm_S (w[13], w[14], selector);
      w[14] = hc_byte_perm_S (w[12], w[13], selector);
      w[13] = hc_byte_perm_S (w[11], w[12], selector);
      w[12] = hc_byte_perm_S (w[10], w[11], selector);
      w[11] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[10] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[ 9] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[ 8] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[ 7] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[ 6] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[ 5] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[ 4] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 3] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 2] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 1] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_byte_perm_S (w[60], w[61], selector);
      w[62] = hc_byte_perm_S (w[59], w[60], selector);
      w[61] = hc_byte_perm_S (w[58], w[59], selector);
      w[60] = hc_byte_perm_S (w[57], w[58], selector);
      w[59] = hc_byte_perm_S (w[56], w[57], selector);
      w[58] = hc_byte_perm_S (w[55], w[56], selector);
      w[57] = hc_byte_perm_S (w[54], w[55], selector);
      w[56] = hc_byte_perm_S (w[53], w[54], selector);
      w[55] = hc_byte_perm_S (w[52], w[53], selector);
      w[54] = hc_byte_perm_S (w[51], w[52], selector);
      w[53] = hc_byte_perm_S (w[50], w[51], selector);
      w[52] = hc_byte_perm_S (w[49], w[50], selector);
      w[51] = hc_byte_perm_S (w[48], w[49], selector);
      w[50] = hc_byte_perm_S (w[47], w[48], selector);
      w[49] = hc_byte_perm_S (w[46], w[47], selector);
      w[48] = hc_byte_perm_S (w[45], w[46], selector);
      w[47] = hc_byte_perm_S (w[44], w[45], selector);
      w[46] = hc_byte_perm_S (w[43], w[44], selector);
      w[45] = hc_byte_perm_S (w[42], w[43], selector);
      w[44] = hc_byte_perm_S (w[41], w[42], selector);
      w[43] = hc_byte_perm_S (w[40], w[41], selector);
      w[42] = hc_byte_perm_S (w[39], w[40], selector);
      w[41] = hc_byte_perm_S (w[38], w[39], selector);
      w[40] = hc_byte_perm_S (w[37], w[38], selector);
      w[39] = hc_byte_perm_S (w[36], w[37], selector);
      w[38] = hc_byte_perm_S (w[35], w[36], selector);
      w[37] = hc_byte_perm_S (w[34], w[35], selector);
      w[36] = hc_byte_perm_S (w[33], w[34], selector);
      w[35] = hc_byte_perm_S (w[32], w[33], selector);
      w[34] = hc_byte_perm_S (w[31], w[32], selector);
      w[33] = hc_byte_perm_S (w[30], w[31], selector);
      w[32] = hc_byte_perm_S (w[29], w[30], selector);
      w[31] = hc_byte_perm_S (w[28], w[29], selector);
      w[30] = hc_byte_perm_S (w[27], w[28], selector);
      w[29] = hc_byte_perm_S (w[26], w[27], selector);
      w[28] = hc_byte_perm_S (w[25], w[26], selector);
      w[27] = hc_byte_perm_S (w[24], w[25], selector);
      w[26] = hc_byte_perm_S (w[23], w[24], selector);
      w[25] = hc_byte_perm_S (w[22], w[23], selector);
      w[24] = hc_byte_perm_S (w[21], w[22], selector);
      w[23] = hc_byte_perm_S (w[20], w[21], selector);
      w[22] = hc_byte_perm_S (w[19], w[20], selector);
      w[21] = hc_byte_perm_S (w[18], w[19], selector);
      w[20] = hc_byte_perm_S (w[17], w[18], selector);
      w[19] = hc_byte_perm_S (w[16], w[17], selector);
      w[18] = hc_byte_perm_S (w[15], w[16], selector);
      w[17] = hc_byte_perm_S (w[14], w[15], selector);
      w[16] = hc_byte_perm_S (w[13], w[14], selector);
      w[15] = hc_byte_perm_S (w[12], w[13], selector);
      w[14] = hc_byte_perm_S (w[11], w[12], selector);
      w[13] = hc_byte_perm_S (w[10], w[11], selector);
      w[12] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[11] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[10] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[ 9] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[ 8] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[ 7] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[ 6] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[ 5] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 4] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 3] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 2] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_byte_perm_S (w[59], w[60], selector);
      w[62] = hc_byte_perm_S (w[58], w[59], selector);
      w[61] = hc_byte_perm_S (w[57], w[58], selector);
      w[60] = hc_byte_perm_S (w[56], w[57], selector);
      w[59] = hc_byte_perm_S (w[55], w[56], selector);
      w[58] = hc_byte_perm_S (w[54], w[55], selector);
      w[57] = hc_byte_perm_S (w[53], w[54], selector);
      w[56] = hc_byte_perm_S (w[52], w[53], selector);
      w[55] = hc_byte_perm_S (w[51], w[52], selector);
      w[54] = hc_byte_perm_S (w[50], w[51], selector);
      w[53] = hc_byte_perm_S (w[49], w[50], selector);
      w[52] = hc_byte_perm_S (w[48], w[49], selector);
      w[51] = hc_byte_perm_S (w[47], w[48], selector);
      w[50] = hc_byte_perm_S (w[46], w[47], selector);
      w[49] = hc_byte_perm_S (w[45], w[46], selector);
      w[48] = hc_byte_perm_S (w[44], w[45], selector);
      w[47] = hc_byte_perm_S (w[43], w[44], selector);
      w[46] = hc_byte_perm_S (w[42], w[43], selector);
      w[45] = hc_byte_perm_S (w[41], w[42], selector);
      w[44] = hc_byte_perm_S (w[40], w[41], selector);
      w[43] = hc_byte_perm_S (w[39], w[40], selector);
      w[42] = hc_byte_perm_S (w[38], w[39], selector);
      w[41] = hc_byte_perm_S (w[37], w[38], selector);
      w[40] = hc_byte_perm_S (w[36], w[37], selector);
      w[39] = hc_byte_perm_S (w[35], w[36], selector);
      w[38] = hc_byte_perm_S (w[34], w[35], selector);
      w[37] = hc_byte_perm_S (w[33], w[34], selector);
      w[36] = hc_byte_perm_S (w[32], w[33], selector);
      w[35] = hc_byte_perm_S (w[31], w[32], selector);
      w[34] = hc_byte_perm_S (w[30], w[31], selector);
      w[33] = hc_byte_perm_S (w[29], w[30], selector);
      w[32] = hc_byte_perm_S (w[28], w[29], selector);
      w[31] = hc_byte_perm_S (w[27], w[28], selector);
      w[30] = hc_byte_perm_S (w[26], w[27], selector);
      w[29] = hc_byte_perm_S (w[25], w[26], selector);
      w[28] = hc_byte_perm_S (w[24], w[25], selector);
      w[27] = hc_byte_perm_S (w[23], w[24], selector);
      w[26] = hc_byte_perm_S (w[22], w[23], selector);
      w[25] = hc_byte_perm_S (w[21], w[22], selector);
      w[24] = hc_byte_perm_S (w[20], w[21], selector);
      w[23] = hc_byte_perm_S (w[19], w[20], selector);
      w[22] = hc_byte_perm_S (w[18], w[19], selector);
      w[21] = hc_byte_perm_S (w[17], w[18], selector);
      w[20] = hc_byte_perm_S (w[16], w[17], selector);
      w[19] = hc_byte_perm_S (w[15], w[16], selector);
      w[18] = hc_byte_perm_S (w[14], w[15], selector);
      w[17] = hc_byte_perm_S (w[13], w[14], selector);
      w[16] = hc_byte_perm_S (w[12], w[13], selector);
      w[15] = hc_byte_perm_S (w[11], w[12], selector);
      w[14] = hc_byte_perm_S (w[10], w[11], selector);
      w[13] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[12] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[11] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[10] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[ 9] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[ 8] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[ 7] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[ 6] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 5] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 4] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 3] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_byte_perm_S (w[58], w[59], selector);
      w[62] = hc_byte_perm_S (w[57], w[58], selector);
      w[61] = hc_byte_perm_S (w[56], w[57], selector);
      w[60] = hc_byte_perm_S (w[55], w[56], selector);
      w[59] = hc_byte_perm_S (w[54], w[55], selector);
      w[58] = hc_byte_perm_S (w[53], w[54], selector);
      w[57] = hc_byte_perm_S (w[52], w[53], selector);
      w[56] = hc_byte_perm_S (w[51], w[52], selector);
      w[55] = hc_byte_perm_S (w[50], w[51], selector);
      w[54] = hc_byte_perm_S (w[49], w[50], selector);
      w[53] = hc_byte_perm_S (w[48], w[49], selector);
      w[52] = hc_byte_perm_S (w[47], w[48], selector);
      w[51] = hc_byte_perm_S (w[46], w[47], selector);
      w[50] = hc_byte_perm_S (w[45], w[46], selector);
      w[49] = hc_byte_perm_S (w[44], w[45], selector);
      w[48] = hc_byte_perm_S (w[43], w[44], selector);
      w[47] = hc_byte_perm_S (w[42], w[43], selector);
      w[46] = hc_byte_perm_S (w[41], w[42], selector);
      w[45] = hc_byte_perm_S (w[40], w[41], selector);
      w[44] = hc_byte_perm_S (w[39], w[40], selector);
      w[43] = hc_byte_perm_S (w[38], w[39], selector);
      w[42] = hc_byte_perm_S (w[37], w[38], selector);
      w[41] = hc_byte_perm_S (w[36], w[37], selector);
      w[40] = hc_byte_perm_S (w[35], w[36], selector);
      w[39] = hc_byte_perm_S (w[34], w[35], selector);
      w[38] = hc_byte_perm_S (w[33], w[34], selector);
      w[37] = hc_byte_perm_S (w[32], w[33], selector);
      w[36] = hc_byte_perm_S (w[31], w[32], selector);
      w[35] = hc_byte_perm_S (w[30], w[31], selector);
      w[34] = hc_byte_perm_S (w[29], w[30], selector);
      w[33] = hc_byte_perm_S (w[28], w[29], selector);
      w[32] = hc_byte_perm_S (w[27], w[28], selector);
      w[31] = hc_byte_perm_S (w[26], w[27], selector);
      w[30] = hc_byte_perm_S (w[25], w[26], selector);
      w[29] = hc_byte_perm_S (w[24], w[25], selector);
      w[28] = hc_byte_perm_S (w[23], w[24], selector);
      w[27] = hc_byte_perm_S (w[22], w[23], selector);
      w[26] = hc_byte_perm_S (w[21], w[22], selector);
      w[25] = hc_byte_perm_S (w[20], w[21], selector);
      w[24] = hc_byte_perm_S (w[19], w[20], selector);
      w[23] = hc_byte_perm_S (w[18], w[19], selector);
      w[22] = hc_byte_perm_S (w[17], w[18], selector);
      w[21] = hc_byte_perm_S (w[16], w[17], selector);
      w[20] = hc_byte_perm_S (w[15], w[16], selector);
      w[19] = hc_byte_perm_S (w[14], w[15], selector);
      w[18] = hc_byte_perm_S (w[13], w[14], selector);
      w[17] = hc_byte_perm_S (w[12], w[13], selector);
      w[16] = hc_byte_perm_S (w[11], w[12], selector);
      w[15] = hc_byte_perm_S (w[10], w[11], selector);
      w[14] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[13] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[12] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[11] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[10] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[ 9] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[ 8] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[ 7] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 6] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 5] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 4] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_byte_perm_S (w[57], w[58], selector);
      w[62] = hc_byte_perm_S (w[56], w[57], selector);
      w[61] = hc_byte_perm_S (w[55], w[56], selector);
      w[60] = hc_byte_perm_S (w[54], w[55], selector);
      w[59] = hc_byte_perm_S (w[53], w[54], selector);
      w[58] = hc_byte_perm_S (w[52], w[53], selector);
      w[57] = hc_byte_perm_S (w[51], w[52], selector);
      w[56] = hc_byte_perm_S (w[50], w[51], selector);
      w[55] = hc_byte_perm_S (w[49], w[50], selector);
      w[54] = hc_byte_perm_S (w[48], w[49], selector);
      w[53] = hc_byte_perm_S (w[47], w[48], selector);
      w[52] = hc_byte_perm_S (w[46], w[47], selector);
      w[51] = hc_byte_perm_S (w[45], w[46], selector);
      w[50] = hc_byte_perm_S (w[44], w[45], selector);
      w[49] = hc_byte_perm_S (w[43], w[44], selector);
      w[48] = hc_byte_perm_S (w[42], w[43], selector);
      w[47] = hc_byte_perm_S (w[41], w[42], selector);
      w[46] = hc_byte_perm_S (w[40], w[41], selector);
      w[45] = hc_byte_perm_S (w[39], w[40], selector);
      w[44] = hc_byte_perm_S (w[38], w[39], selector);
      w[43] = hc_byte_perm_S (w[37], w[38], selector);
      w[42] = hc_byte_perm_S (w[36], w[37], selector);
      w[41] = hc_byte_perm_S (w[35], w[36], selector);
      w[40] = hc_byte_perm_S (w[34], w[35], selector);
      w[39] = hc_byte_perm_S (w[33], w[34], selector);
      w[38] = hc_byte_perm_S (w[32], w[33], selector);
      w[37] = hc_byte_perm_S (w[31], w[32], selector);
      w[36] = hc_byte_perm_S (w[30], w[31], selector);
      w[35] = hc_byte_perm_S (w[29], w[30], selector);
      w[34] = hc_byte_perm_S (w[28], w[29], selector);
      w[33] = hc_byte_perm_S (w[27], w[28], selector);
      w[32] = hc_byte_perm_S (w[26], w[27], selector);
      w[31] = hc_byte_perm_S (w[25], w[26], selector);
      w[30] = hc_byte_perm_S (w[24], w[25], selector);
      w[29] = hc_byte_perm_S (w[23], w[24], selector);
      w[28] = hc_byte_perm_S (w[22], w[23], selector);
      w[27] = hc_byte_perm_S (w[21], w[22], selector);
      w[26] = hc_byte_perm_S (w[20], w[21], selector);
      w[25] = hc_byte_perm_S (w[19], w[20], selector);
      w[24] = hc_byte_perm_S (w[18], w[19], selector);
      w[23] = hc_byte_perm_S (w[17], w[18], selector);
      w[22] = hc_byte_perm_S (w[16], w[17], selector);
      w[21] = hc_byte_perm_S (w[15], w[16], selector);
      w[20] = hc_byte_perm_S (w[14], w[15], selector);
      w[19] = hc_byte_perm_S (w[13], w[14], selector);
      w[18] = hc_byte_perm_S (w[12], w[13], selector);
      w[17] = hc_byte_perm_S (w[11], w[12], selector);
      w[16] = hc_byte_perm_S (w[10], w[11], selector);
      w[15] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[14] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[13] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[12] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[11] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[10] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[ 9] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[ 8] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 7] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 6] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 5] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_byte_perm_S (w[56], w[57], selector);
      w[62] = hc_byte_perm_S (w[55], w[56], selector);
      w[61] = hc_byte_perm_S (w[54], w[55], selector);
      w[60] = hc_byte_perm_S (w[53], w[54], selector);
      w[59] = hc_byte_perm_S (w[52], w[53], selector);
      w[58] = hc_byte_perm_S (w[51], w[52], selector);
      w[57] = hc_byte_perm_S (w[50], w[51], selector);
      w[56] = hc_byte_perm_S (w[49], w[50], selector);
      w[55] = hc_byte_perm_S (w[48], w[49], selector);
      w[54] = hc_byte_perm_S (w[47], w[48], selector);
      w[53] = hc_byte_perm_S (w[46], w[47], selector);
      w[52] = hc_byte_perm_S (w[45], w[46], selector);
      w[51] = hc_byte_perm_S (w[44], w[45], selector);
      w[50] = hc_byte_perm_S (w[43], w[44], selector);
      w[49] = hc_byte_perm_S (w[42], w[43], selector);
      w[48] = hc_byte_perm_S (w[41], w[42], selector);
      w[47] = hc_byte_perm_S (w[40], w[41], selector);
      w[46] = hc_byte_perm_S (w[39], w[40], selector);
      w[45] = hc_byte_perm_S (w[38], w[39], selector);
      w[44] = hc_byte_perm_S (w[37], w[38], selector);
      w[43] = hc_byte_perm_S (w[36], w[37], selector);
      w[42] = hc_byte_perm_S (w[35], w[36], selector);
      w[41] = hc_byte_perm_S (w[34], w[35], selector);
      w[40] = hc_byte_perm_S (w[33], w[34], selector);
      w[39] = hc_byte_perm_S (w[32], w[33], selector);
      w[38] = hc_byte_perm_S (w[31], w[32], selector);
      w[37] = hc_byte_perm_S (w[30], w[31], selector);
      w[36] = hc_byte_perm_S (w[29], w[30], selector);
      w[35] = hc_byte_perm_S (w[28], w[29], selector);
      w[34] = hc_byte_perm_S (w[27], w[28], selector);
      w[33] = hc_byte_perm_S (w[26], w[27], selector);
      w[32] = hc_byte_perm_S (w[25], w[26], selector);
      w[31] = hc_byte_perm_S (w[24], w[25], selector);
      w[30] = hc_byte_perm_S (w[23], w[24], selector);
      w[29] = hc_byte_perm_S (w[22], w[23], selector);
      w[28] = hc_byte_perm_S (w[21], w[22], selector);
      w[27] = hc_byte_perm_S (w[20], w[21], selector);
      w[26] = hc_byte_perm_S (w[19], w[20], selector);
      w[25] = hc_byte_perm_S (w[18], w[19], selector);
      w[24] = hc_byte_perm_S (w[17], w[18], selector);
      w[23] = hc_byte_perm_S (w[16], w[17], selector);
      w[22] = hc_byte_perm_S (w[15], w[16], selector);
      w[21] = hc_byte_perm_S (w[14], w[15], selector);
      w[20] = hc_byte_perm_S (w[13], w[14], selector);
      w[19] = hc_byte_perm_S (w[12], w[13], selector);
      w[18] = hc_byte_perm_S (w[11], w[12], selector);
      w[17] = hc_byte_perm_S (w[10], w[11], selector);
      w[16] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[15] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[14] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[13] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[12] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[11] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[10] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[ 9] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 8] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 7] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 6] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_byte_perm_S (w[55], w[56], selector);
      w[62] = hc_byte_perm_S (w[54], w[55], selector);
      w[61] = hc_byte_perm_S (w[53], w[54], selector);
      w[60] = hc_byte_perm_S (w[52], w[53], selector);
      w[59] = hc_byte_perm_S (w[51], w[52], selector);
      w[58] = hc_byte_perm_S (w[50], w[51], selector);
      w[57] = hc_byte_perm_S (w[49], w[50], selector);
      w[56] = hc_byte_perm_S (w[48], w[49], selector);
      w[55] = hc_byte_perm_S (w[47], w[48], selector);
      w[54] = hc_byte_perm_S (w[46], w[47], selector);
      w[53] = hc_byte_perm_S (w[45], w[46], selector);
      w[52] = hc_byte_perm_S (w[44], w[45], selector);
      w[51] = hc_byte_perm_S (w[43], w[44], selector);
      w[50] = hc_byte_perm_S (w[42], w[43], selector);
      w[49] = hc_byte_perm_S (w[41], w[42], selector);
      w[48] = hc_byte_perm_S (w[40], w[41], selector);
      w[47] = hc_byte_perm_S (w[39], w[40], selector);
      w[46] = hc_byte_perm_S (w[38], w[39], selector);
      w[45] = hc_byte_perm_S (w[37], w[38], selector);
      w[44] = hc_byte_perm_S (w[36], w[37], selector);
      w[43] = hc_byte_perm_S (w[35], w[36], selector);
      w[42] = hc_byte_perm_S (w[34], w[35], selector);
      w[41] = hc_byte_perm_S (w[33], w[34], selector);
      w[40] = hc_byte_perm_S (w[32], w[33], selector);
      w[39] = hc_byte_perm_S (w[31], w[32], selector);
      w[38] = hc_byte_perm_S (w[30], w[31], selector);
      w[37] = hc_byte_perm_S (w[29], w[30], selector);
      w[36] = hc_byte_perm_S (w[28], w[29], selector);
      w[35] = hc_byte_perm_S (w[27], w[28], selector);
      w[34] = hc_byte_perm_S (w[26], w[27], selector);
      w[33] = hc_byte_perm_S (w[25], w[26], selector);
      w[32] = hc_byte_perm_S (w[24], w[25], selector);
      w[31] = hc_byte_perm_S (w[23], w[24], selector);
      w[30] = hc_byte_perm_S (w[22], w[23], selector);
      w[29] = hc_byte_perm_S (w[21], w[22], selector);
      w[28] = hc_byte_perm_S (w[20], w[21], selector);
      w[27] = hc_byte_perm_S (w[19], w[20], selector);
      w[26] = hc_byte_perm_S (w[18], w[19], selector);
      w[25] = hc_byte_perm_S (w[17], w[18], selector);
      w[24] = hc_byte_perm_S (w[16], w[17], selector);
      w[23] = hc_byte_perm_S (w[15], w[16], selector);
      w[22] = hc_byte_perm_S (w[14], w[15], selector);
      w[21] = hc_byte_perm_S (w[13], w[14], selector);
      w[20] = hc_byte_perm_S (w[12], w[13], selector);
      w[19] = hc_byte_perm_S (w[11], w[12], selector);
      w[18] = hc_byte_perm_S (w[10], w[11], selector);
      w[17] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[16] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[15] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[14] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[13] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[12] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[11] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[10] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[ 9] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 8] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 7] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_byte_perm_S (w[54], w[55], selector);
      w[62] = hc_byte_perm_S (w[53], w[54], selector);
      w[61] = hc_byte_perm_S (w[52], w[53], selector);
      w[60] = hc_byte_perm_S (w[51], w[52], selector);
      w[59] = hc_byte_perm_S (w[50], w[51], selector);
      w[58] = hc_byte_perm_S (w[49], w[50], selector);
      w[57] = hc_byte_perm_S (w[48], w[49], selector);
      w[56] = hc_byte_perm_S (w[47], w[48], selector);
      w[55] = hc_byte_perm_S (w[46], w[47], selector);
      w[54] = hc_byte_perm_S (w[45], w[46], selector);
      w[53] = hc_byte_perm_S (w[44], w[45], selector);
      w[52] = hc_byte_perm_S (w[43], w[44], selector);
      w[51] = hc_byte_perm_S (w[42], w[43], selector);
      w[50] = hc_byte_perm_S (w[41], w[42], selector);
      w[49] = hc_byte_perm_S (w[40], w[41], selector);
      w[48] = hc_byte_perm_S (w[39], w[40], selector);
      w[47] = hc_byte_perm_S (w[38], w[39], selector);
      w[46] = hc_byte_perm_S (w[37], w[38], selector);
      w[45] = hc_byte_perm_S (w[36], w[37], selector);
      w[44] = hc_byte_perm_S (w[35], w[36], selector);
      w[43] = hc_byte_perm_S (w[34], w[35], selector);
      w[42] = hc_byte_perm_S (w[33], w[34], selector);
      w[41] = hc_byte_perm_S (w[32], w[33], selector);
      w[40] = hc_byte_perm_S (w[31], w[32], selector);
      w[39] = hc_byte_perm_S (w[30], w[31], selector);
      w[38] = hc_byte_perm_S (w[29], w[30], selector);
      w[37] = hc_byte_perm_S (w[28], w[29], selector);
      w[36] = hc_byte_perm_S (w[27], w[28], selector);
      w[35] = hc_byte_perm_S (w[26], w[27], selector);
      w[34] = hc_byte_perm_S (w[25], w[26], selector);
      w[33] = hc_byte_perm_S (w[24], w[25], selector);
      w[32] = hc_byte_perm_S (w[23], w[24], selector);
      w[31] = hc_byte_perm_S (w[22], w[23], selector);
      w[30] = hc_byte_perm_S (w[21], w[22], selector);
      w[29] = hc_byte_perm_S (w[20], w[21], selector);
      w[28] = hc_byte_perm_S (w[19], w[20], selector);
      w[27] = hc_byte_perm_S (w[18], w[19], selector);
      w[26] = hc_byte_perm_S (w[17], w[18], selector);
      w[25] = hc_byte_perm_S (w[16], w[17], selector);
      w[24] = hc_byte_perm_S (w[15], w[16], selector);
      w[23] = hc_byte_perm_S (w[14], w[15], selector);
      w[22] = hc_byte_perm_S (w[13], w[14], selector);
      w[21] = hc_byte_perm_S (w[12], w[13], selector);
      w[20] = hc_byte_perm_S (w[11], w[12], selector);
      w[19] = hc_byte_perm_S (w[10], w[11], selector);
      w[18] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[17] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[16] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[15] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[14] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[13] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[12] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[11] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[10] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[ 9] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 8] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_byte_perm_S (w[53], w[54], selector);
      w[62] = hc_byte_perm_S (w[52], w[53], selector);
      w[61] = hc_byte_perm_S (w[51], w[52], selector);
      w[60] = hc_byte_perm_S (w[50], w[51], selector);
      w[59] = hc_byte_perm_S (w[49], w[50], selector);
      w[58] = hc_byte_perm_S (w[48], w[49], selector);
      w[57] = hc_byte_perm_S (w[47], w[48], selector);
      w[56] = hc_byte_perm_S (w[46], w[47], selector);
      w[55] = hc_byte_perm_S (w[45], w[46], selector);
      w[54] = hc_byte_perm_S (w[44], w[45], selector);
      w[53] = hc_byte_perm_S (w[43], w[44], selector);
      w[52] = hc_byte_perm_S (w[42], w[43], selector);
      w[51] = hc_byte_perm_S (w[41], w[42], selector);
      w[50] = hc_byte_perm_S (w[40], w[41], selector);
      w[49] = hc_byte_perm_S (w[39], w[40], selector);
      w[48] = hc_byte_perm_S (w[38], w[39], selector);
      w[47] = hc_byte_perm_S (w[37], w[38], selector);
      w[46] = hc_byte_perm_S (w[36], w[37], selector);
      w[45] = hc_byte_perm_S (w[35], w[36], selector);
      w[44] = hc_byte_perm_S (w[34], w[35], selector);
      w[43] = hc_byte_perm_S (w[33], w[34], selector);
      w[42] = hc_byte_perm_S (w[32], w[33], selector);
      w[41] = hc_byte_perm_S (w[31], w[32], selector);
      w[40] = hc_byte_perm_S (w[30], w[31], selector);
      w[39] = hc_byte_perm_S (w[29], w[30], selector);
      w[38] = hc_byte_perm_S (w[28], w[29], selector);
      w[37] = hc_byte_perm_S (w[27], w[28], selector);
      w[36] = hc_byte_perm_S (w[26], w[27], selector);
      w[35] = hc_byte_perm_S (w[25], w[26], selector);
      w[34] = hc_byte_perm_S (w[24], w[25], selector);
      w[33] = hc_byte_perm_S (w[23], w[24], selector);
      w[32] = hc_byte_perm_S (w[22], w[23], selector);
      w[31] = hc_byte_perm_S (w[21], w[22], selector);
      w[30] = hc_byte_perm_S (w[20], w[21], selector);
      w[29] = hc_byte_perm_S (w[19], w[20], selector);
      w[28] = hc_byte_perm_S (w[18], w[19], selector);
      w[27] = hc_byte_perm_S (w[17], w[18], selector);
      w[26] = hc_byte_perm_S (w[16], w[17], selector);
      w[25] = hc_byte_perm_S (w[15], w[16], selector);
      w[24] = hc_byte_perm_S (w[14], w[15], selector);
      w[23] = hc_byte_perm_S (w[13], w[14], selector);
      w[22] = hc_byte_perm_S (w[12], w[13], selector);
      w[21] = hc_byte_perm_S (w[11], w[12], selector);
      w[20] = hc_byte_perm_S (w[10], w[11], selector);
      w[19] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[18] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[17] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[16] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[15] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[14] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[13] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[12] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[11] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[10] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[ 9] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_byte_perm_S (w[52], w[53], selector);
      w[62] = hc_byte_perm_S (w[51], w[52], selector);
      w[61] = hc_byte_perm_S (w[50], w[51], selector);
      w[60] = hc_byte_perm_S (w[49], w[50], selector);
      w[59] = hc_byte_perm_S (w[48], w[49], selector);
      w[58] = hc_byte_perm_S (w[47], w[48], selector);
      w[57] = hc_byte_perm_S (w[46], w[47], selector);
      w[56] = hc_byte_perm_S (w[45], w[46], selector);
      w[55] = hc_byte_perm_S (w[44], w[45], selector);
      w[54] = hc_byte_perm_S (w[43], w[44], selector);
      w[53] = hc_byte_perm_S (w[42], w[43], selector);
      w[52] = hc_byte_perm_S (w[41], w[42], selector);
      w[51] = hc_byte_perm_S (w[40], w[41], selector);
      w[50] = hc_byte_perm_S (w[39], w[40], selector);
      w[49] = hc_byte_perm_S (w[38], w[39], selector);
      w[48] = hc_byte_perm_S (w[37], w[38], selector);
      w[47] = hc_byte_perm_S (w[36], w[37], selector);
      w[46] = hc_byte_perm_S (w[35], w[36], selector);
      w[45] = hc_byte_perm_S (w[34], w[35], selector);
      w[44] = hc_byte_perm_S (w[33], w[34], selector);
      w[43] = hc_byte_perm_S (w[32], w[33], selector);
      w[42] = hc_byte_perm_S (w[31], w[32], selector);
      w[41] = hc_byte_perm_S (w[30], w[31], selector);
      w[40] = hc_byte_perm_S (w[29], w[30], selector);
      w[39] = hc_byte_perm_S (w[28], w[29], selector);
      w[38] = hc_byte_perm_S (w[27], w[28], selector);
      w[37] = hc_byte_perm_S (w[26], w[27], selector);
      w[36] = hc_byte_perm_S (w[25], w[26], selector);
      w[35] = hc_byte_perm_S (w[24], w[25], selector);
      w[34] = hc_byte_perm_S (w[23], w[24], selector);
      w[33] = hc_byte_perm_S (w[22], w[23], selector);
      w[32] = hc_byte_perm_S (w[21], w[22], selector);
      w[31] = hc_byte_perm_S (w[20], w[21], selector);
      w[30] = hc_byte_perm_S (w[19], w[20], selector);
      w[29] = hc_byte_perm_S (w[18], w[19], selector);
      w[28] = hc_byte_perm_S (w[17], w[18], selector);
      w[27] = hc_byte_perm_S (w[16], w[17], selector);
      w[26] = hc_byte_perm_S (w[15], w[16], selector);
      w[25] = hc_byte_perm_S (w[14], w[15], selector);
      w[24] = hc_byte_perm_S (w[13], w[14], selector);
      w[23] = hc_byte_perm_S (w[12], w[13], selector);
      w[22] = hc_byte_perm_S (w[11], w[12], selector);
      w[21] = hc_byte_perm_S (w[10], w[11], selector);
      w[20] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[19] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[18] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[17] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[16] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[15] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[14] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[13] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[12] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[11] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[10] = hc_byte_perm_S (    0, w[ 0], selector);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_byte_perm_S (w[51], w[52], selector);
      w[62] = hc_byte_perm_S (w[50], w[51], selector);
      w[61] = hc_byte_perm_S (w[49], w[50], selector);
      w[60] = hc_byte_perm_S (w[48], w[49], selector);
      w[59] = hc_byte_perm_S (w[47], w[48], selector);
      w[58] = hc_byte_perm_S (w[46], w[47], selector);
      w[57] = hc_byte_perm_S (w[45], w[46], selector);
      w[56] = hc_byte_perm_S (w[44], w[45], selector);
      w[55] = hc_byte_perm_S (w[43], w[44], selector);
      w[54] = hc_byte_perm_S (w[42], w[43], selector);
      w[53] = hc_byte_perm_S (w[41], w[42], selector);
      w[52] = hc_byte_perm_S (w[40], w[41], selector);
      w[51] = hc_byte_perm_S (w[39], w[40], selector);
      w[50] = hc_byte_perm_S (w[38], w[39], selector);
      w[49] = hc_byte_perm_S (w[37], w[38], selector);
      w[48] = hc_byte_perm_S (w[36], w[37], selector);
      w[47] = hc_byte_perm_S (w[35], w[36], selector);
      w[46] = hc_byte_perm_S (w[34], w[35], selector);
      w[45] = hc_byte_perm_S (w[33], w[34], selector);
      w[44] = hc_byte_perm_S (w[32], w[33], selector);
      w[43] = hc_byte_perm_S (w[31], w[32], selector);
      w[42] = hc_byte_perm_S (w[30], w[31], selector);
      w[41] = hc_byte_perm_S (w[29], w[30], selector);
      w[40] = hc_byte_perm_S (w[28], w[29], selector);
      w[39] = hc_byte_perm_S (w[27], w[28], selector);
      w[38] = hc_byte_perm_S (w[26], w[27], selector);
      w[37] = hc_byte_perm_S (w[25], w[26], selector);
      w[36] = hc_byte_perm_S (w[24], w[25], selector);
      w[35] = hc_byte_perm_S (w[23], w[24], selector);
      w[34] = hc_byte_perm_S (w[22], w[23], selector);
      w[33] = hc_byte_perm_S (w[21], w[22], selector);
      w[32] = hc_byte_perm_S (w[20], w[21], selector);
      w[31] = hc_byte_perm_S (w[19], w[20], selector);
      w[30] = hc_byte_perm_S (w[18], w[19], selector);
      w[29] = hc_byte_perm_S (w[17], w[18], selector);
      w[28] = hc_byte_perm_S (w[16], w[17], selector);
      w[27] = hc_byte_perm_S (w[15], w[16], selector);
      w[26] = hc_byte_perm_S (w[14], w[15], selector);
      w[25] = hc_byte_perm_S (w[13], w[14], selector);
      w[24] = hc_byte_perm_S (w[12], w[13], selector);
      w[23] = hc_byte_perm_S (w[11], w[12], selector);
      w[22] = hc_byte_perm_S (w[10], w[11], selector);
      w[21] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[20] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[19] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[18] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[17] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[16] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[15] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[14] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[13] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[12] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[11] = hc_byte_perm_S (    0, w[ 0], selector);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_byte_perm_S (w[50], w[51], selector);
      w[62] = hc_byte_perm_S (w[49], w[50], selector);
      w[61] = hc_byte_perm_S (w[48], w[49], selector);
      w[60] = hc_byte_perm_S (w[47], w[48], selector);
      w[59] = hc_byte_perm_S (w[46], w[47], selector);
      w[58] = hc_byte_perm_S (w[45], w[46], selector);
      w[57] = hc_byte_perm_S (w[44], w[45], selector);
      w[56] = hc_byte_perm_S (w[43], w[44], selector);
      w[55] = hc_byte_perm_S (w[42], w[43], selector);
      w[54] = hc_byte_perm_S (w[41], w[42], selector);
      w[53] = hc_byte_perm_S (w[40], w[41], selector);
      w[52] = hc_byte_perm_S (w[39], w[40], selector);
      w[51] = hc_byte_perm_S (w[38], w[39], selector);
      w[50] = hc_byte_perm_S (w[37], w[38], selector);
      w[49] = hc_byte_perm_S (w[36], w[37], selector);
      w[48] = hc_byte_perm_S (w[35], w[36], selector);
      w[47] = hc_byte_perm_S (w[34], w[35], selector);
      w[46] = hc_byte_perm_S (w[33], w[34], selector);
      w[45] = hc_byte_perm_S (w[32], w[33], selector);
      w[44] = hc_byte_perm_S (w[31], w[32], selector);
      w[43] = hc_byte_perm_S (w[30], w[31], selector);
      w[42] = hc_byte_perm_S (w[29], w[30], selector);
      w[41] = hc_byte_perm_S (w[28], w[29], selector);
      w[40] = hc_byte_perm_S (w[27], w[28], selector);
      w[39] = hc_byte_perm_S (w[26], w[27], selector);
      w[38] = hc_byte_perm_S (w[25], w[26], selector);
      w[37] = hc_byte_perm_S (w[24], w[25], selector);
      w[36] = hc_byte_perm_S (w[23], w[24], selector);
      w[35] = hc_byte_perm_S (w[22], w[23], selector);
      w[34] = hc_byte_perm_S (w[21], w[22], selector);
      w[33] = hc_byte_perm_S (w[20], w[21], selector);
      w[32] = hc_byte_perm_S (w[19], w[20], selector);
      w[31] = hc_byte_perm_S (w[18], w[19], selector);
      w[30] = hc_byte_perm_S (w[17], w[18], selector);
      w[29] = hc_byte_perm_S (w[16], w[17], selector);
      w[28] = hc_byte_perm_S (w[15], w[16], selector);
      w[27] = hc_byte_perm_S (w[14], w[15], selector);
      w[26] = hc_byte_perm_S (w[13], w[14], selector);
      w[25] = hc_byte_perm_S (w[12], w[13], selector);
      w[24] = hc_byte_perm_S (w[11], w[12], selector);
      w[23] = hc_byte_perm_S (w[10], w[11], selector);
      w[22] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[21] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[20] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[19] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[18] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[17] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[16] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[15] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[14] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[13] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[12] = hc_byte_perm_S (    0, w[ 0], selector);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_byte_perm_S (w[49], w[50], selector);
      w[62] = hc_byte_perm_S (w[48], w[49], selector);
      w[61] = hc_byte_perm_S (w[47], w[48], selector);
      w[60] = hc_byte_perm_S (w[46], w[47], selector);
      w[59] = hc_byte_perm_S (w[45], w[46], selector);
      w[58] = hc_byte_perm_S (w[44], w[45], selector);
      w[57] = hc_byte_perm_S (w[43], w[44], selector);
      w[56] = hc_byte_perm_S (w[42], w[43], selector);
      w[55] = hc_byte_perm_S (w[41], w[42], selector);
      w[54] = hc_byte_perm_S (w[40], w[41], selector);
      w[53] = hc_byte_perm_S (w[39], w[40], selector);
      w[52] = hc_byte_perm_S (w[38], w[39], selector);
      w[51] = hc_byte_perm_S (w[37], w[38], selector);
      w[50] = hc_byte_perm_S (w[36], w[37], selector);
      w[49] = hc_byte_perm_S (w[35], w[36], selector);
      w[48] = hc_byte_perm_S (w[34], w[35], selector);
      w[47] = hc_byte_perm_S (w[33], w[34], selector);
      w[46] = hc_byte_perm_S (w[32], w[33], selector);
      w[45] = hc_byte_perm_S (w[31], w[32], selector);
      w[44] = hc_byte_perm_S (w[30], w[31], selector);
      w[43] = hc_byte_perm_S (w[29], w[30], selector);
      w[42] = hc_byte_perm_S (w[28], w[29], selector);
      w[41] = hc_byte_perm_S (w[27], w[28], selector);
      w[40] = hc_byte_perm_S (w[26], w[27], selector);
      w[39] = hc_byte_perm_S (w[25], w[26], selector);
      w[38] = hc_byte_perm_S (w[24], w[25], selector);
      w[37] = hc_byte_perm_S (w[23], w[24], selector);
      w[36] = hc_byte_perm_S (w[22], w[23], selector);
      w[35] = hc_byte_perm_S (w[21], w[22], selector);
      w[34] = hc_byte_perm_S (w[20], w[21], selector);
      w[33] = hc_byte_perm_S (w[19], w[20], selector);
      w[32] = hc_byte_perm_S (w[18], w[19], selector);
      w[31] = hc_byte_perm_S (w[17], w[18], selector);
      w[30] = hc_byte_perm_S (w[16], w[17], selector);
      w[29] = hc_byte_perm_S (w[15], w[16], selector);
      w[28] = hc_byte_perm_S (w[14], w[15], selector);
      w[27] = hc_byte_perm_S (w[13], w[14], selector);
      w[26] = hc_byte_perm_S (w[12], w[13], selector);
      w[25] = hc_byte_perm_S (w[11], w[12], selector);
      w[24] = hc_byte_perm_S (w[10], w[11], selector);
      w[23] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[22] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[21] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[20] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[19] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[18] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[17] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[16] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[15] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[14] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[13] = hc_byte_perm_S (    0, w[ 0], selector);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_byte_perm_S (w[48], w[49], selector);
      w[62] = hc_byte_perm_S (w[47], w[48], selector);
      w[61] = hc_byte_perm_S (w[46], w[47], selector);
      w[60] = hc_byte_perm_S (w[45], w[46], selector);
      w[59] = hc_byte_perm_S (w[44], w[45], selector);
      w[58] = hc_byte_perm_S (w[43], w[44], selector);
      w[57] = hc_byte_perm_S (w[42], w[43], selector);
      w[56] = hc_byte_perm_S (w[41], w[42], selector);
      w[55] = hc_byte_perm_S (w[40], w[41], selector);
      w[54] = hc_byte_perm_S (w[39], w[40], selector);
      w[53] = hc_byte_perm_S (w[38], w[39], selector);
      w[52] = hc_byte_perm_S (w[37], w[38], selector);
      w[51] = hc_byte_perm_S (w[36], w[37], selector);
      w[50] = hc_byte_perm_S (w[35], w[36], selector);
      w[49] = hc_byte_perm_S (w[34], w[35], selector);
      w[48] = hc_byte_perm_S (w[33], w[34], selector);
      w[47] = hc_byte_perm_S (w[32], w[33], selector);
      w[46] = hc_byte_perm_S (w[31], w[32], selector);
      w[45] = hc_byte_perm_S (w[30], w[31], selector);
      w[44] = hc_byte_perm_S (w[29], w[30], selector);
      w[43] = hc_byte_perm_S (w[28], w[29], selector);
      w[42] = hc_byte_perm_S (w[27], w[28], selector);
      w[41] = hc_byte_perm_S (w[26], w[27], selector);
      w[40] = hc_byte_perm_S (w[25], w[26], selector);
      w[39] = hc_byte_perm_S (w[24], w[25], selector);
      w[38] = hc_byte_perm_S (w[23], w[24], selector);
      w[37] = hc_byte_perm_S (w[22], w[23], selector);
      w[36] = hc_byte_perm_S (w[21], w[22], selector);
      w[35] = hc_byte_perm_S (w[20], w[21], selector);
      w[34] = hc_byte_perm_S (w[19], w[20], selector);
      w[33] = hc_byte_perm_S (w[18], w[19], selector);
      w[32] = hc_byte_perm_S (w[17], w[18], selector);
      w[31] = hc_byte_perm_S (w[16], w[17], selector);
      w[30] = hc_byte_perm_S (w[15], w[16], selector);
      w[29] = hc_byte_perm_S (w[14], w[15], selector);
      w[28] = hc_byte_perm_S (w[13], w[14], selector);
      w[27] = hc_byte_perm_S (w[12], w[13], selector);
      w[26] = hc_byte_perm_S (w[11], w[12], selector);
      w[25] = hc_byte_perm_S (w[10], w[11], selector);
      w[24] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[23] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[22] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[21] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[20] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[19] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[18] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[17] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[16] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[15] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[14] = hc_byte_perm_S (    0, w[ 0], selector);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_byte_perm_S (w[47], w[48], selector);
      w[62] = hc_byte_perm_S (w[46], w[47], selector);
      w[61] = hc_byte_perm_S (w[45], w[46], selector);
      w[60] = hc_byte_perm_S (w[44], w[45], selector);
      w[59] = hc_byte_perm_S (w[43], w[44], selector);
      w[58] = hc_byte_perm_S (w[42], w[43], selector);
      w[57] = hc_byte_perm_S (w[41], w[42], selector);
      w[56] = hc_byte_perm_S (w[40], w[41], selector);
      w[55] = hc_byte_perm_S (w[39], w[40], selector);
      w[54] = hc_byte_perm_S (w[38], w[39], selector);
      w[53] = hc_byte_perm_S (w[37], w[38], selector);
      w[52] = hc_byte_perm_S (w[36], w[37], selector);
      w[51] = hc_byte_perm_S (w[35], w[36], selector);
      w[50] = hc_byte_perm_S (w[34], w[35], selector);
      w[49] = hc_byte_perm_S (w[33], w[34], selector);
      w[48] = hc_byte_perm_S (w[32], w[33], selector);
      w[47] = hc_byte_perm_S (w[31], w[32], selector);
      w[46] = hc_byte_perm_S (w[30], w[31], selector);
      w[45] = hc_byte_perm_S (w[29], w[30], selector);
      w[44] = hc_byte_perm_S (w[28], w[29], selector);
      w[43] = hc_byte_perm_S (w[27], w[28], selector);
      w[42] = hc_byte_perm_S (w[26], w[27], selector);
      w[41] = hc_byte_perm_S (w[25], w[26], selector);
      w[40] = hc_byte_perm_S (w[24], w[25], selector);
      w[39] = hc_byte_perm_S (w[23], w[24], selector);
      w[38] = hc_byte_perm_S (w[22], w[23], selector);
      w[37] = hc_byte_perm_S (w[21], w[22], selector);
      w[36] = hc_byte_perm_S (w[20], w[21], selector);
      w[35] = hc_byte_perm_S (w[19], w[20], selector);
      w[34] = hc_byte_perm_S (w[18], w[19], selector);
      w[33] = hc_byte_perm_S (w[17], w[18], selector);
      w[32] = hc_byte_perm_S (w[16], w[17], selector);
      w[31] = hc_byte_perm_S (w[15], w[16], selector);
      w[30] = hc_byte_perm_S (w[14], w[15], selector);
      w[29] = hc_byte_perm_S (w[13], w[14], selector);
      w[28] = hc_byte_perm_S (w[12], w[13], selector);
      w[27] = hc_byte_perm_S (w[11], w[12], selector);
      w[26] = hc_byte_perm_S (w[10], w[11], selector);
      w[25] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[24] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[23] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[22] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[21] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[20] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[19] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[18] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[17] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[16] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[15] = hc_byte_perm_S (    0, w[ 0], selector);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_byte_perm_S (w[46], w[47], selector);
      w[62] = hc_byte_perm_S (w[45], w[46], selector);
      w[61] = hc_byte_perm_S (w[44], w[45], selector);
      w[60] = hc_byte_perm_S (w[43], w[44], selector);
      w[59] = hc_byte_perm_S (w[42], w[43], selector);
      w[58] = hc_byte_perm_S (w[41], w[42], selector);
      w[57] = hc_byte_perm_S (w[40], w[41], selector);
      w[56] = hc_byte_perm_S (w[39], w[40], selector);
      w[55] = hc_byte_perm_S (w[38], w[39], selector);
      w[54] = hc_byte_perm_S (w[37], w[38], selector);
      w[53] = hc_byte_perm_S (w[36], w[37], selector);
      w[52] = hc_byte_perm_S (w[35], w[36], selector);
      w[51] = hc_byte_perm_S (w[34], w[35], selector);
      w[50] = hc_byte_perm_S (w[33], w[34], selector);
      w[49] = hc_byte_perm_S (w[32], w[33], selector);
      w[48] = hc_byte_perm_S (w[31], w[32], selector);
      w[47] = hc_byte_perm_S (w[30], w[31], selector);
      w[46] = hc_byte_perm_S (w[29], w[30], selector);
      w[45] = hc_byte_perm_S (w[28], w[29], selector);
      w[44] = hc_byte_perm_S (w[27], w[28], selector);
      w[43] = hc_byte_perm_S (w[26], w[27], selector);
      w[42] = hc_byte_perm_S (w[25], w[26], selector);
      w[41] = hc_byte_perm_S (w[24], w[25], selector);
      w[40] = hc_byte_perm_S (w[23], w[24], selector);
      w[39] = hc_byte_perm_S (w[22], w[23], selector);
      w[38] = hc_byte_perm_S (w[21], w[22], selector);
      w[37] = hc_byte_perm_S (w[20], w[21], selector);
      w[36] = hc_byte_perm_S (w[19], w[20], selector);
      w[35] = hc_byte_perm_S (w[18], w[19], selector);
      w[34] = hc_byte_perm_S (w[17], w[18], selector);
      w[33] = hc_byte_perm_S (w[16], w[17], selector);
      w[32] = hc_byte_perm_S (w[15], w[16], selector);
      w[31] = hc_byte_perm_S (w[14], w[15], selector);
      w[30] = hc_byte_perm_S (w[13], w[14], selector);
      w[29] = hc_byte_perm_S (w[12], w[13], selector);
      w[28] = hc_byte_perm_S (w[11], w[12], selector);
      w[27] = hc_byte_perm_S (w[10], w[11], selector);
      w[26] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[25] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[24] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[23] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[22] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[21] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[20] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[19] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[18] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[17] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[16] = hc_byte_perm_S (    0, w[ 0], selector);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_byte_perm_S (w[45], w[46], selector);
      w[62] = hc_byte_perm_S (w[44], w[45], selector);
      w[61] = hc_byte_perm_S (w[43], w[44], selector);
      w[60] = hc_byte_perm_S (w[42], w[43], selector);
      w[59] = hc_byte_perm_S (w[41], w[42], selector);
      w[58] = hc_byte_perm_S (w[40], w[41], selector);
      w[57] = hc_byte_perm_S (w[39], w[40], selector);
      w[56] = hc_byte_perm_S (w[38], w[39], selector);
      w[55] = hc_byte_perm_S (w[37], w[38], selector);
      w[54] = hc_byte_perm_S (w[36], w[37], selector);
      w[53] = hc_byte_perm_S (w[35], w[36], selector);
      w[52] = hc_byte_perm_S (w[34], w[35], selector);
      w[51] = hc_byte_perm_S (w[33], w[34], selector);
      w[50] = hc_byte_perm_S (w[32], w[33], selector);
      w[49] = hc_byte_perm_S (w[31], w[32], selector);
      w[48] = hc_byte_perm_S (w[30], w[31], selector);
      w[47] = hc_byte_perm_S (w[29], w[30], selector);
      w[46] = hc_byte_perm_S (w[28], w[29], selector);
      w[45] = hc_byte_perm_S (w[27], w[28], selector);
      w[44] = hc_byte_perm_S (w[26], w[27], selector);
      w[43] = hc_byte_perm_S (w[25], w[26], selector);
      w[42] = hc_byte_perm_S (w[24], w[25], selector);
      w[41] = hc_byte_perm_S (w[23], w[24], selector);
      w[40] = hc_byte_perm_S (w[22], w[23], selector);
      w[39] = hc_byte_perm_S (w[21], w[22], selector);
      w[38] = hc_byte_perm_S (w[20], w[21], selector);
      w[37] = hc_byte_perm_S (w[19], w[20], selector);
      w[36] = hc_byte_perm_S (w[18], w[19], selector);
      w[35] = hc_byte_perm_S (w[17], w[18], selector);
      w[34] = hc_byte_perm_S (w[16], w[17], selector);
      w[33] = hc_byte_perm_S (w[15], w[16], selector);
      w[32] = hc_byte_perm_S (w[14], w[15], selector);
      w[31] = hc_byte_perm_S (w[13], w[14], selector);
      w[30] = hc_byte_perm_S (w[12], w[13], selector);
      w[29] = hc_byte_perm_S (w[11], w[12], selector);
      w[28] = hc_byte_perm_S (w[10], w[11], selector);
      w[27] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[26] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[25] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[24] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[23] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[22] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[21] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[20] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[19] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[18] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[17] = hc_byte_perm_S (    0, w[ 0], selector);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_byte_perm_S (w[44], w[45], selector);
      w[62] = hc_byte_perm_S (w[43], w[44], selector);
      w[61] = hc_byte_perm_S (w[42], w[43], selector);
      w[60] = hc_byte_perm_S (w[41], w[42], selector);
      w[59] = hc_byte_perm_S (w[40], w[41], selector);
      w[58] = hc_byte_perm_S (w[39], w[40], selector);
      w[57] = hc_byte_perm_S (w[38], w[39], selector);
      w[56] = hc_byte_perm_S (w[37], w[38], selector);
      w[55] = hc_byte_perm_S (w[36], w[37], selector);
      w[54] = hc_byte_perm_S (w[35], w[36], selector);
      w[53] = hc_byte_perm_S (w[34], w[35], selector);
      w[52] = hc_byte_perm_S (w[33], w[34], selector);
      w[51] = hc_byte_perm_S (w[32], w[33], selector);
      w[50] = hc_byte_perm_S (w[31], w[32], selector);
      w[49] = hc_byte_perm_S (w[30], w[31], selector);
      w[48] = hc_byte_perm_S (w[29], w[30], selector);
      w[47] = hc_byte_perm_S (w[28], w[29], selector);
      w[46] = hc_byte_perm_S (w[27], w[28], selector);
      w[45] = hc_byte_perm_S (w[26], w[27], selector);
      w[44] = hc_byte_perm_S (w[25], w[26], selector);
      w[43] = hc_byte_perm_S (w[24], w[25], selector);
      w[42] = hc_byte_perm_S (w[23], w[24], selector);
      w[41] = hc_byte_perm_S (w[22], w[23], selector);
      w[40] = hc_byte_perm_S (w[21], w[22], selector);
      w[39] = hc_byte_perm_S (w[20], w[21], selector);
      w[38] = hc_byte_perm_S (w[19], w[20], selector);
      w[37] = hc_byte_perm_S (w[18], w[19], selector);
      w[36] = hc_byte_perm_S (w[17], w[18], selector);
      w[35] = hc_byte_perm_S (w[16], w[17], selector);
      w[34] = hc_byte_perm_S (w[15], w[16], selector);
      w[33] = hc_byte_perm_S (w[14], w[15], selector);
      w[32] = hc_byte_perm_S (w[13], w[14], selector);
      w[31] = hc_byte_perm_S (w[12], w[13], selector);
      w[30] = hc_byte_perm_S (w[11], w[12], selector);
      w[29] = hc_byte_perm_S (w[10], w[11], selector);
      w[28] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[27] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[26] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[25] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[24] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[23] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[22] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[21] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[20] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[19] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[18] = hc_byte_perm_S (    0, w[ 0], selector);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_byte_perm_S (w[43], w[44], selector);
      w[62] = hc_byte_perm_S (w[42], w[43], selector);
      w[61] = hc_byte_perm_S (w[41], w[42], selector);
      w[60] = hc_byte_perm_S (w[40], w[41], selector);
      w[59] = hc_byte_perm_S (w[39], w[40], selector);
      w[58] = hc_byte_perm_S (w[38], w[39], selector);
      w[57] = hc_byte_perm_S (w[37], w[38], selector);
      w[56] = hc_byte_perm_S (w[36], w[37], selector);
      w[55] = hc_byte_perm_S (w[35], w[36], selector);
      w[54] = hc_byte_perm_S (w[34], w[35], selector);
      w[53] = hc_byte_perm_S (w[33], w[34], selector);
      w[52] = hc_byte_perm_S (w[32], w[33], selector);
      w[51] = hc_byte_perm_S (w[31], w[32], selector);
      w[50] = hc_byte_perm_S (w[30], w[31], selector);
      w[49] = hc_byte_perm_S (w[29], w[30], selector);
      w[48] = hc_byte_perm_S (w[28], w[29], selector);
      w[47] = hc_byte_perm_S (w[27], w[28], selector);
      w[46] = hc_byte_perm_S (w[26], w[27], selector);
      w[45] = hc_byte_perm_S (w[25], w[26], selector);
      w[44] = hc_byte_perm_S (w[24], w[25], selector);
      w[43] = hc_byte_perm_S (w[23], w[24], selector);
      w[42] = hc_byte_perm_S (w[22], w[23], selector);
      w[41] = hc_byte_perm_S (w[21], w[22], selector);
      w[40] = hc_byte_perm_S (w[20], w[21], selector);
      w[39] = hc_byte_perm_S (w[19], w[20], selector);
      w[38] = hc_byte_perm_S (w[18], w[19], selector);
      w[37] = hc_byte_perm_S (w[17], w[18], selector);
      w[36] = hc_byte_perm_S (w[16], w[17], selector);
      w[35] = hc_byte_perm_S (w[15], w[16], selector);
      w[34] = hc_byte_perm_S (w[14], w[15], selector);
      w[33] = hc_byte_perm_S (w[13], w[14], selector);
      w[32] = hc_byte_perm_S (w[12], w[13], selector);
      w[31] = hc_byte_perm_S (w[11], w[12], selector);
      w[30] = hc_byte_perm_S (w[10], w[11], selector);
      w[29] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[28] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[27] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[26] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[25] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[24] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[23] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[22] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[21] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[20] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[19] = hc_byte_perm_S (    0, w[ 0], selector);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_byte_perm_S (w[42], w[43], selector);
      w[62] = hc_byte_perm_S (w[41], w[42], selector);
      w[61] = hc_byte_perm_S (w[40], w[41], selector);
      w[60] = hc_byte_perm_S (w[39], w[40], selector);
      w[59] = hc_byte_perm_S (w[38], w[39], selector);
      w[58] = hc_byte_perm_S (w[37], w[38], selector);
      w[57] = hc_byte_perm_S (w[36], w[37], selector);
      w[56] = hc_byte_perm_S (w[35], w[36], selector);
      w[55] = hc_byte_perm_S (w[34], w[35], selector);
      w[54] = hc_byte_perm_S (w[33], w[34], selector);
      w[53] = hc_byte_perm_S (w[32], w[33], selector);
      w[52] = hc_byte_perm_S (w[31], w[32], selector);
      w[51] = hc_byte_perm_S (w[30], w[31], selector);
      w[50] = hc_byte_perm_S (w[29], w[30], selector);
      w[49] = hc_byte_perm_S (w[28], w[29], selector);
      w[48] = hc_byte_perm_S (w[27], w[28], selector);
      w[47] = hc_byte_perm_S (w[26], w[27], selector);
      w[46] = hc_byte_perm_S (w[25], w[26], selector);
      w[45] = hc_byte_perm_S (w[24], w[25], selector);
      w[44] = hc_byte_perm_S (w[23], w[24], selector);
      w[43] = hc_byte_perm_S (w[22], w[23], selector);
      w[42] = hc_byte_perm_S (w[21], w[22], selector);
      w[41] = hc_byte_perm_S (w[20], w[21], selector);
      w[40] = hc_byte_perm_S (w[19], w[20], selector);
      w[39] = hc_byte_perm_S (w[18], w[19], selector);
      w[38] = hc_byte_perm_S (w[17], w[18], selector);
      w[37] = hc_byte_perm_S (w[16], w[17], selector);
      w[36] = hc_byte_perm_S (w[15], w[16], selector);
      w[35] = hc_byte_perm_S (w[14], w[15], selector);
      w[34] = hc_byte_perm_S (w[13], w[14], selector);
      w[33] = hc_byte_perm_S (w[12], w[13], selector);
      w[32] = hc_byte_perm_S (w[11], w[12], selector);
      w[31] = hc_byte_perm_S (w[10], w[11], selector);
      w[30] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[29] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[28] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[27] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[26] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[25] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[24] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[23] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[22] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[21] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[20] = hc_byte_perm_S (    0, w[ 0], selector);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_byte_perm_S (w[41], w[42], selector);
      w[62] = hc_byte_perm_S (w[40], w[41], selector);
      w[61] = hc_byte_perm_S (w[39], w[40], selector);
      w[60] = hc_byte_perm_S (w[38], w[39], selector);
      w[59] = hc_byte_perm_S (w[37], w[38], selector);
      w[58] = hc_byte_perm_S (w[36], w[37], selector);
      w[57] = hc_byte_perm_S (w[35], w[36], selector);
      w[56] = hc_byte_perm_S (w[34], w[35], selector);
      w[55] = hc_byte_perm_S (w[33], w[34], selector);
      w[54] = hc_byte_perm_S (w[32], w[33], selector);
      w[53] = hc_byte_perm_S (w[31], w[32], selector);
      w[52] = hc_byte_perm_S (w[30], w[31], selector);
      w[51] = hc_byte_perm_S (w[29], w[30], selector);
      w[50] = hc_byte_perm_S (w[28], w[29], selector);
      w[49] = hc_byte_perm_S (w[27], w[28], selector);
      w[48] = hc_byte_perm_S (w[26], w[27], selector);
      w[47] = hc_byte_perm_S (w[25], w[26], selector);
      w[46] = hc_byte_perm_S (w[24], w[25], selector);
      w[45] = hc_byte_perm_S (w[23], w[24], selector);
      w[44] = hc_byte_perm_S (w[22], w[23], selector);
      w[43] = hc_byte_perm_S (w[21], w[22], selector);
      w[42] = hc_byte_perm_S (w[20], w[21], selector);
      w[41] = hc_byte_perm_S (w[19], w[20], selector);
      w[40] = hc_byte_perm_S (w[18], w[19], selector);
      w[39] = hc_byte_perm_S (w[17], w[18], selector);
      w[38] = hc_byte_perm_S (w[16], w[17], selector);
      w[37] = hc_byte_perm_S (w[15], w[16], selector);
      w[36] = hc_byte_perm_S (w[14], w[15], selector);
      w[35] = hc_byte_perm_S (w[13], w[14], selector);
      w[34] = hc_byte_perm_S (w[12], w[13], selector);
      w[33] = hc_byte_perm_S (w[11], w[12], selector);
      w[32] = hc_byte_perm_S (w[10], w[11], selector);
      w[31] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[30] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[29] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[28] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[27] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[26] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[25] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[24] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[23] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[22] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[21] = hc_byte_perm_S (    0, w[ 0], selector);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_byte_perm_S (w[40], w[41], selector);
      w[62] = hc_byte_perm_S (w[39], w[40], selector);
      w[61] = hc_byte_perm_S (w[38], w[39], selector);
      w[60] = hc_byte_perm_S (w[37], w[38], selector);
      w[59] = hc_byte_perm_S (w[36], w[37], selector);
      w[58] = hc_byte_perm_S (w[35], w[36], selector);
      w[57] = hc_byte_perm_S (w[34], w[35], selector);
      w[56] = hc_byte_perm_S (w[33], w[34], selector);
      w[55] = hc_byte_perm_S (w[32], w[33], selector);
      w[54] = hc_byte_perm_S (w[31], w[32], selector);
      w[53] = hc_byte_perm_S (w[30], w[31], selector);
      w[52] = hc_byte_perm_S (w[29], w[30], selector);
      w[51] = hc_byte_perm_S (w[28], w[29], selector);
      w[50] = hc_byte_perm_S (w[27], w[28], selector);
      w[49] = hc_byte_perm_S (w[26], w[27], selector);
      w[48] = hc_byte_perm_S (w[25], w[26], selector);
      w[47] = hc_byte_perm_S (w[24], w[25], selector);
      w[46] = hc_byte_perm_S (w[23], w[24], selector);
      w[45] = hc_byte_perm_S (w[22], w[23], selector);
      w[44] = hc_byte_perm_S (w[21], w[22], selector);
      w[43] = hc_byte_perm_S (w[20], w[21], selector);
      w[42] = hc_byte_perm_S (w[19], w[20], selector);
      w[41] = hc_byte_perm_S (w[18], w[19], selector);
      w[40] = hc_byte_perm_S (w[17], w[18], selector);
      w[39] = hc_byte_perm_S (w[16], w[17], selector);
      w[38] = hc_byte_perm_S (w[15], w[16], selector);
      w[37] = hc_byte_perm_S (w[14], w[15], selector);
      w[36] = hc_byte_perm_S (w[13], w[14], selector);
      w[35] = hc_byte_perm_S (w[12], w[13], selector);
      w[34] = hc_byte_perm_S (w[11], w[12], selector);
      w[33] = hc_byte_perm_S (w[10], w[11], selector);
      w[32] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[31] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[30] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[29] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[28] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[27] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[26] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[25] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[24] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[23] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[22] = hc_byte_perm_S (    0, w[ 0], selector);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_byte_perm_S (w[39], w[40], selector);
      w[62] = hc_byte_perm_S (w[38], w[39], selector);
      w[61] = hc_byte_perm_S (w[37], w[38], selector);
      w[60] = hc_byte_perm_S (w[36], w[37], selector);
      w[59] = hc_byte_perm_S (w[35], w[36], selector);
      w[58] = hc_byte_perm_S (w[34], w[35], selector);
      w[57] = hc_byte_perm_S (w[33], w[34], selector);
      w[56] = hc_byte_perm_S (w[32], w[33], selector);
      w[55] = hc_byte_perm_S (w[31], w[32], selector);
      w[54] = hc_byte_perm_S (w[30], w[31], selector);
      w[53] = hc_byte_perm_S (w[29], w[30], selector);
      w[52] = hc_byte_perm_S (w[28], w[29], selector);
      w[51] = hc_byte_perm_S (w[27], w[28], selector);
      w[50] = hc_byte_perm_S (w[26], w[27], selector);
      w[49] = hc_byte_perm_S (w[25], w[26], selector);
      w[48] = hc_byte_perm_S (w[24], w[25], selector);
      w[47] = hc_byte_perm_S (w[23], w[24], selector);
      w[46] = hc_byte_perm_S (w[22], w[23], selector);
      w[45] = hc_byte_perm_S (w[21], w[22], selector);
      w[44] = hc_byte_perm_S (w[20], w[21], selector);
      w[43] = hc_byte_perm_S (w[19], w[20], selector);
      w[42] = hc_byte_perm_S (w[18], w[19], selector);
      w[41] = hc_byte_perm_S (w[17], w[18], selector);
      w[40] = hc_byte_perm_S (w[16], w[17], selector);
      w[39] = hc_byte_perm_S (w[15], w[16], selector);
      w[38] = hc_byte_perm_S (w[14], w[15], selector);
      w[37] = hc_byte_perm_S (w[13], w[14], selector);
      w[36] = hc_byte_perm_S (w[12], w[13], selector);
      w[35] = hc_byte_perm_S (w[11], w[12], selector);
      w[34] = hc_byte_perm_S (w[10], w[11], selector);
      w[33] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[32] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[31] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[30] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[29] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[28] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[27] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[26] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[25] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[24] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[23] = hc_byte_perm_S (    0, w[ 0], selector);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_byte_perm_S (w[38], w[39], selector);
      w[62] = hc_byte_perm_S (w[37], w[38], selector);
      w[61] = hc_byte_perm_S (w[36], w[37], selector);
      w[60] = hc_byte_perm_S (w[35], w[36], selector);
      w[59] = hc_byte_perm_S (w[34], w[35], selector);
      w[58] = hc_byte_perm_S (w[33], w[34], selector);
      w[57] = hc_byte_perm_S (w[32], w[33], selector);
      w[56] = hc_byte_perm_S (w[31], w[32], selector);
      w[55] = hc_byte_perm_S (w[30], w[31], selector);
      w[54] = hc_byte_perm_S (w[29], w[30], selector);
      w[53] = hc_byte_perm_S (w[28], w[29], selector);
      w[52] = hc_byte_perm_S (w[27], w[28], selector);
      w[51] = hc_byte_perm_S (w[26], w[27], selector);
      w[50] = hc_byte_perm_S (w[25], w[26], selector);
      w[49] = hc_byte_perm_S (w[24], w[25], selector);
      w[48] = hc_byte_perm_S (w[23], w[24], selector);
      w[47] = hc_byte_perm_S (w[22], w[23], selector);
      w[46] = hc_byte_perm_S (w[21], w[22], selector);
      w[45] = hc_byte_perm_S (w[20], w[21], selector);
      w[44] = hc_byte_perm_S (w[19], w[20], selector);
      w[43] = hc_byte_perm_S (w[18], w[19], selector);
      w[42] = hc_byte_perm_S (w[17], w[18], selector);
      w[41] = hc_byte_perm_S (w[16], w[17], selector);
      w[40] = hc_byte_perm_S (w[15], w[16], selector);
      w[39] = hc_byte_perm_S (w[14], w[15], selector);
      w[38] = hc_byte_perm_S (w[13], w[14], selector);
      w[37] = hc_byte_perm_S (w[12], w[13], selector);
      w[36] = hc_byte_perm_S (w[11], w[12], selector);
      w[35] = hc_byte_perm_S (w[10], w[11], selector);
      w[34] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[33] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[32] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[31] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[30] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[29] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[28] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[27] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[26] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[25] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[24] = hc_byte_perm_S (    0, w[ 0], selector);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_byte_perm_S (w[37], w[38], selector);
      w[62] = hc_byte_perm_S (w[36], w[37], selector);
      w[61] = hc_byte_perm_S (w[35], w[36], selector);
      w[60] = hc_byte_perm_S (w[34], w[35], selector);
      w[59] = hc_byte_perm_S (w[33], w[34], selector);
      w[58] = hc_byte_perm_S (w[32], w[33], selector);
      w[57] = hc_byte_perm_S (w[31], w[32], selector);
      w[56] = hc_byte_perm_S (w[30], w[31], selector);
      w[55] = hc_byte_perm_S (w[29], w[30], selector);
      w[54] = hc_byte_perm_S (w[28], w[29], selector);
      w[53] = hc_byte_perm_S (w[27], w[28], selector);
      w[52] = hc_byte_perm_S (w[26], w[27], selector);
      w[51] = hc_byte_perm_S (w[25], w[26], selector);
      w[50] = hc_byte_perm_S (w[24], w[25], selector);
      w[49] = hc_byte_perm_S (w[23], w[24], selector);
      w[48] = hc_byte_perm_S (w[22], w[23], selector);
      w[47] = hc_byte_perm_S (w[21], w[22], selector);
      w[46] = hc_byte_perm_S (w[20], w[21], selector);
      w[45] = hc_byte_perm_S (w[19], w[20], selector);
      w[44] = hc_byte_perm_S (w[18], w[19], selector);
      w[43] = hc_byte_perm_S (w[17], w[18], selector);
      w[42] = hc_byte_perm_S (w[16], w[17], selector);
      w[41] = hc_byte_perm_S (w[15], w[16], selector);
      w[40] = hc_byte_perm_S (w[14], w[15], selector);
      w[39] = hc_byte_perm_S (w[13], w[14], selector);
      w[38] = hc_byte_perm_S (w[12], w[13], selector);
      w[37] = hc_byte_perm_S (w[11], w[12], selector);
      w[36] = hc_byte_perm_S (w[10], w[11], selector);
      w[35] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[34] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[33] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[32] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[31] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[30] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[29] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[28] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[27] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[26] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[25] = hc_byte_perm_S (    0, w[ 0], selector);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_byte_perm_S (w[36], w[37], selector);
      w[62] = hc_byte_perm_S (w[35], w[36], selector);
      w[61] = hc_byte_perm_S (w[34], w[35], selector);
      w[60] = hc_byte_perm_S (w[33], w[34], selector);
      w[59] = hc_byte_perm_S (w[32], w[33], selector);
      w[58] = hc_byte_perm_S (w[31], w[32], selector);
      w[57] = hc_byte_perm_S (w[30], w[31], selector);
      w[56] = hc_byte_perm_S (w[29], w[30], selector);
      w[55] = hc_byte_perm_S (w[28], w[29], selector);
      w[54] = hc_byte_perm_S (w[27], w[28], selector);
      w[53] = hc_byte_perm_S (w[26], w[27], selector);
      w[52] = hc_byte_perm_S (w[25], w[26], selector);
      w[51] = hc_byte_perm_S (w[24], w[25], selector);
      w[50] = hc_byte_perm_S (w[23], w[24], selector);
      w[49] = hc_byte_perm_S (w[22], w[23], selector);
      w[48] = hc_byte_perm_S (w[21], w[22], selector);
      w[47] = hc_byte_perm_S (w[20], w[21], selector);
      w[46] = hc_byte_perm_S (w[19], w[20], selector);
      w[45] = hc_byte_perm_S (w[18], w[19], selector);
      w[44] = hc_byte_perm_S (w[17], w[18], selector);
      w[43] = hc_byte_perm_S (w[16], w[17], selector);
      w[42] = hc_byte_perm_S (w[15], w[16], selector);
      w[41] = hc_byte_perm_S (w[14], w[15], selector);
      w[40] = hc_byte_perm_S (w[13], w[14], selector);
      w[39] = hc_byte_perm_S (w[12], w[13], selector);
      w[38] = hc_byte_perm_S (w[11], w[12], selector);
      w[37] = hc_byte_perm_S (w[10], w[11], selector);
      w[36] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[35] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[34] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[33] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[32] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[31] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[30] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[29] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[28] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[27] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[26] = hc_byte_perm_S (    0, w[ 0], selector);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_byte_perm_S (w[35], w[36], selector);
      w[62] = hc_byte_perm_S (w[34], w[35], selector);
      w[61] = hc_byte_perm_S (w[33], w[34], selector);
      w[60] = hc_byte_perm_S (w[32], w[33], selector);
      w[59] = hc_byte_perm_S (w[31], w[32], selector);
      w[58] = hc_byte_perm_S (w[30], w[31], selector);
      w[57] = hc_byte_perm_S (w[29], w[30], selector);
      w[56] = hc_byte_perm_S (w[28], w[29], selector);
      w[55] = hc_byte_perm_S (w[27], w[28], selector);
      w[54] = hc_byte_perm_S (w[26], w[27], selector);
      w[53] = hc_byte_perm_S (w[25], w[26], selector);
      w[52] = hc_byte_perm_S (w[24], w[25], selector);
      w[51] = hc_byte_perm_S (w[23], w[24], selector);
      w[50] = hc_byte_perm_S (w[22], w[23], selector);
      w[49] = hc_byte_perm_S (w[21], w[22], selector);
      w[48] = hc_byte_perm_S (w[20], w[21], selector);
      w[47] = hc_byte_perm_S (w[19], w[20], selector);
      w[46] = hc_byte_perm_S (w[18], w[19], selector);
      w[45] = hc_byte_perm_S (w[17], w[18], selector);
      w[44] = hc_byte_perm_S (w[16], w[17], selector);
      w[43] = hc_byte_perm_S (w[15], w[16], selector);
      w[42] = hc_byte_perm_S (w[14], w[15], selector);
      w[41] = hc_byte_perm_S (w[13], w[14], selector);
      w[40] = hc_byte_perm_S (w[12], w[13], selector);
      w[39] = hc_byte_perm_S (w[11], w[12], selector);
      w[38] = hc_byte_perm_S (w[10], w[11], selector);
      w[37] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[36] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[35] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[34] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[33] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[32] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[31] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[30] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[29] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[28] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[27] = hc_byte_perm_S (    0, w[ 0], selector);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_byte_perm_S (w[34], w[35], selector);
      w[62] = hc_byte_perm_S (w[33], w[34], selector);
      w[61] = hc_byte_perm_S (w[32], w[33], selector);
      w[60] = hc_byte_perm_S (w[31], w[32], selector);
      w[59] = hc_byte_perm_S (w[30], w[31], selector);
      w[58] = hc_byte_perm_S (w[29], w[30], selector);
      w[57] = hc_byte_perm_S (w[28], w[29], selector);
      w[56] = hc_byte_perm_S (w[27], w[28], selector);
      w[55] = hc_byte_perm_S (w[26], w[27], selector);
      w[54] = hc_byte_perm_S (w[25], w[26], selector);
      w[53] = hc_byte_perm_S (w[24], w[25], selector);
      w[52] = hc_byte_perm_S (w[23], w[24], selector);
      w[51] = hc_byte_perm_S (w[22], w[23], selector);
      w[50] = hc_byte_perm_S (w[21], w[22], selector);
      w[49] = hc_byte_perm_S (w[20], w[21], selector);
      w[48] = hc_byte_perm_S (w[19], w[20], selector);
      w[47] = hc_byte_perm_S (w[18], w[19], selector);
      w[46] = hc_byte_perm_S (w[17], w[18], selector);
      w[45] = hc_byte_perm_S (w[16], w[17], selector);
      w[44] = hc_byte_perm_S (w[15], w[16], selector);
      w[43] = hc_byte_perm_S (w[14], w[15], selector);
      w[42] = hc_byte_perm_S (w[13], w[14], selector);
      w[41] = hc_byte_perm_S (w[12], w[13], selector);
      w[40] = hc_byte_perm_S (w[11], w[12], selector);
      w[39] = hc_byte_perm_S (w[10], w[11], selector);
      w[38] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[37] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[36] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[35] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[34] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[33] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[32] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[31] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[30] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[29] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[28] = hc_byte_perm_S (    0, w[ 0], selector);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_byte_perm_S (w[33], w[34], selector);
      w[62] = hc_byte_perm_S (w[32], w[33], selector);
      w[61] = hc_byte_perm_S (w[31], w[32], selector);
      w[60] = hc_byte_perm_S (w[30], w[31], selector);
      w[59] = hc_byte_perm_S (w[29], w[30], selector);
      w[58] = hc_byte_perm_S (w[28], w[29], selector);
      w[57] = hc_byte_perm_S (w[27], w[28], selector);
      w[56] = hc_byte_perm_S (w[26], w[27], selector);
      w[55] = hc_byte_perm_S (w[25], w[26], selector);
      w[54] = hc_byte_perm_S (w[24], w[25], selector);
      w[53] = hc_byte_perm_S (w[23], w[24], selector);
      w[52] = hc_byte_perm_S (w[22], w[23], selector);
      w[51] = hc_byte_perm_S (w[21], w[22], selector);
      w[50] = hc_byte_perm_S (w[20], w[21], selector);
      w[49] = hc_byte_perm_S (w[19], w[20], selector);
      w[48] = hc_byte_perm_S (w[18], w[19], selector);
      w[47] = hc_byte_perm_S (w[17], w[18], selector);
      w[46] = hc_byte_perm_S (w[16], w[17], selector);
      w[45] = hc_byte_perm_S (w[15], w[16], selector);
      w[44] = hc_byte_perm_S (w[14], w[15], selector);
      w[43] = hc_byte_perm_S (w[13], w[14], selector);
      w[42] = hc_byte_perm_S (w[12], w[13], selector);
      w[41] = hc_byte_perm_S (w[11], w[12], selector);
      w[40] = hc_byte_perm_S (w[10], w[11], selector);
      w[39] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[38] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[37] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[36] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[35] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[34] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[33] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[32] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[31] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[30] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[29] = hc_byte_perm_S (    0, w[ 0], selector);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_byte_perm_S (w[32], w[33], selector);
      w[62] = hc_byte_perm_S (w[31], w[32], selector);
      w[61] = hc_byte_perm_S (w[30], w[31], selector);
      w[60] = hc_byte_perm_S (w[29], w[30], selector);
      w[59] = hc_byte_perm_S (w[28], w[29], selector);
      w[58] = hc_byte_perm_S (w[27], w[28], selector);
      w[57] = hc_byte_perm_S (w[26], w[27], selector);
      w[56] = hc_byte_perm_S (w[25], w[26], selector);
      w[55] = hc_byte_perm_S (w[24], w[25], selector);
      w[54] = hc_byte_perm_S (w[23], w[24], selector);
      w[53] = hc_byte_perm_S (w[22], w[23], selector);
      w[52] = hc_byte_perm_S (w[21], w[22], selector);
      w[51] = hc_byte_perm_S (w[20], w[21], selector);
      w[50] = hc_byte_perm_S (w[19], w[20], selector);
      w[49] = hc_byte_perm_S (w[18], w[19], selector);
      w[48] = hc_byte_perm_S (w[17], w[18], selector);
      w[47] = hc_byte_perm_S (w[16], w[17], selector);
      w[46] = hc_byte_perm_S (w[15], w[16], selector);
      w[45] = hc_byte_perm_S (w[14], w[15], selector);
      w[44] = hc_byte_perm_S (w[13], w[14], selector);
      w[43] = hc_byte_perm_S (w[12], w[13], selector);
      w[42] = hc_byte_perm_S (w[11], w[12], selector);
      w[41] = hc_byte_perm_S (w[10], w[11], selector);
      w[40] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[39] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[38] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[37] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[36] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[35] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[34] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[33] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[32] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[31] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[30] = hc_byte_perm_S (    0, w[ 0], selector);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_byte_perm_S (w[31], w[32], selector);
      w[62] = hc_byte_perm_S (w[30], w[31], selector);
      w[61] = hc_byte_perm_S (w[29], w[30], selector);
      w[60] = hc_byte_perm_S (w[28], w[29], selector);
      w[59] = hc_byte_perm_S (w[27], w[28], selector);
      w[58] = hc_byte_perm_S (w[26], w[27], selector);
      w[57] = hc_byte_perm_S (w[25], w[26], selector);
      w[56] = hc_byte_perm_S (w[24], w[25], selector);
      w[55] = hc_byte_perm_S (w[23], w[24], selector);
      w[54] = hc_byte_perm_S (w[22], w[23], selector);
      w[53] = hc_byte_perm_S (w[21], w[22], selector);
      w[52] = hc_byte_perm_S (w[20], w[21], selector);
      w[51] = hc_byte_perm_S (w[19], w[20], selector);
      w[50] = hc_byte_perm_S (w[18], w[19], selector);
      w[49] = hc_byte_perm_S (w[17], w[18], selector);
      w[48] = hc_byte_perm_S (w[16], w[17], selector);
      w[47] = hc_byte_perm_S (w[15], w[16], selector);
      w[46] = hc_byte_perm_S (w[14], w[15], selector);
      w[45] = hc_byte_perm_S (w[13], w[14], selector);
      w[44] = hc_byte_perm_S (w[12], w[13], selector);
      w[43] = hc_byte_perm_S (w[11], w[12], selector);
      w[42] = hc_byte_perm_S (w[10], w[11], selector);
      w[41] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[40] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[39] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[38] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[37] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[36] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[35] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[34] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[33] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[32] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[31] = hc_byte_perm_S (    0, w[ 0], selector);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_byte_perm_S (w[30], w[31], selector);
      w[62] = hc_byte_perm_S (w[29], w[30], selector);
      w[61] = hc_byte_perm_S (w[28], w[29], selector);
      w[60] = hc_byte_perm_S (w[27], w[28], selector);
      w[59] = hc_byte_perm_S (w[26], w[27], selector);
      w[58] = hc_byte_perm_S (w[25], w[26], selector);
      w[57] = hc_byte_perm_S (w[24], w[25], selector);
      w[56] = hc_byte_perm_S (w[23], w[24], selector);
      w[55] = hc_byte_perm_S (w[22], w[23], selector);
      w[54] = hc_byte_perm_S (w[21], w[22], selector);
      w[53] = hc_byte_perm_S (w[20], w[21], selector);
      w[52] = hc_byte_perm_S (w[19], w[20], selector);
      w[51] = hc_byte_perm_S (w[18], w[19], selector);
      w[50] = hc_byte_perm_S (w[17], w[18], selector);
      w[49] = hc_byte_perm_S (w[16], w[17], selector);
      w[48] = hc_byte_perm_S (w[15], w[16], selector);
      w[47] = hc_byte_perm_S (w[14], w[15], selector);
      w[46] = hc_byte_perm_S (w[13], w[14], selector);
      w[45] = hc_byte_perm_S (w[12], w[13], selector);
      w[44] = hc_byte_perm_S (w[11], w[12], selector);
      w[43] = hc_byte_perm_S (w[10], w[11], selector);
      w[42] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[41] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[40] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[39] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[38] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[37] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[36] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[35] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[34] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[33] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[32] = hc_byte_perm_S (    0, w[ 0], selector);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_byte_perm_S (w[29], w[30], selector);
      w[62] = hc_byte_perm_S (w[28], w[29], selector);
      w[61] = hc_byte_perm_S (w[27], w[28], selector);
      w[60] = hc_byte_perm_S (w[26], w[27], selector);
      w[59] = hc_byte_perm_S (w[25], w[26], selector);
      w[58] = hc_byte_perm_S (w[24], w[25], selector);
      w[57] = hc_byte_perm_S (w[23], w[24], selector);
      w[56] = hc_byte_perm_S (w[22], w[23], selector);
      w[55] = hc_byte_perm_S (w[21], w[22], selector);
      w[54] = hc_byte_perm_S (w[20], w[21], selector);
      w[53] = hc_byte_perm_S (w[19], w[20], selector);
      w[52] = hc_byte_perm_S (w[18], w[19], selector);
      w[51] = hc_byte_perm_S (w[17], w[18], selector);
      w[50] = hc_byte_perm_S (w[16], w[17], selector);
      w[49] = hc_byte_perm_S (w[15], w[16], selector);
      w[48] = hc_byte_perm_S (w[14], w[15], selector);
      w[47] = hc_byte_perm_S (w[13], w[14], selector);
      w[46] = hc_byte_perm_S (w[12], w[13], selector);
      w[45] = hc_byte_perm_S (w[11], w[12], selector);
      w[44] = hc_byte_perm_S (w[10], w[11], selector);
      w[43] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[42] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[41] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[40] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[39] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[38] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[37] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[36] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[35] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[34] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[33] = hc_byte_perm_S (    0, w[ 0], selector);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_byte_perm_S (w[28], w[29], selector);
      w[62] = hc_byte_perm_S (w[27], w[28], selector);
      w[61] = hc_byte_perm_S (w[26], w[27], selector);
      w[60] = hc_byte_perm_S (w[25], w[26], selector);
      w[59] = hc_byte_perm_S (w[24], w[25], selector);
      w[58] = hc_byte_perm_S (w[23], w[24], selector);
      w[57] = hc_byte_perm_S (w[22], w[23], selector);
      w[56] = hc_byte_perm_S (w[21], w[22], selector);
      w[55] = hc_byte_perm_S (w[20], w[21], selector);
      w[54] = hc_byte_perm_S (w[19], w[20], selector);
      w[53] = hc_byte_perm_S (w[18], w[19], selector);
      w[52] = hc_byte_perm_S (w[17], w[18], selector);
      w[51] = hc_byte_perm_S (w[16], w[17], selector);
      w[50] = hc_byte_perm_S (w[15], w[16], selector);
      w[49] = hc_byte_perm_S (w[14], w[15], selector);
      w[48] = hc_byte_perm_S (w[13], w[14], selector);
      w[47] = hc_byte_perm_S (w[12], w[13], selector);
      w[46] = hc_byte_perm_S (w[11], w[12], selector);
      w[45] = hc_byte_perm_S (w[10], w[11], selector);
      w[44] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[43] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[42] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[41] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[40] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[39] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[38] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[37] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[36] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[35] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[34] = hc_byte_perm_S (    0, w[ 0], selector);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_byte_perm_S (w[27], w[28], selector);
      w[62] = hc_byte_perm_S (w[26], w[27], selector);
      w[61] = hc_byte_perm_S (w[25], w[26], selector);
      w[60] = hc_byte_perm_S (w[24], w[25], selector);
      w[59] = hc_byte_perm_S (w[23], w[24], selector);
      w[58] = hc_byte_perm_S (w[22], w[23], selector);
      w[57] = hc_byte_perm_S (w[21], w[22], selector);
      w[56] = hc_byte_perm_S (w[20], w[21], selector);
      w[55] = hc_byte_perm_S (w[19], w[20], selector);
      w[54] = hc_byte_perm_S (w[18], w[19], selector);
      w[53] = hc_byte_perm_S (w[17], w[18], selector);
      w[52] = hc_byte_perm_S (w[16], w[17], selector);
      w[51] = hc_byte_perm_S (w[15], w[16], selector);
      w[50] = hc_byte_perm_S (w[14], w[15], selector);
      w[49] = hc_byte_perm_S (w[13], w[14], selector);
      w[48] = hc_byte_perm_S (w[12], w[13], selector);
      w[47] = hc_byte_perm_S (w[11], w[12], selector);
      w[46] = hc_byte_perm_S (w[10], w[11], selector);
      w[45] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[44] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[43] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[42] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[41] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[40] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[39] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[38] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[37] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[36] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[35] = hc_byte_perm_S (    0, w[ 0], selector);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_byte_perm_S (w[26], w[27], selector);
      w[62] = hc_byte_perm_S (w[25], w[26], selector);
      w[61] = hc_byte_perm_S (w[24], w[25], selector);
      w[60] = hc_byte_perm_S (w[23], w[24], selector);
      w[59] = hc_byte_perm_S (w[22], w[23], selector);
      w[58] = hc_byte_perm_S (w[21], w[22], selector);
      w[57] = hc_byte_perm_S (w[20], w[21], selector);
      w[56] = hc_byte_perm_S (w[19], w[20], selector);
      w[55] = hc_byte_perm_S (w[18], w[19], selector);
      w[54] = hc_byte_perm_S (w[17], w[18], selector);
      w[53] = hc_byte_perm_S (w[16], w[17], selector);
      w[52] = hc_byte_perm_S (w[15], w[16], selector);
      w[51] = hc_byte_perm_S (w[14], w[15], selector);
      w[50] = hc_byte_perm_S (w[13], w[14], selector);
      w[49] = hc_byte_perm_S (w[12], w[13], selector);
      w[48] = hc_byte_perm_S (w[11], w[12], selector);
      w[47] = hc_byte_perm_S (w[10], w[11], selector);
      w[46] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[45] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[44] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[43] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[42] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[41] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[40] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[39] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[38] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[37] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[36] = hc_byte_perm_S (    0, w[ 0], selector);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_byte_perm_S (w[25], w[26], selector);
      w[62] = hc_byte_perm_S (w[24], w[25], selector);
      w[61] = hc_byte_perm_S (w[23], w[24], selector);
      w[60] = hc_byte_perm_S (w[22], w[23], selector);
      w[59] = hc_byte_perm_S (w[21], w[22], selector);
      w[58] = hc_byte_perm_S (w[20], w[21], selector);
      w[57] = hc_byte_perm_S (w[19], w[20], selector);
      w[56] = hc_byte_perm_S (w[18], w[19], selector);
      w[55] = hc_byte_perm_S (w[17], w[18], selector);
      w[54] = hc_byte_perm_S (w[16], w[17], selector);
      w[53] = hc_byte_perm_S (w[15], w[16], selector);
      w[52] = hc_byte_perm_S (w[14], w[15], selector);
      w[51] = hc_byte_perm_S (w[13], w[14], selector);
      w[50] = hc_byte_perm_S (w[12], w[13], selector);
      w[49] = hc_byte_perm_S (w[11], w[12], selector);
      w[48] = hc_byte_perm_S (w[10], w[11], selector);
      w[47] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[46] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[45] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[44] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[43] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[42] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[41] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[40] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[39] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[38] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[37] = hc_byte_perm_S (    0, w[ 0], selector);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_byte_perm_S (w[24], w[25], selector);
      w[62] = hc_byte_perm_S (w[23], w[24], selector);
      w[61] = hc_byte_perm_S (w[22], w[23], selector);
      w[60] = hc_byte_perm_S (w[21], w[22], selector);
      w[59] = hc_byte_perm_S (w[20], w[21], selector);
      w[58] = hc_byte_perm_S (w[19], w[20], selector);
      w[57] = hc_byte_perm_S (w[18], w[19], selector);
      w[56] = hc_byte_perm_S (w[17], w[18], selector);
      w[55] = hc_byte_perm_S (w[16], w[17], selector);
      w[54] = hc_byte_perm_S (w[15], w[16], selector);
      w[53] = hc_byte_perm_S (w[14], w[15], selector);
      w[52] = hc_byte_perm_S (w[13], w[14], selector);
      w[51] = hc_byte_perm_S (w[12], w[13], selector);
      w[50] = hc_byte_perm_S (w[11], w[12], selector);
      w[49] = hc_byte_perm_S (w[10], w[11], selector);
      w[48] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[47] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[46] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[45] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[44] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[43] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[42] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[41] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[40] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[39] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[38] = hc_byte_perm_S (    0, w[ 0], selector);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_byte_perm_S (w[23], w[24], selector);
      w[62] = hc_byte_perm_S (w[22], w[23], selector);
      w[61] = hc_byte_perm_S (w[21], w[22], selector);
      w[60] = hc_byte_perm_S (w[20], w[21], selector);
      w[59] = hc_byte_perm_S (w[19], w[20], selector);
      w[58] = hc_byte_perm_S (w[18], w[19], selector);
      w[57] = hc_byte_perm_S (w[17], w[18], selector);
      w[56] = hc_byte_perm_S (w[16], w[17], selector);
      w[55] = hc_byte_perm_S (w[15], w[16], selector);
      w[54] = hc_byte_perm_S (w[14], w[15], selector);
      w[53] = hc_byte_perm_S (w[13], w[14], selector);
      w[52] = hc_byte_perm_S (w[12], w[13], selector);
      w[51] = hc_byte_perm_S (w[11], w[12], selector);
      w[50] = hc_byte_perm_S (w[10], w[11], selector);
      w[49] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[48] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[47] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[46] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[45] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[44] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[43] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[42] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[41] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[40] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[39] = hc_byte_perm_S (    0, w[ 0], selector);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_byte_perm_S (w[22], w[23], selector);
      w[62] = hc_byte_perm_S (w[21], w[22], selector);
      w[61] = hc_byte_perm_S (w[20], w[21], selector);
      w[60] = hc_byte_perm_S (w[19], w[20], selector);
      w[59] = hc_byte_perm_S (w[18], w[19], selector);
      w[58] = hc_byte_perm_S (w[17], w[18], selector);
      w[57] = hc_byte_perm_S (w[16], w[17], selector);
      w[56] = hc_byte_perm_S (w[15], w[16], selector);
      w[55] = hc_byte_perm_S (w[14], w[15], selector);
      w[54] = hc_byte_perm_S (w[13], w[14], selector);
      w[53] = hc_byte_perm_S (w[12], w[13], selector);
      w[52] = hc_byte_perm_S (w[11], w[12], selector);
      w[51] = hc_byte_perm_S (w[10], w[11], selector);
      w[50] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[49] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[48] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[47] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[46] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[45] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[44] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[43] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[42] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[41] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[40] = hc_byte_perm_S (    0, w[ 0], selector);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_byte_perm_S (w[21], w[22], selector);
      w[62] = hc_byte_perm_S (w[20], w[21], selector);
      w[61] = hc_byte_perm_S (w[19], w[20], selector);
      w[60] = hc_byte_perm_S (w[18], w[19], selector);
      w[59] = hc_byte_perm_S (w[17], w[18], selector);
      w[58] = hc_byte_perm_S (w[16], w[17], selector);
      w[57] = hc_byte_perm_S (w[15], w[16], selector);
      w[56] = hc_byte_perm_S (w[14], w[15], selector);
      w[55] = hc_byte_perm_S (w[13], w[14], selector);
      w[54] = hc_byte_perm_S (w[12], w[13], selector);
      w[53] = hc_byte_perm_S (w[11], w[12], selector);
      w[52] = hc_byte_perm_S (w[10], w[11], selector);
      w[51] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[50] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[49] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[48] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[47] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[46] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[45] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[44] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[43] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[42] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[41] = hc_byte_perm_S (    0, w[ 0], selector);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_byte_perm_S (w[20], w[21], selector);
      w[62] = hc_byte_perm_S (w[19], w[20], selector);
      w[61] = hc_byte_perm_S (w[18], w[19], selector);
      w[60] = hc_byte_perm_S (w[17], w[18], selector);
      w[59] = hc_byte_perm_S (w[16], w[17], selector);
      w[58] = hc_byte_perm_S (w[15], w[16], selector);
      w[57] = hc_byte_perm_S (w[14], w[15], selector);
      w[56] = hc_byte_perm_S (w[13], w[14], selector);
      w[55] = hc_byte_perm_S (w[12], w[13], selector);
      w[54] = hc_byte_perm_S (w[11], w[12], selector);
      w[53] = hc_byte_perm_S (w[10], w[11], selector);
      w[52] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[51] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[50] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[49] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[48] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[47] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[46] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[45] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[44] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[43] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[42] = hc_byte_perm_S (    0, w[ 0], selector);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_byte_perm_S (w[19], w[20], selector);
      w[62] = hc_byte_perm_S (w[18], w[19], selector);
      w[61] = hc_byte_perm_S (w[17], w[18], selector);
      w[60] = hc_byte_perm_S (w[16], w[17], selector);
      w[59] = hc_byte_perm_S (w[15], w[16], selector);
      w[58] = hc_byte_perm_S (w[14], w[15], selector);
      w[57] = hc_byte_perm_S (w[13], w[14], selector);
      w[56] = hc_byte_perm_S (w[12], w[13], selector);
      w[55] = hc_byte_perm_S (w[11], w[12], selector);
      w[54] = hc_byte_perm_S (w[10], w[11], selector);
      w[53] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[52] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[51] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[50] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[49] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[48] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[47] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[46] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[45] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[44] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[43] = hc_byte_perm_S (    0, w[ 0], selector);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_byte_perm_S (w[18], w[19], selector);
      w[62] = hc_byte_perm_S (w[17], w[18], selector);
      w[61] = hc_byte_perm_S (w[16], w[17], selector);
      w[60] = hc_byte_perm_S (w[15], w[16], selector);
      w[59] = hc_byte_perm_S (w[14], w[15], selector);
      w[58] = hc_byte_perm_S (w[13], w[14], selector);
      w[57] = hc_byte_perm_S (w[12], w[13], selector);
      w[56] = hc_byte_perm_S (w[11], w[12], selector);
      w[55] = hc_byte_perm_S (w[10], w[11], selector);
      w[54] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[53] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[52] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[51] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[50] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[49] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[48] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[47] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[46] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[45] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[44] = hc_byte_perm_S (    0, w[ 0], selector);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_byte_perm_S (w[17], w[18], selector);
      w[62] = hc_byte_perm_S (w[16], w[17], selector);
      w[61] = hc_byte_perm_S (w[15], w[16], selector);
      w[60] = hc_byte_perm_S (w[14], w[15], selector);
      w[59] = hc_byte_perm_S (w[13], w[14], selector);
      w[58] = hc_byte_perm_S (w[12], w[13], selector);
      w[57] = hc_byte_perm_S (w[11], w[12], selector);
      w[56] = hc_byte_perm_S (w[10], w[11], selector);
      w[55] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[54] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[53] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[52] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[51] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[50] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[49] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[48] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[47] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[46] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[45] = hc_byte_perm_S (    0, w[ 0], selector);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_byte_perm_S (w[16], w[17], selector);
      w[62] = hc_byte_perm_S (w[15], w[16], selector);
      w[61] = hc_byte_perm_S (w[14], w[15], selector);
      w[60] = hc_byte_perm_S (w[13], w[14], selector);
      w[59] = hc_byte_perm_S (w[12], w[13], selector);
      w[58] = hc_byte_perm_S (w[11], w[12], selector);
      w[57] = hc_byte_perm_S (w[10], w[11], selector);
      w[56] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[55] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[54] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[53] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[52] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[51] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[50] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[49] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[48] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[47] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[46] = hc_byte_perm_S (    0, w[ 0], selector);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_byte_perm_S (w[15], w[16], selector);
      w[62] = hc_byte_perm_S (w[14], w[15], selector);
      w[61] = hc_byte_perm_S (w[13], w[14], selector);
      w[60] = hc_byte_perm_S (w[12], w[13], selector);
      w[59] = hc_byte_perm_S (w[11], w[12], selector);
      w[58] = hc_byte_perm_S (w[10], w[11], selector);
      w[57] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[56] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[55] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[54] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[53] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[52] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[51] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[50] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[49] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[48] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[47] = hc_byte_perm_S (    0, w[ 0], selector);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_byte_perm_S (w[14], w[15], selector);
      w[62] = hc_byte_perm_S (w[13], w[14], selector);
      w[61] = hc_byte_perm_S (w[12], w[13], selector);
      w[60] = hc_byte_perm_S (w[11], w[12], selector);
      w[59] = hc_byte_perm_S (w[10], w[11], selector);
      w[58] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[57] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[56] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[55] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[54] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[53] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[52] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[51] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[50] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[49] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[48] = hc_byte_perm_S (    0, w[ 0], selector);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_byte_perm_S (w[13], w[14], selector);
      w[62] = hc_byte_perm_S (w[12], w[13], selector);
      w[61] = hc_byte_perm_S (w[11], w[12], selector);
      w[60] = hc_byte_perm_S (w[10], w[11], selector);
      w[59] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[58] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[57] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[56] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[55] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[54] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[53] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[52] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[51] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[50] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[49] = hc_byte_perm_S (    0, w[ 0], selector);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_byte_perm_S (w[12], w[13], selector);
      w[62] = hc_byte_perm_S (w[11], w[12], selector);
      w[61] = hc_byte_perm_S (w[10], w[11], selector);
      w[60] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[59] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[58] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[57] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[56] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[55] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[54] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[53] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[52] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[51] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[50] = hc_byte_perm_S (    0, w[ 0], selector);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_byte_perm_S (w[11], w[12], selector);
      w[62] = hc_byte_perm_S (w[10], w[11], selector);
      w[61] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[60] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[59] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[58] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[57] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[56] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[55] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[54] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[53] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[52] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[51] = hc_byte_perm_S (    0, w[ 0], selector);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_byte_perm_S (w[10], w[11], selector);
      w[62] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[61] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[60] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[59] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[58] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[57] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[56] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[55] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[54] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[53] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[52] = hc_byte_perm_S (    0, w[ 0], selector);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_byte_perm_S (w[ 9], w[10], selector);
      w[62] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[61] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[60] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[59] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[58] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[57] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[56] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[55] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[54] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[53] = hc_byte_perm_S (    0, w[ 0], selector);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_byte_perm_S (w[ 8], w[ 9], selector);
      w[62] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[61] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[60] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[59] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[58] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[57] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[56] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[55] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[54] = hc_byte_perm_S (    0, w[ 0], selector);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_byte_perm_S (w[ 7], w[ 8], selector);
      w[62] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[61] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[60] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[59] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[58] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[57] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[56] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[55] = hc_byte_perm_S (    0, w[ 0], selector);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_byte_perm_S (w[ 6], w[ 7], selector);
      w[62] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[61] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[60] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[59] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[58] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[57] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[56] = hc_byte_perm_S (    0, w[ 0], selector);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_byte_perm_S (w[ 5], w[ 6], selector);
      w[62] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[61] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[60] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[59] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[58] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[57] = hc_byte_perm_S (    0, w[ 0], selector);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_byte_perm_S (w[ 4], w[ 5], selector);
      w[62] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[61] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[60] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[59] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[58] = hc_byte_perm_S (    0, w[ 0], selector);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_byte_perm_S (w[ 3], w[ 4], selector);
      w[62] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[61] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[60] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[59] = hc_byte_perm_S (    0, w[ 0], selector);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_byte_perm_S (w[ 2], w[ 3], selector);
      w[62] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[61] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[60] = hc_byte_perm_S (    0, w[ 0], selector);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_byte_perm_S (w[ 1], w[ 2], selector);
      w[62] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[61] = hc_byte_perm_S (    0, w[ 0], selector);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_byte_perm_S (w[ 0], w[ 1], selector);
      w[62] = hc_byte_perm_S (    0, w[ 0], selector);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_byte_perm_S (    0, w[ 0], selector);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w[63] = hc_bytealign_S (w[62], w[63], offset);
      w[62] = hc_bytealign_S (w[61], w[62], offset);
      w[61] = hc_bytealign_S (w[60], w[61], offset);
      w[60] = hc_bytealign_S (w[59], w[60], offset);
      w[59] = hc_bytealign_S (w[58], w[59], offset);
      w[58] = hc_bytealign_S (w[57], w[58], offset);
      w[57] = hc_bytealign_S (w[56], w[57], offset);
      w[56] = hc_bytealign_S (w[55], w[56], offset);
      w[55] = hc_bytealign_S (w[54], w[55], offset);
      w[54] = hc_bytealign_S (w[53], w[54], offset);
      w[53] = hc_bytealign_S (w[52], w[53], offset);
      w[52] = hc_bytealign_S (w[51], w[52], offset);
      w[51] = hc_bytealign_S (w[50], w[51], offset);
      w[50] = hc_bytealign_S (w[49], w[50], offset);
      w[49] = hc_bytealign_S (w[48], w[49], offset);
      w[48] = hc_bytealign_S (w[47], w[48], offset);
      w[47] = hc_bytealign_S (w[46], w[47], offset);
      w[46] = hc_bytealign_S (w[45], w[46], offset);
      w[45] = hc_bytealign_S (w[44], w[45], offset);
      w[44] = hc_bytealign_S (w[43], w[44], offset);
      w[43] = hc_bytealign_S (w[42], w[43], offset);
      w[42] = hc_bytealign_S (w[41], w[42], offset);
      w[41] = hc_bytealign_S (w[40], w[41], offset);
      w[40] = hc_bytealign_S (w[39], w[40], offset);
      w[39] = hc_bytealign_S (w[38], w[39], offset);
      w[38] = hc_bytealign_S (w[37], w[38], offset);
      w[37] = hc_bytealign_S (w[36], w[37], offset);
      w[36] = hc_bytealign_S (w[35], w[36], offset);
      w[35] = hc_bytealign_S (w[34], w[35], offset);
      w[34] = hc_bytealign_S (w[33], w[34], offset);
      w[33] = hc_bytealign_S (w[32], w[33], offset);
      w[32] = hc_bytealign_S (w[31], w[32], offset);
      w[31] = hc_bytealign_S (w[30], w[31], offset);
      w[30] = hc_bytealign_S (w[29], w[30], offset);
      w[29] = hc_bytealign_S (w[28], w[29], offset);
      w[28] = hc_bytealign_S (w[27], w[28], offset);
      w[27] = hc_bytealign_S (w[26], w[27], offset);
      w[26] = hc_bytealign_S (w[25], w[26], offset);
      w[25] = hc_bytealign_S (w[24], w[25], offset);
      w[24] = hc_bytealign_S (w[23], w[24], offset);
      w[23] = hc_bytealign_S (w[22], w[23], offset);
      w[22] = hc_bytealign_S (w[21], w[22], offset);
      w[21] = hc_bytealign_S (w[20], w[21], offset);
      w[20] = hc_bytealign_S (w[19], w[20], offset);
      w[19] = hc_bytealign_S (w[18], w[19], offset);
      w[18] = hc_bytealign_S (w[17], w[18], offset);
      w[17] = hc_bytealign_S (w[16], w[17], offset);
      w[16] = hc_bytealign_S (w[15], w[16], offset);
      w[15] = hc_bytealign_S (w[14], w[15], offset);
      w[14] = hc_bytealign_S (w[13], w[14], offset);
      w[13] = hc_bytealign_S (w[12], w[13], offset);
      w[12] = hc_bytealign_S (w[11], w[12], offset);
      w[11] = hc_bytealign_S (w[10], w[11], offset);
      w[10] = hc_bytealign_S (w[ 9], w[10], offset);
      w[ 9] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[ 8] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[ 7] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 6] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 5] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 4] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 3] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 2] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 1] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 0] = hc_bytealign_S (    0, w[ 0], offset);

      break;

    case  1:
      w[63] = hc_bytealign_S (w[61], w[62], offset);
      w[62] = hc_bytealign_S (w[60], w[61], offset);
      w[61] = hc_bytealign_S (w[59], w[60], offset);
      w[60] = hc_bytealign_S (w[58], w[59], offset);
      w[59] = hc_bytealign_S (w[57], w[58], offset);
      w[58] = hc_bytealign_S (w[56], w[57], offset);
      w[57] = hc_bytealign_S (w[55], w[56], offset);
      w[56] = hc_bytealign_S (w[54], w[55], offset);
      w[55] = hc_bytealign_S (w[53], w[54], offset);
      w[54] = hc_bytealign_S (w[52], w[53], offset);
      w[53] = hc_bytealign_S (w[51], w[52], offset);
      w[52] = hc_bytealign_S (w[50], w[51], offset);
      w[51] = hc_bytealign_S (w[49], w[50], offset);
      w[50] = hc_bytealign_S (w[48], w[49], offset);
      w[49] = hc_bytealign_S (w[47], w[48], offset);
      w[48] = hc_bytealign_S (w[46], w[47], offset);
      w[47] = hc_bytealign_S (w[45], w[46], offset);
      w[46] = hc_bytealign_S (w[44], w[45], offset);
      w[45] = hc_bytealign_S (w[43], w[44], offset);
      w[44] = hc_bytealign_S (w[42], w[43], offset);
      w[43] = hc_bytealign_S (w[41], w[42], offset);
      w[42] = hc_bytealign_S (w[40], w[41], offset);
      w[41] = hc_bytealign_S (w[39], w[40], offset);
      w[40] = hc_bytealign_S (w[38], w[39], offset);
      w[39] = hc_bytealign_S (w[37], w[38], offset);
      w[38] = hc_bytealign_S (w[36], w[37], offset);
      w[37] = hc_bytealign_S (w[35], w[36], offset);
      w[36] = hc_bytealign_S (w[34], w[35], offset);
      w[35] = hc_bytealign_S (w[33], w[34], offset);
      w[34] = hc_bytealign_S (w[32], w[33], offset);
      w[33] = hc_bytealign_S (w[31], w[32], offset);
      w[32] = hc_bytealign_S (w[30], w[31], offset);
      w[31] = hc_bytealign_S (w[29], w[30], offset);
      w[30] = hc_bytealign_S (w[28], w[29], offset);
      w[29] = hc_bytealign_S (w[27], w[28], offset);
      w[28] = hc_bytealign_S (w[26], w[27], offset);
      w[27] = hc_bytealign_S (w[25], w[26], offset);
      w[26] = hc_bytealign_S (w[24], w[25], offset);
      w[25] = hc_bytealign_S (w[23], w[24], offset);
      w[24] = hc_bytealign_S (w[22], w[23], offset);
      w[23] = hc_bytealign_S (w[21], w[22], offset);
      w[22] = hc_bytealign_S (w[20], w[21], offset);
      w[21] = hc_bytealign_S (w[19], w[20], offset);
      w[20] = hc_bytealign_S (w[18], w[19], offset);
      w[19] = hc_bytealign_S (w[17], w[18], offset);
      w[18] = hc_bytealign_S (w[16], w[17], offset);
      w[17] = hc_bytealign_S (w[15], w[16], offset);
      w[16] = hc_bytealign_S (w[14], w[15], offset);
      w[15] = hc_bytealign_S (w[13], w[14], offset);
      w[14] = hc_bytealign_S (w[12], w[13], offset);
      w[13] = hc_bytealign_S (w[11], w[12], offset);
      w[12] = hc_bytealign_S (w[10], w[11], offset);
      w[11] = hc_bytealign_S (w[ 9], w[10], offset);
      w[10] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[ 9] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[ 8] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 7] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 6] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 5] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 4] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 3] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 2] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 1] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_bytealign_S (w[60], w[61], offset);
      w[62] = hc_bytealign_S (w[59], w[60], offset);
      w[61] = hc_bytealign_S (w[58], w[59], offset);
      w[60] = hc_bytealign_S (w[57], w[58], offset);
      w[59] = hc_bytealign_S (w[56], w[57], offset);
      w[58] = hc_bytealign_S (w[55], w[56], offset);
      w[57] = hc_bytealign_S (w[54], w[55], offset);
      w[56] = hc_bytealign_S (w[53], w[54], offset);
      w[55] = hc_bytealign_S (w[52], w[53], offset);
      w[54] = hc_bytealign_S (w[51], w[52], offset);
      w[53] = hc_bytealign_S (w[50], w[51], offset);
      w[52] = hc_bytealign_S (w[49], w[50], offset);
      w[51] = hc_bytealign_S (w[48], w[49], offset);
      w[50] = hc_bytealign_S (w[47], w[48], offset);
      w[49] = hc_bytealign_S (w[46], w[47], offset);
      w[48] = hc_bytealign_S (w[45], w[46], offset);
      w[47] = hc_bytealign_S (w[44], w[45], offset);
      w[46] = hc_bytealign_S (w[43], w[44], offset);
      w[45] = hc_bytealign_S (w[42], w[43], offset);
      w[44] = hc_bytealign_S (w[41], w[42], offset);
      w[43] = hc_bytealign_S (w[40], w[41], offset);
      w[42] = hc_bytealign_S (w[39], w[40], offset);
      w[41] = hc_bytealign_S (w[38], w[39], offset);
      w[40] = hc_bytealign_S (w[37], w[38], offset);
      w[39] = hc_bytealign_S (w[36], w[37], offset);
      w[38] = hc_bytealign_S (w[35], w[36], offset);
      w[37] = hc_bytealign_S (w[34], w[35], offset);
      w[36] = hc_bytealign_S (w[33], w[34], offset);
      w[35] = hc_bytealign_S (w[32], w[33], offset);
      w[34] = hc_bytealign_S (w[31], w[32], offset);
      w[33] = hc_bytealign_S (w[30], w[31], offset);
      w[32] = hc_bytealign_S (w[29], w[30], offset);
      w[31] = hc_bytealign_S (w[28], w[29], offset);
      w[30] = hc_bytealign_S (w[27], w[28], offset);
      w[29] = hc_bytealign_S (w[26], w[27], offset);
      w[28] = hc_bytealign_S (w[25], w[26], offset);
      w[27] = hc_bytealign_S (w[24], w[25], offset);
      w[26] = hc_bytealign_S (w[23], w[24], offset);
      w[25] = hc_bytealign_S (w[22], w[23], offset);
      w[24] = hc_bytealign_S (w[21], w[22], offset);
      w[23] = hc_bytealign_S (w[20], w[21], offset);
      w[22] = hc_bytealign_S (w[19], w[20], offset);
      w[21] = hc_bytealign_S (w[18], w[19], offset);
      w[20] = hc_bytealign_S (w[17], w[18], offset);
      w[19] = hc_bytealign_S (w[16], w[17], offset);
      w[18] = hc_bytealign_S (w[15], w[16], offset);
      w[17] = hc_bytealign_S (w[14], w[15], offset);
      w[16] = hc_bytealign_S (w[13], w[14], offset);
      w[15] = hc_bytealign_S (w[12], w[13], offset);
      w[14] = hc_bytealign_S (w[11], w[12], offset);
      w[13] = hc_bytealign_S (w[10], w[11], offset);
      w[12] = hc_bytealign_S (w[ 9], w[10], offset);
      w[11] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[10] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[ 9] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 8] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 7] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 6] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 5] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 4] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 3] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 2] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_bytealign_S (w[59], w[60], offset);
      w[62] = hc_bytealign_S (w[58], w[59], offset);
      w[61] = hc_bytealign_S (w[57], w[58], offset);
      w[60] = hc_bytealign_S (w[56], w[57], offset);
      w[59] = hc_bytealign_S (w[55], w[56], offset);
      w[58] = hc_bytealign_S (w[54], w[55], offset);
      w[57] = hc_bytealign_S (w[53], w[54], offset);
      w[56] = hc_bytealign_S (w[52], w[53], offset);
      w[55] = hc_bytealign_S (w[51], w[52], offset);
      w[54] = hc_bytealign_S (w[50], w[51], offset);
      w[53] = hc_bytealign_S (w[49], w[50], offset);
      w[52] = hc_bytealign_S (w[48], w[49], offset);
      w[51] = hc_bytealign_S (w[47], w[48], offset);
      w[50] = hc_bytealign_S (w[46], w[47], offset);
      w[49] = hc_bytealign_S (w[45], w[46], offset);
      w[48] = hc_bytealign_S (w[44], w[45], offset);
      w[47] = hc_bytealign_S (w[43], w[44], offset);
      w[46] = hc_bytealign_S (w[42], w[43], offset);
      w[45] = hc_bytealign_S (w[41], w[42], offset);
      w[44] = hc_bytealign_S (w[40], w[41], offset);
      w[43] = hc_bytealign_S (w[39], w[40], offset);
      w[42] = hc_bytealign_S (w[38], w[39], offset);
      w[41] = hc_bytealign_S (w[37], w[38], offset);
      w[40] = hc_bytealign_S (w[36], w[37], offset);
      w[39] = hc_bytealign_S (w[35], w[36], offset);
      w[38] = hc_bytealign_S (w[34], w[35], offset);
      w[37] = hc_bytealign_S (w[33], w[34], offset);
      w[36] = hc_bytealign_S (w[32], w[33], offset);
      w[35] = hc_bytealign_S (w[31], w[32], offset);
      w[34] = hc_bytealign_S (w[30], w[31], offset);
      w[33] = hc_bytealign_S (w[29], w[30], offset);
      w[32] = hc_bytealign_S (w[28], w[29], offset);
      w[31] = hc_bytealign_S (w[27], w[28], offset);
      w[30] = hc_bytealign_S (w[26], w[27], offset);
      w[29] = hc_bytealign_S (w[25], w[26], offset);
      w[28] = hc_bytealign_S (w[24], w[25], offset);
      w[27] = hc_bytealign_S (w[23], w[24], offset);
      w[26] = hc_bytealign_S (w[22], w[23], offset);
      w[25] = hc_bytealign_S (w[21], w[22], offset);
      w[24] = hc_bytealign_S (w[20], w[21], offset);
      w[23] = hc_bytealign_S (w[19], w[20], offset);
      w[22] = hc_bytealign_S (w[18], w[19], offset);
      w[21] = hc_bytealign_S (w[17], w[18], offset);
      w[20] = hc_bytealign_S (w[16], w[17], offset);
      w[19] = hc_bytealign_S (w[15], w[16], offset);
      w[18] = hc_bytealign_S (w[14], w[15], offset);
      w[17] = hc_bytealign_S (w[13], w[14], offset);
      w[16] = hc_bytealign_S (w[12], w[13], offset);
      w[15] = hc_bytealign_S (w[11], w[12], offset);
      w[14] = hc_bytealign_S (w[10], w[11], offset);
      w[13] = hc_bytealign_S (w[ 9], w[10], offset);
      w[12] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[11] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[10] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[ 9] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 8] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 7] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 6] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 5] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 4] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 3] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_bytealign_S (w[58], w[59], offset);
      w[62] = hc_bytealign_S (w[57], w[58], offset);
      w[61] = hc_bytealign_S (w[56], w[57], offset);
      w[60] = hc_bytealign_S (w[55], w[56], offset);
      w[59] = hc_bytealign_S (w[54], w[55], offset);
      w[58] = hc_bytealign_S (w[53], w[54], offset);
      w[57] = hc_bytealign_S (w[52], w[53], offset);
      w[56] = hc_bytealign_S (w[51], w[52], offset);
      w[55] = hc_bytealign_S (w[50], w[51], offset);
      w[54] = hc_bytealign_S (w[49], w[50], offset);
      w[53] = hc_bytealign_S (w[48], w[49], offset);
      w[52] = hc_bytealign_S (w[47], w[48], offset);
      w[51] = hc_bytealign_S (w[46], w[47], offset);
      w[50] = hc_bytealign_S (w[45], w[46], offset);
      w[49] = hc_bytealign_S (w[44], w[45], offset);
      w[48] = hc_bytealign_S (w[43], w[44], offset);
      w[47] = hc_bytealign_S (w[42], w[43], offset);
      w[46] = hc_bytealign_S (w[41], w[42], offset);
      w[45] = hc_bytealign_S (w[40], w[41], offset);
      w[44] = hc_bytealign_S (w[39], w[40], offset);
      w[43] = hc_bytealign_S (w[38], w[39], offset);
      w[42] = hc_bytealign_S (w[37], w[38], offset);
      w[41] = hc_bytealign_S (w[36], w[37], offset);
      w[40] = hc_bytealign_S (w[35], w[36], offset);
      w[39] = hc_bytealign_S (w[34], w[35], offset);
      w[38] = hc_bytealign_S (w[33], w[34], offset);
      w[37] = hc_bytealign_S (w[32], w[33], offset);
      w[36] = hc_bytealign_S (w[31], w[32], offset);
      w[35] = hc_bytealign_S (w[30], w[31], offset);
      w[34] = hc_bytealign_S (w[29], w[30], offset);
      w[33] = hc_bytealign_S (w[28], w[29], offset);
      w[32] = hc_bytealign_S (w[27], w[28], offset);
      w[31] = hc_bytealign_S (w[26], w[27], offset);
      w[30] = hc_bytealign_S (w[25], w[26], offset);
      w[29] = hc_bytealign_S (w[24], w[25], offset);
      w[28] = hc_bytealign_S (w[23], w[24], offset);
      w[27] = hc_bytealign_S (w[22], w[23], offset);
      w[26] = hc_bytealign_S (w[21], w[22], offset);
      w[25] = hc_bytealign_S (w[20], w[21], offset);
      w[24] = hc_bytealign_S (w[19], w[20], offset);
      w[23] = hc_bytealign_S (w[18], w[19], offset);
      w[22] = hc_bytealign_S (w[17], w[18], offset);
      w[21] = hc_bytealign_S (w[16], w[17], offset);
      w[20] = hc_bytealign_S (w[15], w[16], offset);
      w[19] = hc_bytealign_S (w[14], w[15], offset);
      w[18] = hc_bytealign_S (w[13], w[14], offset);
      w[17] = hc_bytealign_S (w[12], w[13], offset);
      w[16] = hc_bytealign_S (w[11], w[12], offset);
      w[15] = hc_bytealign_S (w[10], w[11], offset);
      w[14] = hc_bytealign_S (w[ 9], w[10], offset);
      w[13] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[12] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[11] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[10] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[ 9] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 8] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 7] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 6] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 5] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 4] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_bytealign_S (w[57], w[58], offset);
      w[62] = hc_bytealign_S (w[56], w[57], offset);
      w[61] = hc_bytealign_S (w[55], w[56], offset);
      w[60] = hc_bytealign_S (w[54], w[55], offset);
      w[59] = hc_bytealign_S (w[53], w[54], offset);
      w[58] = hc_bytealign_S (w[52], w[53], offset);
      w[57] = hc_bytealign_S (w[51], w[52], offset);
      w[56] = hc_bytealign_S (w[50], w[51], offset);
      w[55] = hc_bytealign_S (w[49], w[50], offset);
      w[54] = hc_bytealign_S (w[48], w[49], offset);
      w[53] = hc_bytealign_S (w[47], w[48], offset);
      w[52] = hc_bytealign_S (w[46], w[47], offset);
      w[51] = hc_bytealign_S (w[45], w[46], offset);
      w[50] = hc_bytealign_S (w[44], w[45], offset);
      w[49] = hc_bytealign_S (w[43], w[44], offset);
      w[48] = hc_bytealign_S (w[42], w[43], offset);
      w[47] = hc_bytealign_S (w[41], w[42], offset);
      w[46] = hc_bytealign_S (w[40], w[41], offset);
      w[45] = hc_bytealign_S (w[39], w[40], offset);
      w[44] = hc_bytealign_S (w[38], w[39], offset);
      w[43] = hc_bytealign_S (w[37], w[38], offset);
      w[42] = hc_bytealign_S (w[36], w[37], offset);
      w[41] = hc_bytealign_S (w[35], w[36], offset);
      w[40] = hc_bytealign_S (w[34], w[35], offset);
      w[39] = hc_bytealign_S (w[33], w[34], offset);
      w[38] = hc_bytealign_S (w[32], w[33], offset);
      w[37] = hc_bytealign_S (w[31], w[32], offset);
      w[36] = hc_bytealign_S (w[30], w[31], offset);
      w[35] = hc_bytealign_S (w[29], w[30], offset);
      w[34] = hc_bytealign_S (w[28], w[29], offset);
      w[33] = hc_bytealign_S (w[27], w[28], offset);
      w[32] = hc_bytealign_S (w[26], w[27], offset);
      w[31] = hc_bytealign_S (w[25], w[26], offset);
      w[30] = hc_bytealign_S (w[24], w[25], offset);
      w[29] = hc_bytealign_S (w[23], w[24], offset);
      w[28] = hc_bytealign_S (w[22], w[23], offset);
      w[27] = hc_bytealign_S (w[21], w[22], offset);
      w[26] = hc_bytealign_S (w[20], w[21], offset);
      w[25] = hc_bytealign_S (w[19], w[20], offset);
      w[24] = hc_bytealign_S (w[18], w[19], offset);
      w[23] = hc_bytealign_S (w[17], w[18], offset);
      w[22] = hc_bytealign_S (w[16], w[17], offset);
      w[21] = hc_bytealign_S (w[15], w[16], offset);
      w[20] = hc_bytealign_S (w[14], w[15], offset);
      w[19] = hc_bytealign_S (w[13], w[14], offset);
      w[18] = hc_bytealign_S (w[12], w[13], offset);
      w[17] = hc_bytealign_S (w[11], w[12], offset);
      w[16] = hc_bytealign_S (w[10], w[11], offset);
      w[15] = hc_bytealign_S (w[ 9], w[10], offset);
      w[14] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[13] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[12] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[11] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[10] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[ 9] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 8] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 7] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 6] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 5] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_bytealign_S (w[56], w[57], offset);
      w[62] = hc_bytealign_S (w[55], w[56], offset);
      w[61] = hc_bytealign_S (w[54], w[55], offset);
      w[60] = hc_bytealign_S (w[53], w[54], offset);
      w[59] = hc_bytealign_S (w[52], w[53], offset);
      w[58] = hc_bytealign_S (w[51], w[52], offset);
      w[57] = hc_bytealign_S (w[50], w[51], offset);
      w[56] = hc_bytealign_S (w[49], w[50], offset);
      w[55] = hc_bytealign_S (w[48], w[49], offset);
      w[54] = hc_bytealign_S (w[47], w[48], offset);
      w[53] = hc_bytealign_S (w[46], w[47], offset);
      w[52] = hc_bytealign_S (w[45], w[46], offset);
      w[51] = hc_bytealign_S (w[44], w[45], offset);
      w[50] = hc_bytealign_S (w[43], w[44], offset);
      w[49] = hc_bytealign_S (w[42], w[43], offset);
      w[48] = hc_bytealign_S (w[41], w[42], offset);
      w[47] = hc_bytealign_S (w[40], w[41], offset);
      w[46] = hc_bytealign_S (w[39], w[40], offset);
      w[45] = hc_bytealign_S (w[38], w[39], offset);
      w[44] = hc_bytealign_S (w[37], w[38], offset);
      w[43] = hc_bytealign_S (w[36], w[37], offset);
      w[42] = hc_bytealign_S (w[35], w[36], offset);
      w[41] = hc_bytealign_S (w[34], w[35], offset);
      w[40] = hc_bytealign_S (w[33], w[34], offset);
      w[39] = hc_bytealign_S (w[32], w[33], offset);
      w[38] = hc_bytealign_S (w[31], w[32], offset);
      w[37] = hc_bytealign_S (w[30], w[31], offset);
      w[36] = hc_bytealign_S (w[29], w[30], offset);
      w[35] = hc_bytealign_S (w[28], w[29], offset);
      w[34] = hc_bytealign_S (w[27], w[28], offset);
      w[33] = hc_bytealign_S (w[26], w[27], offset);
      w[32] = hc_bytealign_S (w[25], w[26], offset);
      w[31] = hc_bytealign_S (w[24], w[25], offset);
      w[30] = hc_bytealign_S (w[23], w[24], offset);
      w[29] = hc_bytealign_S (w[22], w[23], offset);
      w[28] = hc_bytealign_S (w[21], w[22], offset);
      w[27] = hc_bytealign_S (w[20], w[21], offset);
      w[26] = hc_bytealign_S (w[19], w[20], offset);
      w[25] = hc_bytealign_S (w[18], w[19], offset);
      w[24] = hc_bytealign_S (w[17], w[18], offset);
      w[23] = hc_bytealign_S (w[16], w[17], offset);
      w[22] = hc_bytealign_S (w[15], w[16], offset);
      w[21] = hc_bytealign_S (w[14], w[15], offset);
      w[20] = hc_bytealign_S (w[13], w[14], offset);
      w[19] = hc_bytealign_S (w[12], w[13], offset);
      w[18] = hc_bytealign_S (w[11], w[12], offset);
      w[17] = hc_bytealign_S (w[10], w[11], offset);
      w[16] = hc_bytealign_S (w[ 9], w[10], offset);
      w[15] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[14] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[13] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[12] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[11] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[10] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[ 9] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 8] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 7] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 6] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_bytealign_S (w[55], w[56], offset);
      w[62] = hc_bytealign_S (w[54], w[55], offset);
      w[61] = hc_bytealign_S (w[53], w[54], offset);
      w[60] = hc_bytealign_S (w[52], w[53], offset);
      w[59] = hc_bytealign_S (w[51], w[52], offset);
      w[58] = hc_bytealign_S (w[50], w[51], offset);
      w[57] = hc_bytealign_S (w[49], w[50], offset);
      w[56] = hc_bytealign_S (w[48], w[49], offset);
      w[55] = hc_bytealign_S (w[47], w[48], offset);
      w[54] = hc_bytealign_S (w[46], w[47], offset);
      w[53] = hc_bytealign_S (w[45], w[46], offset);
      w[52] = hc_bytealign_S (w[44], w[45], offset);
      w[51] = hc_bytealign_S (w[43], w[44], offset);
      w[50] = hc_bytealign_S (w[42], w[43], offset);
      w[49] = hc_bytealign_S (w[41], w[42], offset);
      w[48] = hc_bytealign_S (w[40], w[41], offset);
      w[47] = hc_bytealign_S (w[39], w[40], offset);
      w[46] = hc_bytealign_S (w[38], w[39], offset);
      w[45] = hc_bytealign_S (w[37], w[38], offset);
      w[44] = hc_bytealign_S (w[36], w[37], offset);
      w[43] = hc_bytealign_S (w[35], w[36], offset);
      w[42] = hc_bytealign_S (w[34], w[35], offset);
      w[41] = hc_bytealign_S (w[33], w[34], offset);
      w[40] = hc_bytealign_S (w[32], w[33], offset);
      w[39] = hc_bytealign_S (w[31], w[32], offset);
      w[38] = hc_bytealign_S (w[30], w[31], offset);
      w[37] = hc_bytealign_S (w[29], w[30], offset);
      w[36] = hc_bytealign_S (w[28], w[29], offset);
      w[35] = hc_bytealign_S (w[27], w[28], offset);
      w[34] = hc_bytealign_S (w[26], w[27], offset);
      w[33] = hc_bytealign_S (w[25], w[26], offset);
      w[32] = hc_bytealign_S (w[24], w[25], offset);
      w[31] = hc_bytealign_S (w[23], w[24], offset);
      w[30] = hc_bytealign_S (w[22], w[23], offset);
      w[29] = hc_bytealign_S (w[21], w[22], offset);
      w[28] = hc_bytealign_S (w[20], w[21], offset);
      w[27] = hc_bytealign_S (w[19], w[20], offset);
      w[26] = hc_bytealign_S (w[18], w[19], offset);
      w[25] = hc_bytealign_S (w[17], w[18], offset);
      w[24] = hc_bytealign_S (w[16], w[17], offset);
      w[23] = hc_bytealign_S (w[15], w[16], offset);
      w[22] = hc_bytealign_S (w[14], w[15], offset);
      w[21] = hc_bytealign_S (w[13], w[14], offset);
      w[20] = hc_bytealign_S (w[12], w[13], offset);
      w[19] = hc_bytealign_S (w[11], w[12], offset);
      w[18] = hc_bytealign_S (w[10], w[11], offset);
      w[17] = hc_bytealign_S (w[ 9], w[10], offset);
      w[16] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[15] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[14] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[13] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[12] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[11] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[10] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[ 9] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 8] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 7] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_bytealign_S (w[54], w[55], offset);
      w[62] = hc_bytealign_S (w[53], w[54], offset);
      w[61] = hc_bytealign_S (w[52], w[53], offset);
      w[60] = hc_bytealign_S (w[51], w[52], offset);
      w[59] = hc_bytealign_S (w[50], w[51], offset);
      w[58] = hc_bytealign_S (w[49], w[50], offset);
      w[57] = hc_bytealign_S (w[48], w[49], offset);
      w[56] = hc_bytealign_S (w[47], w[48], offset);
      w[55] = hc_bytealign_S (w[46], w[47], offset);
      w[54] = hc_bytealign_S (w[45], w[46], offset);
      w[53] = hc_bytealign_S (w[44], w[45], offset);
      w[52] = hc_bytealign_S (w[43], w[44], offset);
      w[51] = hc_bytealign_S (w[42], w[43], offset);
      w[50] = hc_bytealign_S (w[41], w[42], offset);
      w[49] = hc_bytealign_S (w[40], w[41], offset);
      w[48] = hc_bytealign_S (w[39], w[40], offset);
      w[47] = hc_bytealign_S (w[38], w[39], offset);
      w[46] = hc_bytealign_S (w[37], w[38], offset);
      w[45] = hc_bytealign_S (w[36], w[37], offset);
      w[44] = hc_bytealign_S (w[35], w[36], offset);
      w[43] = hc_bytealign_S (w[34], w[35], offset);
      w[42] = hc_bytealign_S (w[33], w[34], offset);
      w[41] = hc_bytealign_S (w[32], w[33], offset);
      w[40] = hc_bytealign_S (w[31], w[32], offset);
      w[39] = hc_bytealign_S (w[30], w[31], offset);
      w[38] = hc_bytealign_S (w[29], w[30], offset);
      w[37] = hc_bytealign_S (w[28], w[29], offset);
      w[36] = hc_bytealign_S (w[27], w[28], offset);
      w[35] = hc_bytealign_S (w[26], w[27], offset);
      w[34] = hc_bytealign_S (w[25], w[26], offset);
      w[33] = hc_bytealign_S (w[24], w[25], offset);
      w[32] = hc_bytealign_S (w[23], w[24], offset);
      w[31] = hc_bytealign_S (w[22], w[23], offset);
      w[30] = hc_bytealign_S (w[21], w[22], offset);
      w[29] = hc_bytealign_S (w[20], w[21], offset);
      w[28] = hc_bytealign_S (w[19], w[20], offset);
      w[27] = hc_bytealign_S (w[18], w[19], offset);
      w[26] = hc_bytealign_S (w[17], w[18], offset);
      w[25] = hc_bytealign_S (w[16], w[17], offset);
      w[24] = hc_bytealign_S (w[15], w[16], offset);
      w[23] = hc_bytealign_S (w[14], w[15], offset);
      w[22] = hc_bytealign_S (w[13], w[14], offset);
      w[21] = hc_bytealign_S (w[12], w[13], offset);
      w[20] = hc_bytealign_S (w[11], w[12], offset);
      w[19] = hc_bytealign_S (w[10], w[11], offset);
      w[18] = hc_bytealign_S (w[ 9], w[10], offset);
      w[17] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[16] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[15] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[14] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[13] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[12] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[11] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[10] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[ 9] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 8] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_bytealign_S (w[53], w[54], offset);
      w[62] = hc_bytealign_S (w[52], w[53], offset);
      w[61] = hc_bytealign_S (w[51], w[52], offset);
      w[60] = hc_bytealign_S (w[50], w[51], offset);
      w[59] = hc_bytealign_S (w[49], w[50], offset);
      w[58] = hc_bytealign_S (w[48], w[49], offset);
      w[57] = hc_bytealign_S (w[47], w[48], offset);
      w[56] = hc_bytealign_S (w[46], w[47], offset);
      w[55] = hc_bytealign_S (w[45], w[46], offset);
      w[54] = hc_bytealign_S (w[44], w[45], offset);
      w[53] = hc_bytealign_S (w[43], w[44], offset);
      w[52] = hc_bytealign_S (w[42], w[43], offset);
      w[51] = hc_bytealign_S (w[41], w[42], offset);
      w[50] = hc_bytealign_S (w[40], w[41], offset);
      w[49] = hc_bytealign_S (w[39], w[40], offset);
      w[48] = hc_bytealign_S (w[38], w[39], offset);
      w[47] = hc_bytealign_S (w[37], w[38], offset);
      w[46] = hc_bytealign_S (w[36], w[37], offset);
      w[45] = hc_bytealign_S (w[35], w[36], offset);
      w[44] = hc_bytealign_S (w[34], w[35], offset);
      w[43] = hc_bytealign_S (w[33], w[34], offset);
      w[42] = hc_bytealign_S (w[32], w[33], offset);
      w[41] = hc_bytealign_S (w[31], w[32], offset);
      w[40] = hc_bytealign_S (w[30], w[31], offset);
      w[39] = hc_bytealign_S (w[29], w[30], offset);
      w[38] = hc_bytealign_S (w[28], w[29], offset);
      w[37] = hc_bytealign_S (w[27], w[28], offset);
      w[36] = hc_bytealign_S (w[26], w[27], offset);
      w[35] = hc_bytealign_S (w[25], w[26], offset);
      w[34] = hc_bytealign_S (w[24], w[25], offset);
      w[33] = hc_bytealign_S (w[23], w[24], offset);
      w[32] = hc_bytealign_S (w[22], w[23], offset);
      w[31] = hc_bytealign_S (w[21], w[22], offset);
      w[30] = hc_bytealign_S (w[20], w[21], offset);
      w[29] = hc_bytealign_S (w[19], w[20], offset);
      w[28] = hc_bytealign_S (w[18], w[19], offset);
      w[27] = hc_bytealign_S (w[17], w[18], offset);
      w[26] = hc_bytealign_S (w[16], w[17], offset);
      w[25] = hc_bytealign_S (w[15], w[16], offset);
      w[24] = hc_bytealign_S (w[14], w[15], offset);
      w[23] = hc_bytealign_S (w[13], w[14], offset);
      w[22] = hc_bytealign_S (w[12], w[13], offset);
      w[21] = hc_bytealign_S (w[11], w[12], offset);
      w[20] = hc_bytealign_S (w[10], w[11], offset);
      w[19] = hc_bytealign_S (w[ 9], w[10], offset);
      w[18] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[17] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[16] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[15] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[14] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[13] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[12] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[11] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[10] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[ 9] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_bytealign_S (w[52], w[53], offset);
      w[62] = hc_bytealign_S (w[51], w[52], offset);
      w[61] = hc_bytealign_S (w[50], w[51], offset);
      w[60] = hc_bytealign_S (w[49], w[50], offset);
      w[59] = hc_bytealign_S (w[48], w[49], offset);
      w[58] = hc_bytealign_S (w[47], w[48], offset);
      w[57] = hc_bytealign_S (w[46], w[47], offset);
      w[56] = hc_bytealign_S (w[45], w[46], offset);
      w[55] = hc_bytealign_S (w[44], w[45], offset);
      w[54] = hc_bytealign_S (w[43], w[44], offset);
      w[53] = hc_bytealign_S (w[42], w[43], offset);
      w[52] = hc_bytealign_S (w[41], w[42], offset);
      w[51] = hc_bytealign_S (w[40], w[41], offset);
      w[50] = hc_bytealign_S (w[39], w[40], offset);
      w[49] = hc_bytealign_S (w[38], w[39], offset);
      w[48] = hc_bytealign_S (w[37], w[38], offset);
      w[47] = hc_bytealign_S (w[36], w[37], offset);
      w[46] = hc_bytealign_S (w[35], w[36], offset);
      w[45] = hc_bytealign_S (w[34], w[35], offset);
      w[44] = hc_bytealign_S (w[33], w[34], offset);
      w[43] = hc_bytealign_S (w[32], w[33], offset);
      w[42] = hc_bytealign_S (w[31], w[32], offset);
      w[41] = hc_bytealign_S (w[30], w[31], offset);
      w[40] = hc_bytealign_S (w[29], w[30], offset);
      w[39] = hc_bytealign_S (w[28], w[29], offset);
      w[38] = hc_bytealign_S (w[27], w[28], offset);
      w[37] = hc_bytealign_S (w[26], w[27], offset);
      w[36] = hc_bytealign_S (w[25], w[26], offset);
      w[35] = hc_bytealign_S (w[24], w[25], offset);
      w[34] = hc_bytealign_S (w[23], w[24], offset);
      w[33] = hc_bytealign_S (w[22], w[23], offset);
      w[32] = hc_bytealign_S (w[21], w[22], offset);
      w[31] = hc_bytealign_S (w[20], w[21], offset);
      w[30] = hc_bytealign_S (w[19], w[20], offset);
      w[29] = hc_bytealign_S (w[18], w[19], offset);
      w[28] = hc_bytealign_S (w[17], w[18], offset);
      w[27] = hc_bytealign_S (w[16], w[17], offset);
      w[26] = hc_bytealign_S (w[15], w[16], offset);
      w[25] = hc_bytealign_S (w[14], w[15], offset);
      w[24] = hc_bytealign_S (w[13], w[14], offset);
      w[23] = hc_bytealign_S (w[12], w[13], offset);
      w[22] = hc_bytealign_S (w[11], w[12], offset);
      w[21] = hc_bytealign_S (w[10], w[11], offset);
      w[20] = hc_bytealign_S (w[ 9], w[10], offset);
      w[19] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[18] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[17] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[16] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[15] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[14] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[13] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[12] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[11] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[10] = hc_bytealign_S (    0, w[ 0], offset);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_bytealign_S (w[51], w[52], offset);
      w[62] = hc_bytealign_S (w[50], w[51], offset);
      w[61] = hc_bytealign_S (w[49], w[50], offset);
      w[60] = hc_bytealign_S (w[48], w[49], offset);
      w[59] = hc_bytealign_S (w[47], w[48], offset);
      w[58] = hc_bytealign_S (w[46], w[47], offset);
      w[57] = hc_bytealign_S (w[45], w[46], offset);
      w[56] = hc_bytealign_S (w[44], w[45], offset);
      w[55] = hc_bytealign_S (w[43], w[44], offset);
      w[54] = hc_bytealign_S (w[42], w[43], offset);
      w[53] = hc_bytealign_S (w[41], w[42], offset);
      w[52] = hc_bytealign_S (w[40], w[41], offset);
      w[51] = hc_bytealign_S (w[39], w[40], offset);
      w[50] = hc_bytealign_S (w[38], w[39], offset);
      w[49] = hc_bytealign_S (w[37], w[38], offset);
      w[48] = hc_bytealign_S (w[36], w[37], offset);
      w[47] = hc_bytealign_S (w[35], w[36], offset);
      w[46] = hc_bytealign_S (w[34], w[35], offset);
      w[45] = hc_bytealign_S (w[33], w[34], offset);
      w[44] = hc_bytealign_S (w[32], w[33], offset);
      w[43] = hc_bytealign_S (w[31], w[32], offset);
      w[42] = hc_bytealign_S (w[30], w[31], offset);
      w[41] = hc_bytealign_S (w[29], w[30], offset);
      w[40] = hc_bytealign_S (w[28], w[29], offset);
      w[39] = hc_bytealign_S (w[27], w[28], offset);
      w[38] = hc_bytealign_S (w[26], w[27], offset);
      w[37] = hc_bytealign_S (w[25], w[26], offset);
      w[36] = hc_bytealign_S (w[24], w[25], offset);
      w[35] = hc_bytealign_S (w[23], w[24], offset);
      w[34] = hc_bytealign_S (w[22], w[23], offset);
      w[33] = hc_bytealign_S (w[21], w[22], offset);
      w[32] = hc_bytealign_S (w[20], w[21], offset);
      w[31] = hc_bytealign_S (w[19], w[20], offset);
      w[30] = hc_bytealign_S (w[18], w[19], offset);
      w[29] = hc_bytealign_S (w[17], w[18], offset);
      w[28] = hc_bytealign_S (w[16], w[17], offset);
      w[27] = hc_bytealign_S (w[15], w[16], offset);
      w[26] = hc_bytealign_S (w[14], w[15], offset);
      w[25] = hc_bytealign_S (w[13], w[14], offset);
      w[24] = hc_bytealign_S (w[12], w[13], offset);
      w[23] = hc_bytealign_S (w[11], w[12], offset);
      w[22] = hc_bytealign_S (w[10], w[11], offset);
      w[21] = hc_bytealign_S (w[ 9], w[10], offset);
      w[20] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[19] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[18] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[17] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[16] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[15] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[14] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[13] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[12] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[11] = hc_bytealign_S (    0, w[ 0], offset);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_bytealign_S (w[50], w[51], offset);
      w[62] = hc_bytealign_S (w[49], w[50], offset);
      w[61] = hc_bytealign_S (w[48], w[49], offset);
      w[60] = hc_bytealign_S (w[47], w[48], offset);
      w[59] = hc_bytealign_S (w[46], w[47], offset);
      w[58] = hc_bytealign_S (w[45], w[46], offset);
      w[57] = hc_bytealign_S (w[44], w[45], offset);
      w[56] = hc_bytealign_S (w[43], w[44], offset);
      w[55] = hc_bytealign_S (w[42], w[43], offset);
      w[54] = hc_bytealign_S (w[41], w[42], offset);
      w[53] = hc_bytealign_S (w[40], w[41], offset);
      w[52] = hc_bytealign_S (w[39], w[40], offset);
      w[51] = hc_bytealign_S (w[38], w[39], offset);
      w[50] = hc_bytealign_S (w[37], w[38], offset);
      w[49] = hc_bytealign_S (w[36], w[37], offset);
      w[48] = hc_bytealign_S (w[35], w[36], offset);
      w[47] = hc_bytealign_S (w[34], w[35], offset);
      w[46] = hc_bytealign_S (w[33], w[34], offset);
      w[45] = hc_bytealign_S (w[32], w[33], offset);
      w[44] = hc_bytealign_S (w[31], w[32], offset);
      w[43] = hc_bytealign_S (w[30], w[31], offset);
      w[42] = hc_bytealign_S (w[29], w[30], offset);
      w[41] = hc_bytealign_S (w[28], w[29], offset);
      w[40] = hc_bytealign_S (w[27], w[28], offset);
      w[39] = hc_bytealign_S (w[26], w[27], offset);
      w[38] = hc_bytealign_S (w[25], w[26], offset);
      w[37] = hc_bytealign_S (w[24], w[25], offset);
      w[36] = hc_bytealign_S (w[23], w[24], offset);
      w[35] = hc_bytealign_S (w[22], w[23], offset);
      w[34] = hc_bytealign_S (w[21], w[22], offset);
      w[33] = hc_bytealign_S (w[20], w[21], offset);
      w[32] = hc_bytealign_S (w[19], w[20], offset);
      w[31] = hc_bytealign_S (w[18], w[19], offset);
      w[30] = hc_bytealign_S (w[17], w[18], offset);
      w[29] = hc_bytealign_S (w[16], w[17], offset);
      w[28] = hc_bytealign_S (w[15], w[16], offset);
      w[27] = hc_bytealign_S (w[14], w[15], offset);
      w[26] = hc_bytealign_S (w[13], w[14], offset);
      w[25] = hc_bytealign_S (w[12], w[13], offset);
      w[24] = hc_bytealign_S (w[11], w[12], offset);
      w[23] = hc_bytealign_S (w[10], w[11], offset);
      w[22] = hc_bytealign_S (w[ 9], w[10], offset);
      w[21] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[20] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[19] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[18] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[17] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[16] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[15] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[14] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[13] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[12] = hc_bytealign_S (    0, w[ 0], offset);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_bytealign_S (w[49], w[50], offset);
      w[62] = hc_bytealign_S (w[48], w[49], offset);
      w[61] = hc_bytealign_S (w[47], w[48], offset);
      w[60] = hc_bytealign_S (w[46], w[47], offset);
      w[59] = hc_bytealign_S (w[45], w[46], offset);
      w[58] = hc_bytealign_S (w[44], w[45], offset);
      w[57] = hc_bytealign_S (w[43], w[44], offset);
      w[56] = hc_bytealign_S (w[42], w[43], offset);
      w[55] = hc_bytealign_S (w[41], w[42], offset);
      w[54] = hc_bytealign_S (w[40], w[41], offset);
      w[53] = hc_bytealign_S (w[39], w[40], offset);
      w[52] = hc_bytealign_S (w[38], w[39], offset);
      w[51] = hc_bytealign_S (w[37], w[38], offset);
      w[50] = hc_bytealign_S (w[36], w[37], offset);
      w[49] = hc_bytealign_S (w[35], w[36], offset);
      w[48] = hc_bytealign_S (w[34], w[35], offset);
      w[47] = hc_bytealign_S (w[33], w[34], offset);
      w[46] = hc_bytealign_S (w[32], w[33], offset);
      w[45] = hc_bytealign_S (w[31], w[32], offset);
      w[44] = hc_bytealign_S (w[30], w[31], offset);
      w[43] = hc_bytealign_S (w[29], w[30], offset);
      w[42] = hc_bytealign_S (w[28], w[29], offset);
      w[41] = hc_bytealign_S (w[27], w[28], offset);
      w[40] = hc_bytealign_S (w[26], w[27], offset);
      w[39] = hc_bytealign_S (w[25], w[26], offset);
      w[38] = hc_bytealign_S (w[24], w[25], offset);
      w[37] = hc_bytealign_S (w[23], w[24], offset);
      w[36] = hc_bytealign_S (w[22], w[23], offset);
      w[35] = hc_bytealign_S (w[21], w[22], offset);
      w[34] = hc_bytealign_S (w[20], w[21], offset);
      w[33] = hc_bytealign_S (w[19], w[20], offset);
      w[32] = hc_bytealign_S (w[18], w[19], offset);
      w[31] = hc_bytealign_S (w[17], w[18], offset);
      w[30] = hc_bytealign_S (w[16], w[17], offset);
      w[29] = hc_bytealign_S (w[15], w[16], offset);
      w[28] = hc_bytealign_S (w[14], w[15], offset);
      w[27] = hc_bytealign_S (w[13], w[14], offset);
      w[26] = hc_bytealign_S (w[12], w[13], offset);
      w[25] = hc_bytealign_S (w[11], w[12], offset);
      w[24] = hc_bytealign_S (w[10], w[11], offset);
      w[23] = hc_bytealign_S (w[ 9], w[10], offset);
      w[22] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[21] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[20] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[19] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[18] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[17] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[16] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[15] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[14] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[13] = hc_bytealign_S (    0, w[ 0], offset);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_bytealign_S (w[48], w[49], offset);
      w[62] = hc_bytealign_S (w[47], w[48], offset);
      w[61] = hc_bytealign_S (w[46], w[47], offset);
      w[60] = hc_bytealign_S (w[45], w[46], offset);
      w[59] = hc_bytealign_S (w[44], w[45], offset);
      w[58] = hc_bytealign_S (w[43], w[44], offset);
      w[57] = hc_bytealign_S (w[42], w[43], offset);
      w[56] = hc_bytealign_S (w[41], w[42], offset);
      w[55] = hc_bytealign_S (w[40], w[41], offset);
      w[54] = hc_bytealign_S (w[39], w[40], offset);
      w[53] = hc_bytealign_S (w[38], w[39], offset);
      w[52] = hc_bytealign_S (w[37], w[38], offset);
      w[51] = hc_bytealign_S (w[36], w[37], offset);
      w[50] = hc_bytealign_S (w[35], w[36], offset);
      w[49] = hc_bytealign_S (w[34], w[35], offset);
      w[48] = hc_bytealign_S (w[33], w[34], offset);
      w[47] = hc_bytealign_S (w[32], w[33], offset);
      w[46] = hc_bytealign_S (w[31], w[32], offset);
      w[45] = hc_bytealign_S (w[30], w[31], offset);
      w[44] = hc_bytealign_S (w[29], w[30], offset);
      w[43] = hc_bytealign_S (w[28], w[29], offset);
      w[42] = hc_bytealign_S (w[27], w[28], offset);
      w[41] = hc_bytealign_S (w[26], w[27], offset);
      w[40] = hc_bytealign_S (w[25], w[26], offset);
      w[39] = hc_bytealign_S (w[24], w[25], offset);
      w[38] = hc_bytealign_S (w[23], w[24], offset);
      w[37] = hc_bytealign_S (w[22], w[23], offset);
      w[36] = hc_bytealign_S (w[21], w[22], offset);
      w[35] = hc_bytealign_S (w[20], w[21], offset);
      w[34] = hc_bytealign_S (w[19], w[20], offset);
      w[33] = hc_bytealign_S (w[18], w[19], offset);
      w[32] = hc_bytealign_S (w[17], w[18], offset);
      w[31] = hc_bytealign_S (w[16], w[17], offset);
      w[30] = hc_bytealign_S (w[15], w[16], offset);
      w[29] = hc_bytealign_S (w[14], w[15], offset);
      w[28] = hc_bytealign_S (w[13], w[14], offset);
      w[27] = hc_bytealign_S (w[12], w[13], offset);
      w[26] = hc_bytealign_S (w[11], w[12], offset);
      w[25] = hc_bytealign_S (w[10], w[11], offset);
      w[24] = hc_bytealign_S (w[ 9], w[10], offset);
      w[23] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[22] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[21] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[20] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[19] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[18] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[17] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[16] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[15] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[14] = hc_bytealign_S (    0, w[ 0], offset);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_bytealign_S (w[47], w[48], offset);
      w[62] = hc_bytealign_S (w[46], w[47], offset);
      w[61] = hc_bytealign_S (w[45], w[46], offset);
      w[60] = hc_bytealign_S (w[44], w[45], offset);
      w[59] = hc_bytealign_S (w[43], w[44], offset);
      w[58] = hc_bytealign_S (w[42], w[43], offset);
      w[57] = hc_bytealign_S (w[41], w[42], offset);
      w[56] = hc_bytealign_S (w[40], w[41], offset);
      w[55] = hc_bytealign_S (w[39], w[40], offset);
      w[54] = hc_bytealign_S (w[38], w[39], offset);
      w[53] = hc_bytealign_S (w[37], w[38], offset);
      w[52] = hc_bytealign_S (w[36], w[37], offset);
      w[51] = hc_bytealign_S (w[35], w[36], offset);
      w[50] = hc_bytealign_S (w[34], w[35], offset);
      w[49] = hc_bytealign_S (w[33], w[34], offset);
      w[48] = hc_bytealign_S (w[32], w[33], offset);
      w[47] = hc_bytealign_S (w[31], w[32], offset);
      w[46] = hc_bytealign_S (w[30], w[31], offset);
      w[45] = hc_bytealign_S (w[29], w[30], offset);
      w[44] = hc_bytealign_S (w[28], w[29], offset);
      w[43] = hc_bytealign_S (w[27], w[28], offset);
      w[42] = hc_bytealign_S (w[26], w[27], offset);
      w[41] = hc_bytealign_S (w[25], w[26], offset);
      w[40] = hc_bytealign_S (w[24], w[25], offset);
      w[39] = hc_bytealign_S (w[23], w[24], offset);
      w[38] = hc_bytealign_S (w[22], w[23], offset);
      w[37] = hc_bytealign_S (w[21], w[22], offset);
      w[36] = hc_bytealign_S (w[20], w[21], offset);
      w[35] = hc_bytealign_S (w[19], w[20], offset);
      w[34] = hc_bytealign_S (w[18], w[19], offset);
      w[33] = hc_bytealign_S (w[17], w[18], offset);
      w[32] = hc_bytealign_S (w[16], w[17], offset);
      w[31] = hc_bytealign_S (w[15], w[16], offset);
      w[30] = hc_bytealign_S (w[14], w[15], offset);
      w[29] = hc_bytealign_S (w[13], w[14], offset);
      w[28] = hc_bytealign_S (w[12], w[13], offset);
      w[27] = hc_bytealign_S (w[11], w[12], offset);
      w[26] = hc_bytealign_S (w[10], w[11], offset);
      w[25] = hc_bytealign_S (w[ 9], w[10], offset);
      w[24] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[23] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[22] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[21] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[20] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[19] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[18] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[17] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[16] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[15] = hc_bytealign_S (    0, w[ 0], offset);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_bytealign_S (w[46], w[47], offset);
      w[62] = hc_bytealign_S (w[45], w[46], offset);
      w[61] = hc_bytealign_S (w[44], w[45], offset);
      w[60] = hc_bytealign_S (w[43], w[44], offset);
      w[59] = hc_bytealign_S (w[42], w[43], offset);
      w[58] = hc_bytealign_S (w[41], w[42], offset);
      w[57] = hc_bytealign_S (w[40], w[41], offset);
      w[56] = hc_bytealign_S (w[39], w[40], offset);
      w[55] = hc_bytealign_S (w[38], w[39], offset);
      w[54] = hc_bytealign_S (w[37], w[38], offset);
      w[53] = hc_bytealign_S (w[36], w[37], offset);
      w[52] = hc_bytealign_S (w[35], w[36], offset);
      w[51] = hc_bytealign_S (w[34], w[35], offset);
      w[50] = hc_bytealign_S (w[33], w[34], offset);
      w[49] = hc_bytealign_S (w[32], w[33], offset);
      w[48] = hc_bytealign_S (w[31], w[32], offset);
      w[47] = hc_bytealign_S (w[30], w[31], offset);
      w[46] = hc_bytealign_S (w[29], w[30], offset);
      w[45] = hc_bytealign_S (w[28], w[29], offset);
      w[44] = hc_bytealign_S (w[27], w[28], offset);
      w[43] = hc_bytealign_S (w[26], w[27], offset);
      w[42] = hc_bytealign_S (w[25], w[26], offset);
      w[41] = hc_bytealign_S (w[24], w[25], offset);
      w[40] = hc_bytealign_S (w[23], w[24], offset);
      w[39] = hc_bytealign_S (w[22], w[23], offset);
      w[38] = hc_bytealign_S (w[21], w[22], offset);
      w[37] = hc_bytealign_S (w[20], w[21], offset);
      w[36] = hc_bytealign_S (w[19], w[20], offset);
      w[35] = hc_bytealign_S (w[18], w[19], offset);
      w[34] = hc_bytealign_S (w[17], w[18], offset);
      w[33] = hc_bytealign_S (w[16], w[17], offset);
      w[32] = hc_bytealign_S (w[15], w[16], offset);
      w[31] = hc_bytealign_S (w[14], w[15], offset);
      w[30] = hc_bytealign_S (w[13], w[14], offset);
      w[29] = hc_bytealign_S (w[12], w[13], offset);
      w[28] = hc_bytealign_S (w[11], w[12], offset);
      w[27] = hc_bytealign_S (w[10], w[11], offset);
      w[26] = hc_bytealign_S (w[ 9], w[10], offset);
      w[25] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[24] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[23] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[22] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[21] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[20] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[19] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[18] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[17] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[16] = hc_bytealign_S (    0, w[ 0], offset);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_bytealign_S (w[45], w[46], offset);
      w[62] = hc_bytealign_S (w[44], w[45], offset);
      w[61] = hc_bytealign_S (w[43], w[44], offset);
      w[60] = hc_bytealign_S (w[42], w[43], offset);
      w[59] = hc_bytealign_S (w[41], w[42], offset);
      w[58] = hc_bytealign_S (w[40], w[41], offset);
      w[57] = hc_bytealign_S (w[39], w[40], offset);
      w[56] = hc_bytealign_S (w[38], w[39], offset);
      w[55] = hc_bytealign_S (w[37], w[38], offset);
      w[54] = hc_bytealign_S (w[36], w[37], offset);
      w[53] = hc_bytealign_S (w[35], w[36], offset);
      w[52] = hc_bytealign_S (w[34], w[35], offset);
      w[51] = hc_bytealign_S (w[33], w[34], offset);
      w[50] = hc_bytealign_S (w[32], w[33], offset);
      w[49] = hc_bytealign_S (w[31], w[32], offset);
      w[48] = hc_bytealign_S (w[30], w[31], offset);
      w[47] = hc_bytealign_S (w[29], w[30], offset);
      w[46] = hc_bytealign_S (w[28], w[29], offset);
      w[45] = hc_bytealign_S (w[27], w[28], offset);
      w[44] = hc_bytealign_S (w[26], w[27], offset);
      w[43] = hc_bytealign_S (w[25], w[26], offset);
      w[42] = hc_bytealign_S (w[24], w[25], offset);
      w[41] = hc_bytealign_S (w[23], w[24], offset);
      w[40] = hc_bytealign_S (w[22], w[23], offset);
      w[39] = hc_bytealign_S (w[21], w[22], offset);
      w[38] = hc_bytealign_S (w[20], w[21], offset);
      w[37] = hc_bytealign_S (w[19], w[20], offset);
      w[36] = hc_bytealign_S (w[18], w[19], offset);
      w[35] = hc_bytealign_S (w[17], w[18], offset);
      w[34] = hc_bytealign_S (w[16], w[17], offset);
      w[33] = hc_bytealign_S (w[15], w[16], offset);
      w[32] = hc_bytealign_S (w[14], w[15], offset);
      w[31] = hc_bytealign_S (w[13], w[14], offset);
      w[30] = hc_bytealign_S (w[12], w[13], offset);
      w[29] = hc_bytealign_S (w[11], w[12], offset);
      w[28] = hc_bytealign_S (w[10], w[11], offset);
      w[27] = hc_bytealign_S (w[ 9], w[10], offset);
      w[26] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[25] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[24] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[23] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[22] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[21] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[20] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[19] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[18] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[17] = hc_bytealign_S (    0, w[ 0], offset);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_bytealign_S (w[44], w[45], offset);
      w[62] = hc_bytealign_S (w[43], w[44], offset);
      w[61] = hc_bytealign_S (w[42], w[43], offset);
      w[60] = hc_bytealign_S (w[41], w[42], offset);
      w[59] = hc_bytealign_S (w[40], w[41], offset);
      w[58] = hc_bytealign_S (w[39], w[40], offset);
      w[57] = hc_bytealign_S (w[38], w[39], offset);
      w[56] = hc_bytealign_S (w[37], w[38], offset);
      w[55] = hc_bytealign_S (w[36], w[37], offset);
      w[54] = hc_bytealign_S (w[35], w[36], offset);
      w[53] = hc_bytealign_S (w[34], w[35], offset);
      w[52] = hc_bytealign_S (w[33], w[34], offset);
      w[51] = hc_bytealign_S (w[32], w[33], offset);
      w[50] = hc_bytealign_S (w[31], w[32], offset);
      w[49] = hc_bytealign_S (w[30], w[31], offset);
      w[48] = hc_bytealign_S (w[29], w[30], offset);
      w[47] = hc_bytealign_S (w[28], w[29], offset);
      w[46] = hc_bytealign_S (w[27], w[28], offset);
      w[45] = hc_bytealign_S (w[26], w[27], offset);
      w[44] = hc_bytealign_S (w[25], w[26], offset);
      w[43] = hc_bytealign_S (w[24], w[25], offset);
      w[42] = hc_bytealign_S (w[23], w[24], offset);
      w[41] = hc_bytealign_S (w[22], w[23], offset);
      w[40] = hc_bytealign_S (w[21], w[22], offset);
      w[39] = hc_bytealign_S (w[20], w[21], offset);
      w[38] = hc_bytealign_S (w[19], w[20], offset);
      w[37] = hc_bytealign_S (w[18], w[19], offset);
      w[36] = hc_bytealign_S (w[17], w[18], offset);
      w[35] = hc_bytealign_S (w[16], w[17], offset);
      w[34] = hc_bytealign_S (w[15], w[16], offset);
      w[33] = hc_bytealign_S (w[14], w[15], offset);
      w[32] = hc_bytealign_S (w[13], w[14], offset);
      w[31] = hc_bytealign_S (w[12], w[13], offset);
      w[30] = hc_bytealign_S (w[11], w[12], offset);
      w[29] = hc_bytealign_S (w[10], w[11], offset);
      w[28] = hc_bytealign_S (w[ 9], w[10], offset);
      w[27] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[26] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[25] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[24] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[23] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[22] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[21] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[20] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[19] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[18] = hc_bytealign_S (    0, w[ 0], offset);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_bytealign_S (w[43], w[44], offset);
      w[62] = hc_bytealign_S (w[42], w[43], offset);
      w[61] = hc_bytealign_S (w[41], w[42], offset);
      w[60] = hc_bytealign_S (w[40], w[41], offset);
      w[59] = hc_bytealign_S (w[39], w[40], offset);
      w[58] = hc_bytealign_S (w[38], w[39], offset);
      w[57] = hc_bytealign_S (w[37], w[38], offset);
      w[56] = hc_bytealign_S (w[36], w[37], offset);
      w[55] = hc_bytealign_S (w[35], w[36], offset);
      w[54] = hc_bytealign_S (w[34], w[35], offset);
      w[53] = hc_bytealign_S (w[33], w[34], offset);
      w[52] = hc_bytealign_S (w[32], w[33], offset);
      w[51] = hc_bytealign_S (w[31], w[32], offset);
      w[50] = hc_bytealign_S (w[30], w[31], offset);
      w[49] = hc_bytealign_S (w[29], w[30], offset);
      w[48] = hc_bytealign_S (w[28], w[29], offset);
      w[47] = hc_bytealign_S (w[27], w[28], offset);
      w[46] = hc_bytealign_S (w[26], w[27], offset);
      w[45] = hc_bytealign_S (w[25], w[26], offset);
      w[44] = hc_bytealign_S (w[24], w[25], offset);
      w[43] = hc_bytealign_S (w[23], w[24], offset);
      w[42] = hc_bytealign_S (w[22], w[23], offset);
      w[41] = hc_bytealign_S (w[21], w[22], offset);
      w[40] = hc_bytealign_S (w[20], w[21], offset);
      w[39] = hc_bytealign_S (w[19], w[20], offset);
      w[38] = hc_bytealign_S (w[18], w[19], offset);
      w[37] = hc_bytealign_S (w[17], w[18], offset);
      w[36] = hc_bytealign_S (w[16], w[17], offset);
      w[35] = hc_bytealign_S (w[15], w[16], offset);
      w[34] = hc_bytealign_S (w[14], w[15], offset);
      w[33] = hc_bytealign_S (w[13], w[14], offset);
      w[32] = hc_bytealign_S (w[12], w[13], offset);
      w[31] = hc_bytealign_S (w[11], w[12], offset);
      w[30] = hc_bytealign_S (w[10], w[11], offset);
      w[29] = hc_bytealign_S (w[ 9], w[10], offset);
      w[28] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[27] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[26] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[25] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[24] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[23] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[22] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[21] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[20] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[19] = hc_bytealign_S (    0, w[ 0], offset);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_bytealign_S (w[42], w[43], offset);
      w[62] = hc_bytealign_S (w[41], w[42], offset);
      w[61] = hc_bytealign_S (w[40], w[41], offset);
      w[60] = hc_bytealign_S (w[39], w[40], offset);
      w[59] = hc_bytealign_S (w[38], w[39], offset);
      w[58] = hc_bytealign_S (w[37], w[38], offset);
      w[57] = hc_bytealign_S (w[36], w[37], offset);
      w[56] = hc_bytealign_S (w[35], w[36], offset);
      w[55] = hc_bytealign_S (w[34], w[35], offset);
      w[54] = hc_bytealign_S (w[33], w[34], offset);
      w[53] = hc_bytealign_S (w[32], w[33], offset);
      w[52] = hc_bytealign_S (w[31], w[32], offset);
      w[51] = hc_bytealign_S (w[30], w[31], offset);
      w[50] = hc_bytealign_S (w[29], w[30], offset);
      w[49] = hc_bytealign_S (w[28], w[29], offset);
      w[48] = hc_bytealign_S (w[27], w[28], offset);
      w[47] = hc_bytealign_S (w[26], w[27], offset);
      w[46] = hc_bytealign_S (w[25], w[26], offset);
      w[45] = hc_bytealign_S (w[24], w[25], offset);
      w[44] = hc_bytealign_S (w[23], w[24], offset);
      w[43] = hc_bytealign_S (w[22], w[23], offset);
      w[42] = hc_bytealign_S (w[21], w[22], offset);
      w[41] = hc_bytealign_S (w[20], w[21], offset);
      w[40] = hc_bytealign_S (w[19], w[20], offset);
      w[39] = hc_bytealign_S (w[18], w[19], offset);
      w[38] = hc_bytealign_S (w[17], w[18], offset);
      w[37] = hc_bytealign_S (w[16], w[17], offset);
      w[36] = hc_bytealign_S (w[15], w[16], offset);
      w[35] = hc_bytealign_S (w[14], w[15], offset);
      w[34] = hc_bytealign_S (w[13], w[14], offset);
      w[33] = hc_bytealign_S (w[12], w[13], offset);
      w[32] = hc_bytealign_S (w[11], w[12], offset);
      w[31] = hc_bytealign_S (w[10], w[11], offset);
      w[30] = hc_bytealign_S (w[ 9], w[10], offset);
      w[29] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[28] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[27] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[26] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[25] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[24] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[23] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[22] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[21] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[20] = hc_bytealign_S (    0, w[ 0], offset);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_bytealign_S (w[41], w[42], offset);
      w[62] = hc_bytealign_S (w[40], w[41], offset);
      w[61] = hc_bytealign_S (w[39], w[40], offset);
      w[60] = hc_bytealign_S (w[38], w[39], offset);
      w[59] = hc_bytealign_S (w[37], w[38], offset);
      w[58] = hc_bytealign_S (w[36], w[37], offset);
      w[57] = hc_bytealign_S (w[35], w[36], offset);
      w[56] = hc_bytealign_S (w[34], w[35], offset);
      w[55] = hc_bytealign_S (w[33], w[34], offset);
      w[54] = hc_bytealign_S (w[32], w[33], offset);
      w[53] = hc_bytealign_S (w[31], w[32], offset);
      w[52] = hc_bytealign_S (w[30], w[31], offset);
      w[51] = hc_bytealign_S (w[29], w[30], offset);
      w[50] = hc_bytealign_S (w[28], w[29], offset);
      w[49] = hc_bytealign_S (w[27], w[28], offset);
      w[48] = hc_bytealign_S (w[26], w[27], offset);
      w[47] = hc_bytealign_S (w[25], w[26], offset);
      w[46] = hc_bytealign_S (w[24], w[25], offset);
      w[45] = hc_bytealign_S (w[23], w[24], offset);
      w[44] = hc_bytealign_S (w[22], w[23], offset);
      w[43] = hc_bytealign_S (w[21], w[22], offset);
      w[42] = hc_bytealign_S (w[20], w[21], offset);
      w[41] = hc_bytealign_S (w[19], w[20], offset);
      w[40] = hc_bytealign_S (w[18], w[19], offset);
      w[39] = hc_bytealign_S (w[17], w[18], offset);
      w[38] = hc_bytealign_S (w[16], w[17], offset);
      w[37] = hc_bytealign_S (w[15], w[16], offset);
      w[36] = hc_bytealign_S (w[14], w[15], offset);
      w[35] = hc_bytealign_S (w[13], w[14], offset);
      w[34] = hc_bytealign_S (w[12], w[13], offset);
      w[33] = hc_bytealign_S (w[11], w[12], offset);
      w[32] = hc_bytealign_S (w[10], w[11], offset);
      w[31] = hc_bytealign_S (w[ 9], w[10], offset);
      w[30] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[29] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[28] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[27] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[26] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[25] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[24] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[23] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[22] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[21] = hc_bytealign_S (    0, w[ 0], offset);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_bytealign_S (w[40], w[41], offset);
      w[62] = hc_bytealign_S (w[39], w[40], offset);
      w[61] = hc_bytealign_S (w[38], w[39], offset);
      w[60] = hc_bytealign_S (w[37], w[38], offset);
      w[59] = hc_bytealign_S (w[36], w[37], offset);
      w[58] = hc_bytealign_S (w[35], w[36], offset);
      w[57] = hc_bytealign_S (w[34], w[35], offset);
      w[56] = hc_bytealign_S (w[33], w[34], offset);
      w[55] = hc_bytealign_S (w[32], w[33], offset);
      w[54] = hc_bytealign_S (w[31], w[32], offset);
      w[53] = hc_bytealign_S (w[30], w[31], offset);
      w[52] = hc_bytealign_S (w[29], w[30], offset);
      w[51] = hc_bytealign_S (w[28], w[29], offset);
      w[50] = hc_bytealign_S (w[27], w[28], offset);
      w[49] = hc_bytealign_S (w[26], w[27], offset);
      w[48] = hc_bytealign_S (w[25], w[26], offset);
      w[47] = hc_bytealign_S (w[24], w[25], offset);
      w[46] = hc_bytealign_S (w[23], w[24], offset);
      w[45] = hc_bytealign_S (w[22], w[23], offset);
      w[44] = hc_bytealign_S (w[21], w[22], offset);
      w[43] = hc_bytealign_S (w[20], w[21], offset);
      w[42] = hc_bytealign_S (w[19], w[20], offset);
      w[41] = hc_bytealign_S (w[18], w[19], offset);
      w[40] = hc_bytealign_S (w[17], w[18], offset);
      w[39] = hc_bytealign_S (w[16], w[17], offset);
      w[38] = hc_bytealign_S (w[15], w[16], offset);
      w[37] = hc_bytealign_S (w[14], w[15], offset);
      w[36] = hc_bytealign_S (w[13], w[14], offset);
      w[35] = hc_bytealign_S (w[12], w[13], offset);
      w[34] = hc_bytealign_S (w[11], w[12], offset);
      w[33] = hc_bytealign_S (w[10], w[11], offset);
      w[32] = hc_bytealign_S (w[ 9], w[10], offset);
      w[31] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[30] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[29] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[28] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[27] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[26] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[25] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[24] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[23] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[22] = hc_bytealign_S (    0, w[ 0], offset);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_bytealign_S (w[39], w[40], offset);
      w[62] = hc_bytealign_S (w[38], w[39], offset);
      w[61] = hc_bytealign_S (w[37], w[38], offset);
      w[60] = hc_bytealign_S (w[36], w[37], offset);
      w[59] = hc_bytealign_S (w[35], w[36], offset);
      w[58] = hc_bytealign_S (w[34], w[35], offset);
      w[57] = hc_bytealign_S (w[33], w[34], offset);
      w[56] = hc_bytealign_S (w[32], w[33], offset);
      w[55] = hc_bytealign_S (w[31], w[32], offset);
      w[54] = hc_bytealign_S (w[30], w[31], offset);
      w[53] = hc_bytealign_S (w[29], w[30], offset);
      w[52] = hc_bytealign_S (w[28], w[29], offset);
      w[51] = hc_bytealign_S (w[27], w[28], offset);
      w[50] = hc_bytealign_S (w[26], w[27], offset);
      w[49] = hc_bytealign_S (w[25], w[26], offset);
      w[48] = hc_bytealign_S (w[24], w[25], offset);
      w[47] = hc_bytealign_S (w[23], w[24], offset);
      w[46] = hc_bytealign_S (w[22], w[23], offset);
      w[45] = hc_bytealign_S (w[21], w[22], offset);
      w[44] = hc_bytealign_S (w[20], w[21], offset);
      w[43] = hc_bytealign_S (w[19], w[20], offset);
      w[42] = hc_bytealign_S (w[18], w[19], offset);
      w[41] = hc_bytealign_S (w[17], w[18], offset);
      w[40] = hc_bytealign_S (w[16], w[17], offset);
      w[39] = hc_bytealign_S (w[15], w[16], offset);
      w[38] = hc_bytealign_S (w[14], w[15], offset);
      w[37] = hc_bytealign_S (w[13], w[14], offset);
      w[36] = hc_bytealign_S (w[12], w[13], offset);
      w[35] = hc_bytealign_S (w[11], w[12], offset);
      w[34] = hc_bytealign_S (w[10], w[11], offset);
      w[33] = hc_bytealign_S (w[ 9], w[10], offset);
      w[32] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[31] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[30] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[29] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[28] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[27] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[26] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[25] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[24] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[23] = hc_bytealign_S (    0, w[ 0], offset);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_bytealign_S (w[38], w[39], offset);
      w[62] = hc_bytealign_S (w[37], w[38], offset);
      w[61] = hc_bytealign_S (w[36], w[37], offset);
      w[60] = hc_bytealign_S (w[35], w[36], offset);
      w[59] = hc_bytealign_S (w[34], w[35], offset);
      w[58] = hc_bytealign_S (w[33], w[34], offset);
      w[57] = hc_bytealign_S (w[32], w[33], offset);
      w[56] = hc_bytealign_S (w[31], w[32], offset);
      w[55] = hc_bytealign_S (w[30], w[31], offset);
      w[54] = hc_bytealign_S (w[29], w[30], offset);
      w[53] = hc_bytealign_S (w[28], w[29], offset);
      w[52] = hc_bytealign_S (w[27], w[28], offset);
      w[51] = hc_bytealign_S (w[26], w[27], offset);
      w[50] = hc_bytealign_S (w[25], w[26], offset);
      w[49] = hc_bytealign_S (w[24], w[25], offset);
      w[48] = hc_bytealign_S (w[23], w[24], offset);
      w[47] = hc_bytealign_S (w[22], w[23], offset);
      w[46] = hc_bytealign_S (w[21], w[22], offset);
      w[45] = hc_bytealign_S (w[20], w[21], offset);
      w[44] = hc_bytealign_S (w[19], w[20], offset);
      w[43] = hc_bytealign_S (w[18], w[19], offset);
      w[42] = hc_bytealign_S (w[17], w[18], offset);
      w[41] = hc_bytealign_S (w[16], w[17], offset);
      w[40] = hc_bytealign_S (w[15], w[16], offset);
      w[39] = hc_bytealign_S (w[14], w[15], offset);
      w[38] = hc_bytealign_S (w[13], w[14], offset);
      w[37] = hc_bytealign_S (w[12], w[13], offset);
      w[36] = hc_bytealign_S (w[11], w[12], offset);
      w[35] = hc_bytealign_S (w[10], w[11], offset);
      w[34] = hc_bytealign_S (w[ 9], w[10], offset);
      w[33] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[32] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[31] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[30] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[29] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[28] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[27] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[26] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[25] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[24] = hc_bytealign_S (    0, w[ 0], offset);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_bytealign_S (w[37], w[38], offset);
      w[62] = hc_bytealign_S (w[36], w[37], offset);
      w[61] = hc_bytealign_S (w[35], w[36], offset);
      w[60] = hc_bytealign_S (w[34], w[35], offset);
      w[59] = hc_bytealign_S (w[33], w[34], offset);
      w[58] = hc_bytealign_S (w[32], w[33], offset);
      w[57] = hc_bytealign_S (w[31], w[32], offset);
      w[56] = hc_bytealign_S (w[30], w[31], offset);
      w[55] = hc_bytealign_S (w[29], w[30], offset);
      w[54] = hc_bytealign_S (w[28], w[29], offset);
      w[53] = hc_bytealign_S (w[27], w[28], offset);
      w[52] = hc_bytealign_S (w[26], w[27], offset);
      w[51] = hc_bytealign_S (w[25], w[26], offset);
      w[50] = hc_bytealign_S (w[24], w[25], offset);
      w[49] = hc_bytealign_S (w[23], w[24], offset);
      w[48] = hc_bytealign_S (w[22], w[23], offset);
      w[47] = hc_bytealign_S (w[21], w[22], offset);
      w[46] = hc_bytealign_S (w[20], w[21], offset);
      w[45] = hc_bytealign_S (w[19], w[20], offset);
      w[44] = hc_bytealign_S (w[18], w[19], offset);
      w[43] = hc_bytealign_S (w[17], w[18], offset);
      w[42] = hc_bytealign_S (w[16], w[17], offset);
      w[41] = hc_bytealign_S (w[15], w[16], offset);
      w[40] = hc_bytealign_S (w[14], w[15], offset);
      w[39] = hc_bytealign_S (w[13], w[14], offset);
      w[38] = hc_bytealign_S (w[12], w[13], offset);
      w[37] = hc_bytealign_S (w[11], w[12], offset);
      w[36] = hc_bytealign_S (w[10], w[11], offset);
      w[35] = hc_bytealign_S (w[ 9], w[10], offset);
      w[34] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[33] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[32] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[31] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[30] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[29] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[28] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[27] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[26] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[25] = hc_bytealign_S (    0, w[ 0], offset);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_bytealign_S (w[36], w[37], offset);
      w[62] = hc_bytealign_S (w[35], w[36], offset);
      w[61] = hc_bytealign_S (w[34], w[35], offset);
      w[60] = hc_bytealign_S (w[33], w[34], offset);
      w[59] = hc_bytealign_S (w[32], w[33], offset);
      w[58] = hc_bytealign_S (w[31], w[32], offset);
      w[57] = hc_bytealign_S (w[30], w[31], offset);
      w[56] = hc_bytealign_S (w[29], w[30], offset);
      w[55] = hc_bytealign_S (w[28], w[29], offset);
      w[54] = hc_bytealign_S (w[27], w[28], offset);
      w[53] = hc_bytealign_S (w[26], w[27], offset);
      w[52] = hc_bytealign_S (w[25], w[26], offset);
      w[51] = hc_bytealign_S (w[24], w[25], offset);
      w[50] = hc_bytealign_S (w[23], w[24], offset);
      w[49] = hc_bytealign_S (w[22], w[23], offset);
      w[48] = hc_bytealign_S (w[21], w[22], offset);
      w[47] = hc_bytealign_S (w[20], w[21], offset);
      w[46] = hc_bytealign_S (w[19], w[20], offset);
      w[45] = hc_bytealign_S (w[18], w[19], offset);
      w[44] = hc_bytealign_S (w[17], w[18], offset);
      w[43] = hc_bytealign_S (w[16], w[17], offset);
      w[42] = hc_bytealign_S (w[15], w[16], offset);
      w[41] = hc_bytealign_S (w[14], w[15], offset);
      w[40] = hc_bytealign_S (w[13], w[14], offset);
      w[39] = hc_bytealign_S (w[12], w[13], offset);
      w[38] = hc_bytealign_S (w[11], w[12], offset);
      w[37] = hc_bytealign_S (w[10], w[11], offset);
      w[36] = hc_bytealign_S (w[ 9], w[10], offset);
      w[35] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[34] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[33] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[32] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[31] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[30] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[29] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[28] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[27] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[26] = hc_bytealign_S (    0, w[ 0], offset);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_bytealign_S (w[35], w[36], offset);
      w[62] = hc_bytealign_S (w[34], w[35], offset);
      w[61] = hc_bytealign_S (w[33], w[34], offset);
      w[60] = hc_bytealign_S (w[32], w[33], offset);
      w[59] = hc_bytealign_S (w[31], w[32], offset);
      w[58] = hc_bytealign_S (w[30], w[31], offset);
      w[57] = hc_bytealign_S (w[29], w[30], offset);
      w[56] = hc_bytealign_S (w[28], w[29], offset);
      w[55] = hc_bytealign_S (w[27], w[28], offset);
      w[54] = hc_bytealign_S (w[26], w[27], offset);
      w[53] = hc_bytealign_S (w[25], w[26], offset);
      w[52] = hc_bytealign_S (w[24], w[25], offset);
      w[51] = hc_bytealign_S (w[23], w[24], offset);
      w[50] = hc_bytealign_S (w[22], w[23], offset);
      w[49] = hc_bytealign_S (w[21], w[22], offset);
      w[48] = hc_bytealign_S (w[20], w[21], offset);
      w[47] = hc_bytealign_S (w[19], w[20], offset);
      w[46] = hc_bytealign_S (w[18], w[19], offset);
      w[45] = hc_bytealign_S (w[17], w[18], offset);
      w[44] = hc_bytealign_S (w[16], w[17], offset);
      w[43] = hc_bytealign_S (w[15], w[16], offset);
      w[42] = hc_bytealign_S (w[14], w[15], offset);
      w[41] = hc_bytealign_S (w[13], w[14], offset);
      w[40] = hc_bytealign_S (w[12], w[13], offset);
      w[39] = hc_bytealign_S (w[11], w[12], offset);
      w[38] = hc_bytealign_S (w[10], w[11], offset);
      w[37] = hc_bytealign_S (w[ 9], w[10], offset);
      w[36] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[35] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[34] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[33] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[32] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[31] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[30] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[29] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[28] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[27] = hc_bytealign_S (    0, w[ 0], offset);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_bytealign_S (w[34], w[35], offset);
      w[62] = hc_bytealign_S (w[33], w[34], offset);
      w[61] = hc_bytealign_S (w[32], w[33], offset);
      w[60] = hc_bytealign_S (w[31], w[32], offset);
      w[59] = hc_bytealign_S (w[30], w[31], offset);
      w[58] = hc_bytealign_S (w[29], w[30], offset);
      w[57] = hc_bytealign_S (w[28], w[29], offset);
      w[56] = hc_bytealign_S (w[27], w[28], offset);
      w[55] = hc_bytealign_S (w[26], w[27], offset);
      w[54] = hc_bytealign_S (w[25], w[26], offset);
      w[53] = hc_bytealign_S (w[24], w[25], offset);
      w[52] = hc_bytealign_S (w[23], w[24], offset);
      w[51] = hc_bytealign_S (w[22], w[23], offset);
      w[50] = hc_bytealign_S (w[21], w[22], offset);
      w[49] = hc_bytealign_S (w[20], w[21], offset);
      w[48] = hc_bytealign_S (w[19], w[20], offset);
      w[47] = hc_bytealign_S (w[18], w[19], offset);
      w[46] = hc_bytealign_S (w[17], w[18], offset);
      w[45] = hc_bytealign_S (w[16], w[17], offset);
      w[44] = hc_bytealign_S (w[15], w[16], offset);
      w[43] = hc_bytealign_S (w[14], w[15], offset);
      w[42] = hc_bytealign_S (w[13], w[14], offset);
      w[41] = hc_bytealign_S (w[12], w[13], offset);
      w[40] = hc_bytealign_S (w[11], w[12], offset);
      w[39] = hc_bytealign_S (w[10], w[11], offset);
      w[38] = hc_bytealign_S (w[ 9], w[10], offset);
      w[37] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[36] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[35] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[34] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[33] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[32] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[31] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[30] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[29] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[28] = hc_bytealign_S (    0, w[ 0], offset);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_bytealign_S (w[33], w[34], offset);
      w[62] = hc_bytealign_S (w[32], w[33], offset);
      w[61] = hc_bytealign_S (w[31], w[32], offset);
      w[60] = hc_bytealign_S (w[30], w[31], offset);
      w[59] = hc_bytealign_S (w[29], w[30], offset);
      w[58] = hc_bytealign_S (w[28], w[29], offset);
      w[57] = hc_bytealign_S (w[27], w[28], offset);
      w[56] = hc_bytealign_S (w[26], w[27], offset);
      w[55] = hc_bytealign_S (w[25], w[26], offset);
      w[54] = hc_bytealign_S (w[24], w[25], offset);
      w[53] = hc_bytealign_S (w[23], w[24], offset);
      w[52] = hc_bytealign_S (w[22], w[23], offset);
      w[51] = hc_bytealign_S (w[21], w[22], offset);
      w[50] = hc_bytealign_S (w[20], w[21], offset);
      w[49] = hc_bytealign_S (w[19], w[20], offset);
      w[48] = hc_bytealign_S (w[18], w[19], offset);
      w[47] = hc_bytealign_S (w[17], w[18], offset);
      w[46] = hc_bytealign_S (w[16], w[17], offset);
      w[45] = hc_bytealign_S (w[15], w[16], offset);
      w[44] = hc_bytealign_S (w[14], w[15], offset);
      w[43] = hc_bytealign_S (w[13], w[14], offset);
      w[42] = hc_bytealign_S (w[12], w[13], offset);
      w[41] = hc_bytealign_S (w[11], w[12], offset);
      w[40] = hc_bytealign_S (w[10], w[11], offset);
      w[39] = hc_bytealign_S (w[ 9], w[10], offset);
      w[38] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[37] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[36] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[35] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[34] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[33] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[32] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[31] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[30] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[29] = hc_bytealign_S (    0, w[ 0], offset);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_bytealign_S (w[32], w[33], offset);
      w[62] = hc_bytealign_S (w[31], w[32], offset);
      w[61] = hc_bytealign_S (w[30], w[31], offset);
      w[60] = hc_bytealign_S (w[29], w[30], offset);
      w[59] = hc_bytealign_S (w[28], w[29], offset);
      w[58] = hc_bytealign_S (w[27], w[28], offset);
      w[57] = hc_bytealign_S (w[26], w[27], offset);
      w[56] = hc_bytealign_S (w[25], w[26], offset);
      w[55] = hc_bytealign_S (w[24], w[25], offset);
      w[54] = hc_bytealign_S (w[23], w[24], offset);
      w[53] = hc_bytealign_S (w[22], w[23], offset);
      w[52] = hc_bytealign_S (w[21], w[22], offset);
      w[51] = hc_bytealign_S (w[20], w[21], offset);
      w[50] = hc_bytealign_S (w[19], w[20], offset);
      w[49] = hc_bytealign_S (w[18], w[19], offset);
      w[48] = hc_bytealign_S (w[17], w[18], offset);
      w[47] = hc_bytealign_S (w[16], w[17], offset);
      w[46] = hc_bytealign_S (w[15], w[16], offset);
      w[45] = hc_bytealign_S (w[14], w[15], offset);
      w[44] = hc_bytealign_S (w[13], w[14], offset);
      w[43] = hc_bytealign_S (w[12], w[13], offset);
      w[42] = hc_bytealign_S (w[11], w[12], offset);
      w[41] = hc_bytealign_S (w[10], w[11], offset);
      w[40] = hc_bytealign_S (w[ 9], w[10], offset);
      w[39] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[38] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[37] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[36] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[35] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[34] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[33] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[32] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[31] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[30] = hc_bytealign_S (    0, w[ 0], offset);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_bytealign_S (w[31], w[32], offset);
      w[62] = hc_bytealign_S (w[30], w[31], offset);
      w[61] = hc_bytealign_S (w[29], w[30], offset);
      w[60] = hc_bytealign_S (w[28], w[29], offset);
      w[59] = hc_bytealign_S (w[27], w[28], offset);
      w[58] = hc_bytealign_S (w[26], w[27], offset);
      w[57] = hc_bytealign_S (w[25], w[26], offset);
      w[56] = hc_bytealign_S (w[24], w[25], offset);
      w[55] = hc_bytealign_S (w[23], w[24], offset);
      w[54] = hc_bytealign_S (w[22], w[23], offset);
      w[53] = hc_bytealign_S (w[21], w[22], offset);
      w[52] = hc_bytealign_S (w[20], w[21], offset);
      w[51] = hc_bytealign_S (w[19], w[20], offset);
      w[50] = hc_bytealign_S (w[18], w[19], offset);
      w[49] = hc_bytealign_S (w[17], w[18], offset);
      w[48] = hc_bytealign_S (w[16], w[17], offset);
      w[47] = hc_bytealign_S (w[15], w[16], offset);
      w[46] = hc_bytealign_S (w[14], w[15], offset);
      w[45] = hc_bytealign_S (w[13], w[14], offset);
      w[44] = hc_bytealign_S (w[12], w[13], offset);
      w[43] = hc_bytealign_S (w[11], w[12], offset);
      w[42] = hc_bytealign_S (w[10], w[11], offset);
      w[41] = hc_bytealign_S (w[ 9], w[10], offset);
      w[40] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[39] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[38] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[37] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[36] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[35] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[34] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[33] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[32] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[31] = hc_bytealign_S (    0, w[ 0], offset);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_bytealign_S (w[30], w[31], offset);
      w[62] = hc_bytealign_S (w[29], w[30], offset);
      w[61] = hc_bytealign_S (w[28], w[29], offset);
      w[60] = hc_bytealign_S (w[27], w[28], offset);
      w[59] = hc_bytealign_S (w[26], w[27], offset);
      w[58] = hc_bytealign_S (w[25], w[26], offset);
      w[57] = hc_bytealign_S (w[24], w[25], offset);
      w[56] = hc_bytealign_S (w[23], w[24], offset);
      w[55] = hc_bytealign_S (w[22], w[23], offset);
      w[54] = hc_bytealign_S (w[21], w[22], offset);
      w[53] = hc_bytealign_S (w[20], w[21], offset);
      w[52] = hc_bytealign_S (w[19], w[20], offset);
      w[51] = hc_bytealign_S (w[18], w[19], offset);
      w[50] = hc_bytealign_S (w[17], w[18], offset);
      w[49] = hc_bytealign_S (w[16], w[17], offset);
      w[48] = hc_bytealign_S (w[15], w[16], offset);
      w[47] = hc_bytealign_S (w[14], w[15], offset);
      w[46] = hc_bytealign_S (w[13], w[14], offset);
      w[45] = hc_bytealign_S (w[12], w[13], offset);
      w[44] = hc_bytealign_S (w[11], w[12], offset);
      w[43] = hc_bytealign_S (w[10], w[11], offset);
      w[42] = hc_bytealign_S (w[ 9], w[10], offset);
      w[41] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[40] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[39] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[38] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[37] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[36] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[35] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[34] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[33] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[32] = hc_bytealign_S (    0, w[ 0], offset);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_bytealign_S (w[29], w[30], offset);
      w[62] = hc_bytealign_S (w[28], w[29], offset);
      w[61] = hc_bytealign_S (w[27], w[28], offset);
      w[60] = hc_bytealign_S (w[26], w[27], offset);
      w[59] = hc_bytealign_S (w[25], w[26], offset);
      w[58] = hc_bytealign_S (w[24], w[25], offset);
      w[57] = hc_bytealign_S (w[23], w[24], offset);
      w[56] = hc_bytealign_S (w[22], w[23], offset);
      w[55] = hc_bytealign_S (w[21], w[22], offset);
      w[54] = hc_bytealign_S (w[20], w[21], offset);
      w[53] = hc_bytealign_S (w[19], w[20], offset);
      w[52] = hc_bytealign_S (w[18], w[19], offset);
      w[51] = hc_bytealign_S (w[17], w[18], offset);
      w[50] = hc_bytealign_S (w[16], w[17], offset);
      w[49] = hc_bytealign_S (w[15], w[16], offset);
      w[48] = hc_bytealign_S (w[14], w[15], offset);
      w[47] = hc_bytealign_S (w[13], w[14], offset);
      w[46] = hc_bytealign_S (w[12], w[13], offset);
      w[45] = hc_bytealign_S (w[11], w[12], offset);
      w[44] = hc_bytealign_S (w[10], w[11], offset);
      w[43] = hc_bytealign_S (w[ 9], w[10], offset);
      w[42] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[41] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[40] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[39] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[38] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[37] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[36] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[35] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[34] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[33] = hc_bytealign_S (    0, w[ 0], offset);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_bytealign_S (w[28], w[29], offset);
      w[62] = hc_bytealign_S (w[27], w[28], offset);
      w[61] = hc_bytealign_S (w[26], w[27], offset);
      w[60] = hc_bytealign_S (w[25], w[26], offset);
      w[59] = hc_bytealign_S (w[24], w[25], offset);
      w[58] = hc_bytealign_S (w[23], w[24], offset);
      w[57] = hc_bytealign_S (w[22], w[23], offset);
      w[56] = hc_bytealign_S (w[21], w[22], offset);
      w[55] = hc_bytealign_S (w[20], w[21], offset);
      w[54] = hc_bytealign_S (w[19], w[20], offset);
      w[53] = hc_bytealign_S (w[18], w[19], offset);
      w[52] = hc_bytealign_S (w[17], w[18], offset);
      w[51] = hc_bytealign_S (w[16], w[17], offset);
      w[50] = hc_bytealign_S (w[15], w[16], offset);
      w[49] = hc_bytealign_S (w[14], w[15], offset);
      w[48] = hc_bytealign_S (w[13], w[14], offset);
      w[47] = hc_bytealign_S (w[12], w[13], offset);
      w[46] = hc_bytealign_S (w[11], w[12], offset);
      w[45] = hc_bytealign_S (w[10], w[11], offset);
      w[44] = hc_bytealign_S (w[ 9], w[10], offset);
      w[43] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[42] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[41] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[40] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[39] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[38] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[37] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[36] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[35] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[34] = hc_bytealign_S (    0, w[ 0], offset);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_bytealign_S (w[27], w[28], offset);
      w[62] = hc_bytealign_S (w[26], w[27], offset);
      w[61] = hc_bytealign_S (w[25], w[26], offset);
      w[60] = hc_bytealign_S (w[24], w[25], offset);
      w[59] = hc_bytealign_S (w[23], w[24], offset);
      w[58] = hc_bytealign_S (w[22], w[23], offset);
      w[57] = hc_bytealign_S (w[21], w[22], offset);
      w[56] = hc_bytealign_S (w[20], w[21], offset);
      w[55] = hc_bytealign_S (w[19], w[20], offset);
      w[54] = hc_bytealign_S (w[18], w[19], offset);
      w[53] = hc_bytealign_S (w[17], w[18], offset);
      w[52] = hc_bytealign_S (w[16], w[17], offset);
      w[51] = hc_bytealign_S (w[15], w[16], offset);
      w[50] = hc_bytealign_S (w[14], w[15], offset);
      w[49] = hc_bytealign_S (w[13], w[14], offset);
      w[48] = hc_bytealign_S (w[12], w[13], offset);
      w[47] = hc_bytealign_S (w[11], w[12], offset);
      w[46] = hc_bytealign_S (w[10], w[11], offset);
      w[45] = hc_bytealign_S (w[ 9], w[10], offset);
      w[44] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[43] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[42] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[41] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[40] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[39] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[38] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[37] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[36] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[35] = hc_bytealign_S (    0, w[ 0], offset);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_bytealign_S (w[26], w[27], offset);
      w[62] = hc_bytealign_S (w[25], w[26], offset);
      w[61] = hc_bytealign_S (w[24], w[25], offset);
      w[60] = hc_bytealign_S (w[23], w[24], offset);
      w[59] = hc_bytealign_S (w[22], w[23], offset);
      w[58] = hc_bytealign_S (w[21], w[22], offset);
      w[57] = hc_bytealign_S (w[20], w[21], offset);
      w[56] = hc_bytealign_S (w[19], w[20], offset);
      w[55] = hc_bytealign_S (w[18], w[19], offset);
      w[54] = hc_bytealign_S (w[17], w[18], offset);
      w[53] = hc_bytealign_S (w[16], w[17], offset);
      w[52] = hc_bytealign_S (w[15], w[16], offset);
      w[51] = hc_bytealign_S (w[14], w[15], offset);
      w[50] = hc_bytealign_S (w[13], w[14], offset);
      w[49] = hc_bytealign_S (w[12], w[13], offset);
      w[48] = hc_bytealign_S (w[11], w[12], offset);
      w[47] = hc_bytealign_S (w[10], w[11], offset);
      w[46] = hc_bytealign_S (w[ 9], w[10], offset);
      w[45] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[44] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[43] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[42] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[41] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[40] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[39] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[38] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[37] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[36] = hc_bytealign_S (    0, w[ 0], offset);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_bytealign_S (w[25], w[26], offset);
      w[62] = hc_bytealign_S (w[24], w[25], offset);
      w[61] = hc_bytealign_S (w[23], w[24], offset);
      w[60] = hc_bytealign_S (w[22], w[23], offset);
      w[59] = hc_bytealign_S (w[21], w[22], offset);
      w[58] = hc_bytealign_S (w[20], w[21], offset);
      w[57] = hc_bytealign_S (w[19], w[20], offset);
      w[56] = hc_bytealign_S (w[18], w[19], offset);
      w[55] = hc_bytealign_S (w[17], w[18], offset);
      w[54] = hc_bytealign_S (w[16], w[17], offset);
      w[53] = hc_bytealign_S (w[15], w[16], offset);
      w[52] = hc_bytealign_S (w[14], w[15], offset);
      w[51] = hc_bytealign_S (w[13], w[14], offset);
      w[50] = hc_bytealign_S (w[12], w[13], offset);
      w[49] = hc_bytealign_S (w[11], w[12], offset);
      w[48] = hc_bytealign_S (w[10], w[11], offset);
      w[47] = hc_bytealign_S (w[ 9], w[10], offset);
      w[46] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[45] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[44] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[43] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[42] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[41] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[40] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[39] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[38] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[37] = hc_bytealign_S (    0, w[ 0], offset);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_bytealign_S (w[24], w[25], offset);
      w[62] = hc_bytealign_S (w[23], w[24], offset);
      w[61] = hc_bytealign_S (w[22], w[23], offset);
      w[60] = hc_bytealign_S (w[21], w[22], offset);
      w[59] = hc_bytealign_S (w[20], w[21], offset);
      w[58] = hc_bytealign_S (w[19], w[20], offset);
      w[57] = hc_bytealign_S (w[18], w[19], offset);
      w[56] = hc_bytealign_S (w[17], w[18], offset);
      w[55] = hc_bytealign_S (w[16], w[17], offset);
      w[54] = hc_bytealign_S (w[15], w[16], offset);
      w[53] = hc_bytealign_S (w[14], w[15], offset);
      w[52] = hc_bytealign_S (w[13], w[14], offset);
      w[51] = hc_bytealign_S (w[12], w[13], offset);
      w[50] = hc_bytealign_S (w[11], w[12], offset);
      w[49] = hc_bytealign_S (w[10], w[11], offset);
      w[48] = hc_bytealign_S (w[ 9], w[10], offset);
      w[47] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[46] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[45] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[44] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[43] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[42] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[41] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[40] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[39] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[38] = hc_bytealign_S (    0, w[ 0], offset);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_bytealign_S (w[23], w[24], offset);
      w[62] = hc_bytealign_S (w[22], w[23], offset);
      w[61] = hc_bytealign_S (w[21], w[22], offset);
      w[60] = hc_bytealign_S (w[20], w[21], offset);
      w[59] = hc_bytealign_S (w[19], w[20], offset);
      w[58] = hc_bytealign_S (w[18], w[19], offset);
      w[57] = hc_bytealign_S (w[17], w[18], offset);
      w[56] = hc_bytealign_S (w[16], w[17], offset);
      w[55] = hc_bytealign_S (w[15], w[16], offset);
      w[54] = hc_bytealign_S (w[14], w[15], offset);
      w[53] = hc_bytealign_S (w[13], w[14], offset);
      w[52] = hc_bytealign_S (w[12], w[13], offset);
      w[51] = hc_bytealign_S (w[11], w[12], offset);
      w[50] = hc_bytealign_S (w[10], w[11], offset);
      w[49] = hc_bytealign_S (w[ 9], w[10], offset);
      w[48] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[47] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[46] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[45] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[44] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[43] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[42] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[41] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[40] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[39] = hc_bytealign_S (    0, w[ 0], offset);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_bytealign_S (w[22], w[23], offset);
      w[62] = hc_bytealign_S (w[21], w[22], offset);
      w[61] = hc_bytealign_S (w[20], w[21], offset);
      w[60] = hc_bytealign_S (w[19], w[20], offset);
      w[59] = hc_bytealign_S (w[18], w[19], offset);
      w[58] = hc_bytealign_S (w[17], w[18], offset);
      w[57] = hc_bytealign_S (w[16], w[17], offset);
      w[56] = hc_bytealign_S (w[15], w[16], offset);
      w[55] = hc_bytealign_S (w[14], w[15], offset);
      w[54] = hc_bytealign_S (w[13], w[14], offset);
      w[53] = hc_bytealign_S (w[12], w[13], offset);
      w[52] = hc_bytealign_S (w[11], w[12], offset);
      w[51] = hc_bytealign_S (w[10], w[11], offset);
      w[50] = hc_bytealign_S (w[ 9], w[10], offset);
      w[49] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[48] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[47] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[46] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[45] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[44] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[43] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[42] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[41] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[40] = hc_bytealign_S (    0, w[ 0], offset);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_bytealign_S (w[21], w[22], offset);
      w[62] = hc_bytealign_S (w[20], w[21], offset);
      w[61] = hc_bytealign_S (w[19], w[20], offset);
      w[60] = hc_bytealign_S (w[18], w[19], offset);
      w[59] = hc_bytealign_S (w[17], w[18], offset);
      w[58] = hc_bytealign_S (w[16], w[17], offset);
      w[57] = hc_bytealign_S (w[15], w[16], offset);
      w[56] = hc_bytealign_S (w[14], w[15], offset);
      w[55] = hc_bytealign_S (w[13], w[14], offset);
      w[54] = hc_bytealign_S (w[12], w[13], offset);
      w[53] = hc_bytealign_S (w[11], w[12], offset);
      w[52] = hc_bytealign_S (w[10], w[11], offset);
      w[51] = hc_bytealign_S (w[ 9], w[10], offset);
      w[50] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[49] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[48] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[47] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[46] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[45] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[44] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[43] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[42] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[41] = hc_bytealign_S (    0, w[ 0], offset);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_bytealign_S (w[20], w[21], offset);
      w[62] = hc_bytealign_S (w[19], w[20], offset);
      w[61] = hc_bytealign_S (w[18], w[19], offset);
      w[60] = hc_bytealign_S (w[17], w[18], offset);
      w[59] = hc_bytealign_S (w[16], w[17], offset);
      w[58] = hc_bytealign_S (w[15], w[16], offset);
      w[57] = hc_bytealign_S (w[14], w[15], offset);
      w[56] = hc_bytealign_S (w[13], w[14], offset);
      w[55] = hc_bytealign_S (w[12], w[13], offset);
      w[54] = hc_bytealign_S (w[11], w[12], offset);
      w[53] = hc_bytealign_S (w[10], w[11], offset);
      w[52] = hc_bytealign_S (w[ 9], w[10], offset);
      w[51] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[50] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[49] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[48] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[47] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[46] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[45] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[44] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[43] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[42] = hc_bytealign_S (    0, w[ 0], offset);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_bytealign_S (w[19], w[20], offset);
      w[62] = hc_bytealign_S (w[18], w[19], offset);
      w[61] = hc_bytealign_S (w[17], w[18], offset);
      w[60] = hc_bytealign_S (w[16], w[17], offset);
      w[59] = hc_bytealign_S (w[15], w[16], offset);
      w[58] = hc_bytealign_S (w[14], w[15], offset);
      w[57] = hc_bytealign_S (w[13], w[14], offset);
      w[56] = hc_bytealign_S (w[12], w[13], offset);
      w[55] = hc_bytealign_S (w[11], w[12], offset);
      w[54] = hc_bytealign_S (w[10], w[11], offset);
      w[53] = hc_bytealign_S (w[ 9], w[10], offset);
      w[52] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[51] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[50] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[49] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[48] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[47] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[46] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[45] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[44] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[43] = hc_bytealign_S (    0, w[ 0], offset);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_bytealign_S (w[18], w[19], offset);
      w[62] = hc_bytealign_S (w[17], w[18], offset);
      w[61] = hc_bytealign_S (w[16], w[17], offset);
      w[60] = hc_bytealign_S (w[15], w[16], offset);
      w[59] = hc_bytealign_S (w[14], w[15], offset);
      w[58] = hc_bytealign_S (w[13], w[14], offset);
      w[57] = hc_bytealign_S (w[12], w[13], offset);
      w[56] = hc_bytealign_S (w[11], w[12], offset);
      w[55] = hc_bytealign_S (w[10], w[11], offset);
      w[54] = hc_bytealign_S (w[ 9], w[10], offset);
      w[53] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[52] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[51] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[50] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[49] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[48] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[47] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[46] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[45] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[44] = hc_bytealign_S (    0, w[ 0], offset);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_bytealign_S (w[17], w[18], offset);
      w[62] = hc_bytealign_S (w[16], w[17], offset);
      w[61] = hc_bytealign_S (w[15], w[16], offset);
      w[60] = hc_bytealign_S (w[14], w[15], offset);
      w[59] = hc_bytealign_S (w[13], w[14], offset);
      w[58] = hc_bytealign_S (w[12], w[13], offset);
      w[57] = hc_bytealign_S (w[11], w[12], offset);
      w[56] = hc_bytealign_S (w[10], w[11], offset);
      w[55] = hc_bytealign_S (w[ 9], w[10], offset);
      w[54] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[53] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[52] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[51] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[50] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[49] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[48] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[47] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[46] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[45] = hc_bytealign_S (    0, w[ 0], offset);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_bytealign_S (w[16], w[17], offset);
      w[62] = hc_bytealign_S (w[15], w[16], offset);
      w[61] = hc_bytealign_S (w[14], w[15], offset);
      w[60] = hc_bytealign_S (w[13], w[14], offset);
      w[59] = hc_bytealign_S (w[12], w[13], offset);
      w[58] = hc_bytealign_S (w[11], w[12], offset);
      w[57] = hc_bytealign_S (w[10], w[11], offset);
      w[56] = hc_bytealign_S (w[ 9], w[10], offset);
      w[55] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[54] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[53] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[52] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[51] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[50] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[49] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[48] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[47] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[46] = hc_bytealign_S (    0, w[ 0], offset);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_bytealign_S (w[15], w[16], offset);
      w[62] = hc_bytealign_S (w[14], w[15], offset);
      w[61] = hc_bytealign_S (w[13], w[14], offset);
      w[60] = hc_bytealign_S (w[12], w[13], offset);
      w[59] = hc_bytealign_S (w[11], w[12], offset);
      w[58] = hc_bytealign_S (w[10], w[11], offset);
      w[57] = hc_bytealign_S (w[ 9], w[10], offset);
      w[56] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[55] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[54] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[53] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[52] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[51] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[50] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[49] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[48] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[47] = hc_bytealign_S (    0, w[ 0], offset);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_bytealign_S (w[14], w[15], offset);
      w[62] = hc_bytealign_S (w[13], w[14], offset);
      w[61] = hc_bytealign_S (w[12], w[13], offset);
      w[60] = hc_bytealign_S (w[11], w[12], offset);
      w[59] = hc_bytealign_S (w[10], w[11], offset);
      w[58] = hc_bytealign_S (w[ 9], w[10], offset);
      w[57] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[56] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[55] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[54] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[53] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[52] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[51] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[50] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[49] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[48] = hc_bytealign_S (    0, w[ 0], offset);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_bytealign_S (w[13], w[14], offset);
      w[62] = hc_bytealign_S (w[12], w[13], offset);
      w[61] = hc_bytealign_S (w[11], w[12], offset);
      w[60] = hc_bytealign_S (w[10], w[11], offset);
      w[59] = hc_bytealign_S (w[ 9], w[10], offset);
      w[58] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[57] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[56] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[55] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[54] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[53] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[52] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[51] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[50] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[49] = hc_bytealign_S (    0, w[ 0], offset);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_bytealign_S (w[12], w[13], offset);
      w[62] = hc_bytealign_S (w[11], w[12], offset);
      w[61] = hc_bytealign_S (w[10], w[11], offset);
      w[60] = hc_bytealign_S (w[ 9], w[10], offset);
      w[59] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[58] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[57] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[56] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[55] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[54] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[53] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[52] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[51] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[50] = hc_bytealign_S (    0, w[ 0], offset);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_bytealign_S (w[11], w[12], offset);
      w[62] = hc_bytealign_S (w[10], w[11], offset);
      w[61] = hc_bytealign_S (w[ 9], w[10], offset);
      w[60] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[59] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[58] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[57] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[56] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[55] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[54] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[53] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[52] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[51] = hc_bytealign_S (    0, w[ 0], offset);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_bytealign_S (w[10], w[11], offset);
      w[62] = hc_bytealign_S (w[ 9], w[10], offset);
      w[61] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[60] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[59] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[58] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[57] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[56] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[55] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[54] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[53] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[52] = hc_bytealign_S (    0, w[ 0], offset);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_bytealign_S (w[ 9], w[10], offset);
      w[62] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[61] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[60] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[59] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[58] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[57] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[56] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[55] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[54] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[53] = hc_bytealign_S (    0, w[ 0], offset);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_bytealign_S (w[ 8], w[ 9], offset);
      w[62] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[61] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[60] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[59] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[58] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[57] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[56] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[55] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[54] = hc_bytealign_S (    0, w[ 0], offset);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_bytealign_S (w[ 7], w[ 8], offset);
      w[62] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[61] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[60] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[59] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[58] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[57] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[56] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[55] = hc_bytealign_S (    0, w[ 0], offset);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_bytealign_S (w[ 6], w[ 7], offset);
      w[62] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[61] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[60] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[59] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[58] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[57] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[56] = hc_bytealign_S (    0, w[ 0], offset);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_bytealign_S (w[ 5], w[ 6], offset);
      w[62] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[61] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[60] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[59] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[58] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[57] = hc_bytealign_S (    0, w[ 0], offset);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_bytealign_S (w[ 4], w[ 5], offset);
      w[62] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[61] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[60] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[59] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[58] = hc_bytealign_S (    0, w[ 0], offset);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_bytealign_S (w[ 3], w[ 4], offset);
      w[62] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[61] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[60] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[59] = hc_bytealign_S (    0, w[ 0], offset);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_bytealign_S (w[ 2], w[ 3], offset);
      w[62] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[61] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[60] = hc_bytealign_S (    0, w[ 0], offset);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_bytealign_S (w[ 1], w[ 2], offset);
      w[62] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[61] = hc_bytealign_S (    0, w[ 0], offset);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_bytealign_S (w[ 0], w[ 1], offset);
      w[62] = hc_bytealign_S (    0, w[ 0], offset);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_bytealign_S (    0, w[ 0], offset);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case  0:
      w[63] = hc_byte_perm_S (w[63], w[62], selector);
      w[62] = hc_byte_perm_S (w[62], w[61], selector);
      w[61] = hc_byte_perm_S (w[61], w[60], selector);
      w[60] = hc_byte_perm_S (w[60], w[59], selector);
      w[59] = hc_byte_perm_S (w[59], w[58], selector);
      w[58] = hc_byte_perm_S (w[58], w[57], selector);
      w[57] = hc_byte_perm_S (w[57], w[56], selector);
      w[56] = hc_byte_perm_S (w[56], w[55], selector);
      w[55] = hc_byte_perm_S (w[55], w[54], selector);
      w[54] = hc_byte_perm_S (w[54], w[53], selector);
      w[53] = hc_byte_perm_S (w[53], w[52], selector);
      w[52] = hc_byte_perm_S (w[52], w[51], selector);
      w[51] = hc_byte_perm_S (w[51], w[50], selector);
      w[50] = hc_byte_perm_S (w[50], w[49], selector);
      w[49] = hc_byte_perm_S (w[49], w[48], selector);
      w[48] = hc_byte_perm_S (w[48], w[47], selector);
      w[47] = hc_byte_perm_S (w[47], w[46], selector);
      w[46] = hc_byte_perm_S (w[46], w[45], selector);
      w[45] = hc_byte_perm_S (w[45], w[44], selector);
      w[44] = hc_byte_perm_S (w[44], w[43], selector);
      w[43] = hc_byte_perm_S (w[43], w[42], selector);
      w[42] = hc_byte_perm_S (w[42], w[41], selector);
      w[41] = hc_byte_perm_S (w[41], w[40], selector);
      w[40] = hc_byte_perm_S (w[40], w[39], selector);
      w[39] = hc_byte_perm_S (w[39], w[38], selector);
      w[38] = hc_byte_perm_S (w[38], w[37], selector);
      w[37] = hc_byte_perm_S (w[37], w[36], selector);
      w[36] = hc_byte_perm_S (w[36], w[35], selector);
      w[35] = hc_byte_perm_S (w[35], w[34], selector);
      w[34] = hc_byte_perm_S (w[34], w[33], selector);
      w[33] = hc_byte_perm_S (w[33], w[32], selector);
      w[32] = hc_byte_perm_S (w[32], w[31], selector);
      w[31] = hc_byte_perm_S (w[31], w[30], selector);
      w[30] = hc_byte_perm_S (w[30], w[29], selector);
      w[29] = hc_byte_perm_S (w[29], w[28], selector);
      w[28] = hc_byte_perm_S (w[28], w[27], selector);
      w[27] = hc_byte_perm_S (w[27], w[26], selector);
      w[26] = hc_byte_perm_S (w[26], w[25], selector);
      w[25] = hc_byte_perm_S (w[25], w[24], selector);
      w[24] = hc_byte_perm_S (w[24], w[23], selector);
      w[23] = hc_byte_perm_S (w[23], w[22], selector);
      w[22] = hc_byte_perm_S (w[22], w[21], selector);
      w[21] = hc_byte_perm_S (w[21], w[20], selector);
      w[20] = hc_byte_perm_S (w[20], w[19], selector);
      w[19] = hc_byte_perm_S (w[19], w[18], selector);
      w[18] = hc_byte_perm_S (w[18], w[17], selector);
      w[17] = hc_byte_perm_S (w[17], w[16], selector);
      w[16] = hc_byte_perm_S (w[16], w[15], selector);
      w[15] = hc_byte_perm_S (w[15], w[14], selector);
      w[14] = hc_byte_perm_S (w[14], w[13], selector);
      w[13] = hc_byte_perm_S (w[13], w[12], selector);
      w[12] = hc_byte_perm_S (w[12], w[11], selector);
      w[11] = hc_byte_perm_S (w[11], w[10], selector);
      w[10] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[ 9] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[ 8] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[ 7] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[ 6] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[ 5] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[ 4] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[ 3] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 2] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 1] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 0] = hc_byte_perm_S (w[ 0],     0, selector);

      break;

    case  1:
      w[63] = hc_byte_perm_S (w[62], w[61], selector);
      w[62] = hc_byte_perm_S (w[61], w[60], selector);
      w[61] = hc_byte_perm_S (w[60], w[59], selector);
      w[60] = hc_byte_perm_S (w[59], w[58], selector);
      w[59] = hc_byte_perm_S (w[58], w[57], selector);
      w[58] = hc_byte_perm_S (w[57], w[56], selector);
      w[57] = hc_byte_perm_S (w[56], w[55], selector);
      w[56] = hc_byte_perm_S (w[55], w[54], selector);
      w[55] = hc_byte_perm_S (w[54], w[53], selector);
      w[54] = hc_byte_perm_S (w[53], w[52], selector);
      w[53] = hc_byte_perm_S (w[52], w[51], selector);
      w[52] = hc_byte_perm_S (w[51], w[50], selector);
      w[51] = hc_byte_perm_S (w[50], w[49], selector);
      w[50] = hc_byte_perm_S (w[49], w[48], selector);
      w[49] = hc_byte_perm_S (w[48], w[47], selector);
      w[48] = hc_byte_perm_S (w[47], w[46], selector);
      w[47] = hc_byte_perm_S (w[46], w[45], selector);
      w[46] = hc_byte_perm_S (w[45], w[44], selector);
      w[45] = hc_byte_perm_S (w[44], w[43], selector);
      w[44] = hc_byte_perm_S (w[43], w[42], selector);
      w[43] = hc_byte_perm_S (w[42], w[41], selector);
      w[42] = hc_byte_perm_S (w[41], w[40], selector);
      w[41] = hc_byte_perm_S (w[40], w[39], selector);
      w[40] = hc_byte_perm_S (w[39], w[38], selector);
      w[39] = hc_byte_perm_S (w[38], w[37], selector);
      w[38] = hc_byte_perm_S (w[37], w[36], selector);
      w[37] = hc_byte_perm_S (w[36], w[35], selector);
      w[36] = hc_byte_perm_S (w[35], w[34], selector);
      w[35] = hc_byte_perm_S (w[34], w[33], selector);
      w[34] = hc_byte_perm_S (w[33], w[32], selector);
      w[33] = hc_byte_perm_S (w[32], w[31], selector);
      w[32] = hc_byte_perm_S (w[31], w[30], selector);
      w[31] = hc_byte_perm_S (w[30], w[29], selector);
      w[30] = hc_byte_perm_S (w[29], w[28], selector);
      w[29] = hc_byte_perm_S (w[28], w[27], selector);
      w[28] = hc_byte_perm_S (w[27], w[26], selector);
      w[27] = hc_byte_perm_S (w[26], w[25], selector);
      w[26] = hc_byte_perm_S (w[25], w[24], selector);
      w[25] = hc_byte_perm_S (w[24], w[23], selector);
      w[24] = hc_byte_perm_S (w[23], w[22], selector);
      w[23] = hc_byte_perm_S (w[22], w[21], selector);
      w[22] = hc_byte_perm_S (w[21], w[20], selector);
      w[21] = hc_byte_perm_S (w[20], w[19], selector);
      w[20] = hc_byte_perm_S (w[19], w[18], selector);
      w[19] = hc_byte_perm_S (w[18], w[17], selector);
      w[18] = hc_byte_perm_S (w[17], w[16], selector);
      w[17] = hc_byte_perm_S (w[16], w[15], selector);
      w[16] = hc_byte_perm_S (w[15], w[14], selector);
      w[15] = hc_byte_perm_S (w[14], w[13], selector);
      w[14] = hc_byte_perm_S (w[13], w[12], selector);
      w[13] = hc_byte_perm_S (w[12], w[11], selector);
      w[12] = hc_byte_perm_S (w[11], w[10], selector);
      w[11] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[10] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[ 9] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[ 8] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[ 7] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[ 6] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[ 5] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[ 4] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 3] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 2] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 1] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_byte_perm_S (w[61], w[60], selector);
      w[62] = hc_byte_perm_S (w[60], w[59], selector);
      w[61] = hc_byte_perm_S (w[59], w[58], selector);
      w[60] = hc_byte_perm_S (w[58], w[57], selector);
      w[59] = hc_byte_perm_S (w[57], w[56], selector);
      w[58] = hc_byte_perm_S (w[56], w[55], selector);
      w[57] = hc_byte_perm_S (w[55], w[54], selector);
      w[56] = hc_byte_perm_S (w[54], w[53], selector);
      w[55] = hc_byte_perm_S (w[53], w[52], selector);
      w[54] = hc_byte_perm_S (w[52], w[51], selector);
      w[53] = hc_byte_perm_S (w[51], w[50], selector);
      w[52] = hc_byte_perm_S (w[50], w[49], selector);
      w[51] = hc_byte_perm_S (w[49], w[48], selector);
      w[50] = hc_byte_perm_S (w[48], w[47], selector);
      w[49] = hc_byte_perm_S (w[47], w[46], selector);
      w[48] = hc_byte_perm_S (w[46], w[45], selector);
      w[47] = hc_byte_perm_S (w[45], w[44], selector);
      w[46] = hc_byte_perm_S (w[44], w[43], selector);
      w[45] = hc_byte_perm_S (w[43], w[42], selector);
      w[44] = hc_byte_perm_S (w[42], w[41], selector);
      w[43] = hc_byte_perm_S (w[41], w[40], selector);
      w[42] = hc_byte_perm_S (w[40], w[39], selector);
      w[41] = hc_byte_perm_S (w[39], w[38], selector);
      w[40] = hc_byte_perm_S (w[38], w[37], selector);
      w[39] = hc_byte_perm_S (w[37], w[36], selector);
      w[38] = hc_byte_perm_S (w[36], w[35], selector);
      w[37] = hc_byte_perm_S (w[35], w[34], selector);
      w[36] = hc_byte_perm_S (w[34], w[33], selector);
      w[35] = hc_byte_perm_S (w[33], w[32], selector);
      w[34] = hc_byte_perm_S (w[32], w[31], selector);
      w[33] = hc_byte_perm_S (w[31], w[30], selector);
      w[32] = hc_byte_perm_S (w[30], w[29], selector);
      w[31] = hc_byte_perm_S (w[29], w[28], selector);
      w[30] = hc_byte_perm_S (w[28], w[27], selector);
      w[29] = hc_byte_perm_S (w[27], w[26], selector);
      w[28] = hc_byte_perm_S (w[26], w[25], selector);
      w[27] = hc_byte_perm_S (w[25], w[24], selector);
      w[26] = hc_byte_perm_S (w[24], w[23], selector);
      w[25] = hc_byte_perm_S (w[23], w[22], selector);
      w[24] = hc_byte_perm_S (w[22], w[21], selector);
      w[23] = hc_byte_perm_S (w[21], w[20], selector);
      w[22] = hc_byte_perm_S (w[20], w[19], selector);
      w[21] = hc_byte_perm_S (w[19], w[18], selector);
      w[20] = hc_byte_perm_S (w[18], w[17], selector);
      w[19] = hc_byte_perm_S (w[17], w[16], selector);
      w[18] = hc_byte_perm_S (w[16], w[15], selector);
      w[17] = hc_byte_perm_S (w[15], w[14], selector);
      w[16] = hc_byte_perm_S (w[14], w[13], selector);
      w[15] = hc_byte_perm_S (w[13], w[12], selector);
      w[14] = hc_byte_perm_S (w[12], w[11], selector);
      w[13] = hc_byte_perm_S (w[11], w[10], selector);
      w[12] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[11] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[10] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[ 9] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[ 8] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[ 7] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[ 6] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[ 5] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 4] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 3] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 2] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_byte_perm_S (w[60], w[59], selector);
      w[62] = hc_byte_perm_S (w[59], w[58], selector);
      w[61] = hc_byte_perm_S (w[58], w[57], selector);
      w[60] = hc_byte_perm_S (w[57], w[56], selector);
      w[59] = hc_byte_perm_S (w[56], w[55], selector);
      w[58] = hc_byte_perm_S (w[55], w[54], selector);
      w[57] = hc_byte_perm_S (w[54], w[53], selector);
      w[56] = hc_byte_perm_S (w[53], w[52], selector);
      w[55] = hc_byte_perm_S (w[52], w[51], selector);
      w[54] = hc_byte_perm_S (w[51], w[50], selector);
      w[53] = hc_byte_perm_S (w[50], w[49], selector);
      w[52] = hc_byte_perm_S (w[49], w[48], selector);
      w[51] = hc_byte_perm_S (w[48], w[47], selector);
      w[50] = hc_byte_perm_S (w[47], w[46], selector);
      w[49] = hc_byte_perm_S (w[46], w[45], selector);
      w[48] = hc_byte_perm_S (w[45], w[44], selector);
      w[47] = hc_byte_perm_S (w[44], w[43], selector);
      w[46] = hc_byte_perm_S (w[43], w[42], selector);
      w[45] = hc_byte_perm_S (w[42], w[41], selector);
      w[44] = hc_byte_perm_S (w[41], w[40], selector);
      w[43] = hc_byte_perm_S (w[40], w[39], selector);
      w[42] = hc_byte_perm_S (w[39], w[38], selector);
      w[41] = hc_byte_perm_S (w[38], w[37], selector);
      w[40] = hc_byte_perm_S (w[37], w[36], selector);
      w[39] = hc_byte_perm_S (w[36], w[35], selector);
      w[38] = hc_byte_perm_S (w[35], w[34], selector);
      w[37] = hc_byte_perm_S (w[34], w[33], selector);
      w[36] = hc_byte_perm_S (w[33], w[32], selector);
      w[35] = hc_byte_perm_S (w[32], w[31], selector);
      w[34] = hc_byte_perm_S (w[31], w[30], selector);
      w[33] = hc_byte_perm_S (w[30], w[29], selector);
      w[32] = hc_byte_perm_S (w[29], w[28], selector);
      w[31] = hc_byte_perm_S (w[28], w[27], selector);
      w[30] = hc_byte_perm_S (w[27], w[26], selector);
      w[29] = hc_byte_perm_S (w[26], w[25], selector);
      w[28] = hc_byte_perm_S (w[25], w[24], selector);
      w[27] = hc_byte_perm_S (w[24], w[23], selector);
      w[26] = hc_byte_perm_S (w[23], w[22], selector);
      w[25] = hc_byte_perm_S (w[22], w[21], selector);
      w[24] = hc_byte_perm_S (w[21], w[20], selector);
      w[23] = hc_byte_perm_S (w[20], w[19], selector);
      w[22] = hc_byte_perm_S (w[19], w[18], selector);
      w[21] = hc_byte_perm_S (w[18], w[17], selector);
      w[20] = hc_byte_perm_S (w[17], w[16], selector);
      w[19] = hc_byte_perm_S (w[16], w[15], selector);
      w[18] = hc_byte_perm_S (w[15], w[14], selector);
      w[17] = hc_byte_perm_S (w[14], w[13], selector);
      w[16] = hc_byte_perm_S (w[13], w[12], selector);
      w[15] = hc_byte_perm_S (w[12], w[11], selector);
      w[14] = hc_byte_perm_S (w[11], w[10], selector);
      w[13] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[12] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[11] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[10] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[ 9] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[ 8] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[ 7] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[ 6] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 5] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 4] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 3] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_byte_perm_S (w[59], w[58], selector);
      w[62] = hc_byte_perm_S (w[58], w[57], selector);
      w[61] = hc_byte_perm_S (w[57], w[56], selector);
      w[60] = hc_byte_perm_S (w[56], w[55], selector);
      w[59] = hc_byte_perm_S (w[55], w[54], selector);
      w[58] = hc_byte_perm_S (w[54], w[53], selector);
      w[57] = hc_byte_perm_S (w[53], w[52], selector);
      w[56] = hc_byte_perm_S (w[52], w[51], selector);
      w[55] = hc_byte_perm_S (w[51], w[50], selector);
      w[54] = hc_byte_perm_S (w[50], w[49], selector);
      w[53] = hc_byte_perm_S (w[49], w[48], selector);
      w[52] = hc_byte_perm_S (w[48], w[47], selector);
      w[51] = hc_byte_perm_S (w[47], w[46], selector);
      w[50] = hc_byte_perm_S (w[46], w[45], selector);
      w[49] = hc_byte_perm_S (w[45], w[44], selector);
      w[48] = hc_byte_perm_S (w[44], w[43], selector);
      w[47] = hc_byte_perm_S (w[43], w[42], selector);
      w[46] = hc_byte_perm_S (w[42], w[41], selector);
      w[45] = hc_byte_perm_S (w[41], w[40], selector);
      w[44] = hc_byte_perm_S (w[40], w[39], selector);
      w[43] = hc_byte_perm_S (w[39], w[38], selector);
      w[42] = hc_byte_perm_S (w[38], w[37], selector);
      w[41] = hc_byte_perm_S (w[37], w[36], selector);
      w[40] = hc_byte_perm_S (w[36], w[35], selector);
      w[39] = hc_byte_perm_S (w[35], w[34], selector);
      w[38] = hc_byte_perm_S (w[34], w[33], selector);
      w[37] = hc_byte_perm_S (w[33], w[32], selector);
      w[36] = hc_byte_perm_S (w[32], w[31], selector);
      w[35] = hc_byte_perm_S (w[31], w[30], selector);
      w[34] = hc_byte_perm_S (w[30], w[29], selector);
      w[33] = hc_byte_perm_S (w[29], w[28], selector);
      w[32] = hc_byte_perm_S (w[28], w[27], selector);
      w[31] = hc_byte_perm_S (w[27], w[26], selector);
      w[30] = hc_byte_perm_S (w[26], w[25], selector);
      w[29] = hc_byte_perm_S (w[25], w[24], selector);
      w[28] = hc_byte_perm_S (w[24], w[23], selector);
      w[27] = hc_byte_perm_S (w[23], w[22], selector);
      w[26] = hc_byte_perm_S (w[22], w[21], selector);
      w[25] = hc_byte_perm_S (w[21], w[20], selector);
      w[24] = hc_byte_perm_S (w[20], w[19], selector);
      w[23] = hc_byte_perm_S (w[19], w[18], selector);
      w[22] = hc_byte_perm_S (w[18], w[17], selector);
      w[21] = hc_byte_perm_S (w[17], w[16], selector);
      w[20] = hc_byte_perm_S (w[16], w[15], selector);
      w[19] = hc_byte_perm_S (w[15], w[14], selector);
      w[18] = hc_byte_perm_S (w[14], w[13], selector);
      w[17] = hc_byte_perm_S (w[13], w[12], selector);
      w[16] = hc_byte_perm_S (w[12], w[11], selector);
      w[15] = hc_byte_perm_S (w[11], w[10], selector);
      w[14] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[13] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[12] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[11] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[10] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[ 9] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[ 8] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[ 7] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 6] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 5] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 4] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_byte_perm_S (w[58], w[57], selector);
      w[62] = hc_byte_perm_S (w[57], w[56], selector);
      w[61] = hc_byte_perm_S (w[56], w[55], selector);
      w[60] = hc_byte_perm_S (w[55], w[54], selector);
      w[59] = hc_byte_perm_S (w[54], w[53], selector);
      w[58] = hc_byte_perm_S (w[53], w[52], selector);
      w[57] = hc_byte_perm_S (w[52], w[51], selector);
      w[56] = hc_byte_perm_S (w[51], w[50], selector);
      w[55] = hc_byte_perm_S (w[50], w[49], selector);
      w[54] = hc_byte_perm_S (w[49], w[48], selector);
      w[53] = hc_byte_perm_S (w[48], w[47], selector);
      w[52] = hc_byte_perm_S (w[47], w[46], selector);
      w[51] = hc_byte_perm_S (w[46], w[45], selector);
      w[50] = hc_byte_perm_S (w[45], w[44], selector);
      w[49] = hc_byte_perm_S (w[44], w[43], selector);
      w[48] = hc_byte_perm_S (w[43], w[42], selector);
      w[47] = hc_byte_perm_S (w[42], w[41], selector);
      w[46] = hc_byte_perm_S (w[41], w[40], selector);
      w[45] = hc_byte_perm_S (w[40], w[39], selector);
      w[44] = hc_byte_perm_S (w[39], w[38], selector);
      w[43] = hc_byte_perm_S (w[38], w[37], selector);
      w[42] = hc_byte_perm_S (w[37], w[36], selector);
      w[41] = hc_byte_perm_S (w[36], w[35], selector);
      w[40] = hc_byte_perm_S (w[35], w[34], selector);
      w[39] = hc_byte_perm_S (w[34], w[33], selector);
      w[38] = hc_byte_perm_S (w[33], w[32], selector);
      w[37] = hc_byte_perm_S (w[32], w[31], selector);
      w[36] = hc_byte_perm_S (w[31], w[30], selector);
      w[35] = hc_byte_perm_S (w[30], w[29], selector);
      w[34] = hc_byte_perm_S (w[29], w[28], selector);
      w[33] = hc_byte_perm_S (w[28], w[27], selector);
      w[32] = hc_byte_perm_S (w[27], w[26], selector);
      w[31] = hc_byte_perm_S (w[26], w[25], selector);
      w[30] = hc_byte_perm_S (w[25], w[24], selector);
      w[29] = hc_byte_perm_S (w[24], w[23], selector);
      w[28] = hc_byte_perm_S (w[23], w[22], selector);
      w[27] = hc_byte_perm_S (w[22], w[21], selector);
      w[26] = hc_byte_perm_S (w[21], w[20], selector);
      w[25] = hc_byte_perm_S (w[20], w[19], selector);
      w[24] = hc_byte_perm_S (w[19], w[18], selector);
      w[23] = hc_byte_perm_S (w[18], w[17], selector);
      w[22] = hc_byte_perm_S (w[17], w[16], selector);
      w[21] = hc_byte_perm_S (w[16], w[15], selector);
      w[20] = hc_byte_perm_S (w[15], w[14], selector);
      w[19] = hc_byte_perm_S (w[14], w[13], selector);
      w[18] = hc_byte_perm_S (w[13], w[12], selector);
      w[17] = hc_byte_perm_S (w[12], w[11], selector);
      w[16] = hc_byte_perm_S (w[11], w[10], selector);
      w[15] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[14] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[13] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[12] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[11] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[10] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[ 9] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[ 8] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 7] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 6] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 5] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_byte_perm_S (w[57], w[56], selector);
      w[62] = hc_byte_perm_S (w[56], w[55], selector);
      w[61] = hc_byte_perm_S (w[55], w[54], selector);
      w[60] = hc_byte_perm_S (w[54], w[53], selector);
      w[59] = hc_byte_perm_S (w[53], w[52], selector);
      w[58] = hc_byte_perm_S (w[52], w[51], selector);
      w[57] = hc_byte_perm_S (w[51], w[50], selector);
      w[56] = hc_byte_perm_S (w[50], w[49], selector);
      w[55] = hc_byte_perm_S (w[49], w[48], selector);
      w[54] = hc_byte_perm_S (w[48], w[47], selector);
      w[53] = hc_byte_perm_S (w[47], w[46], selector);
      w[52] = hc_byte_perm_S (w[46], w[45], selector);
      w[51] = hc_byte_perm_S (w[45], w[44], selector);
      w[50] = hc_byte_perm_S (w[44], w[43], selector);
      w[49] = hc_byte_perm_S (w[43], w[42], selector);
      w[48] = hc_byte_perm_S (w[42], w[41], selector);
      w[47] = hc_byte_perm_S (w[41], w[40], selector);
      w[46] = hc_byte_perm_S (w[40], w[39], selector);
      w[45] = hc_byte_perm_S (w[39], w[38], selector);
      w[44] = hc_byte_perm_S (w[38], w[37], selector);
      w[43] = hc_byte_perm_S (w[37], w[36], selector);
      w[42] = hc_byte_perm_S (w[36], w[35], selector);
      w[41] = hc_byte_perm_S (w[35], w[34], selector);
      w[40] = hc_byte_perm_S (w[34], w[33], selector);
      w[39] = hc_byte_perm_S (w[33], w[32], selector);
      w[38] = hc_byte_perm_S (w[32], w[31], selector);
      w[37] = hc_byte_perm_S (w[31], w[30], selector);
      w[36] = hc_byte_perm_S (w[30], w[29], selector);
      w[35] = hc_byte_perm_S (w[29], w[28], selector);
      w[34] = hc_byte_perm_S (w[28], w[27], selector);
      w[33] = hc_byte_perm_S (w[27], w[26], selector);
      w[32] = hc_byte_perm_S (w[26], w[25], selector);
      w[31] = hc_byte_perm_S (w[25], w[24], selector);
      w[30] = hc_byte_perm_S (w[24], w[23], selector);
      w[29] = hc_byte_perm_S (w[23], w[22], selector);
      w[28] = hc_byte_perm_S (w[22], w[21], selector);
      w[27] = hc_byte_perm_S (w[21], w[20], selector);
      w[26] = hc_byte_perm_S (w[20], w[19], selector);
      w[25] = hc_byte_perm_S (w[19], w[18], selector);
      w[24] = hc_byte_perm_S (w[18], w[17], selector);
      w[23] = hc_byte_perm_S (w[17], w[16], selector);
      w[22] = hc_byte_perm_S (w[16], w[15], selector);
      w[21] = hc_byte_perm_S (w[15], w[14], selector);
      w[20] = hc_byte_perm_S (w[14], w[13], selector);
      w[19] = hc_byte_perm_S (w[13], w[12], selector);
      w[18] = hc_byte_perm_S (w[12], w[11], selector);
      w[17] = hc_byte_perm_S (w[11], w[10], selector);
      w[16] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[15] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[14] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[13] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[12] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[11] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[10] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[ 9] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 8] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 7] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 6] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_byte_perm_S (w[56], w[55], selector);
      w[62] = hc_byte_perm_S (w[55], w[54], selector);
      w[61] = hc_byte_perm_S (w[54], w[53], selector);
      w[60] = hc_byte_perm_S (w[53], w[52], selector);
      w[59] = hc_byte_perm_S (w[52], w[51], selector);
      w[58] = hc_byte_perm_S (w[51], w[50], selector);
      w[57] = hc_byte_perm_S (w[50], w[49], selector);
      w[56] = hc_byte_perm_S (w[49], w[48], selector);
      w[55] = hc_byte_perm_S (w[48], w[47], selector);
      w[54] = hc_byte_perm_S (w[47], w[46], selector);
      w[53] = hc_byte_perm_S (w[46], w[45], selector);
      w[52] = hc_byte_perm_S (w[45], w[44], selector);
      w[51] = hc_byte_perm_S (w[44], w[43], selector);
      w[50] = hc_byte_perm_S (w[43], w[42], selector);
      w[49] = hc_byte_perm_S (w[42], w[41], selector);
      w[48] = hc_byte_perm_S (w[41], w[40], selector);
      w[47] = hc_byte_perm_S (w[40], w[39], selector);
      w[46] = hc_byte_perm_S (w[39], w[38], selector);
      w[45] = hc_byte_perm_S (w[38], w[37], selector);
      w[44] = hc_byte_perm_S (w[37], w[36], selector);
      w[43] = hc_byte_perm_S (w[36], w[35], selector);
      w[42] = hc_byte_perm_S (w[35], w[34], selector);
      w[41] = hc_byte_perm_S (w[34], w[33], selector);
      w[40] = hc_byte_perm_S (w[33], w[32], selector);
      w[39] = hc_byte_perm_S (w[32], w[31], selector);
      w[38] = hc_byte_perm_S (w[31], w[30], selector);
      w[37] = hc_byte_perm_S (w[30], w[29], selector);
      w[36] = hc_byte_perm_S (w[29], w[28], selector);
      w[35] = hc_byte_perm_S (w[28], w[27], selector);
      w[34] = hc_byte_perm_S (w[27], w[26], selector);
      w[33] = hc_byte_perm_S (w[26], w[25], selector);
      w[32] = hc_byte_perm_S (w[25], w[24], selector);
      w[31] = hc_byte_perm_S (w[24], w[23], selector);
      w[30] = hc_byte_perm_S (w[23], w[22], selector);
      w[29] = hc_byte_perm_S (w[22], w[21], selector);
      w[28] = hc_byte_perm_S (w[21], w[20], selector);
      w[27] = hc_byte_perm_S (w[20], w[19], selector);
      w[26] = hc_byte_perm_S (w[19], w[18], selector);
      w[25] = hc_byte_perm_S (w[18], w[17], selector);
      w[24] = hc_byte_perm_S (w[17], w[16], selector);
      w[23] = hc_byte_perm_S (w[16], w[15], selector);
      w[22] = hc_byte_perm_S (w[15], w[14], selector);
      w[21] = hc_byte_perm_S (w[14], w[13], selector);
      w[20] = hc_byte_perm_S (w[13], w[12], selector);
      w[19] = hc_byte_perm_S (w[12], w[11], selector);
      w[18] = hc_byte_perm_S (w[11], w[10], selector);
      w[17] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[16] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[15] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[14] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[13] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[12] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[11] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[10] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[ 9] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 8] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 7] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_byte_perm_S (w[55], w[54], selector);
      w[62] = hc_byte_perm_S (w[54], w[53], selector);
      w[61] = hc_byte_perm_S (w[53], w[52], selector);
      w[60] = hc_byte_perm_S (w[52], w[51], selector);
      w[59] = hc_byte_perm_S (w[51], w[50], selector);
      w[58] = hc_byte_perm_S (w[50], w[49], selector);
      w[57] = hc_byte_perm_S (w[49], w[48], selector);
      w[56] = hc_byte_perm_S (w[48], w[47], selector);
      w[55] = hc_byte_perm_S (w[47], w[46], selector);
      w[54] = hc_byte_perm_S (w[46], w[45], selector);
      w[53] = hc_byte_perm_S (w[45], w[44], selector);
      w[52] = hc_byte_perm_S (w[44], w[43], selector);
      w[51] = hc_byte_perm_S (w[43], w[42], selector);
      w[50] = hc_byte_perm_S (w[42], w[41], selector);
      w[49] = hc_byte_perm_S (w[41], w[40], selector);
      w[48] = hc_byte_perm_S (w[40], w[39], selector);
      w[47] = hc_byte_perm_S (w[39], w[38], selector);
      w[46] = hc_byte_perm_S (w[38], w[37], selector);
      w[45] = hc_byte_perm_S (w[37], w[36], selector);
      w[44] = hc_byte_perm_S (w[36], w[35], selector);
      w[43] = hc_byte_perm_S (w[35], w[34], selector);
      w[42] = hc_byte_perm_S (w[34], w[33], selector);
      w[41] = hc_byte_perm_S (w[33], w[32], selector);
      w[40] = hc_byte_perm_S (w[32], w[31], selector);
      w[39] = hc_byte_perm_S (w[31], w[30], selector);
      w[38] = hc_byte_perm_S (w[30], w[29], selector);
      w[37] = hc_byte_perm_S (w[29], w[28], selector);
      w[36] = hc_byte_perm_S (w[28], w[27], selector);
      w[35] = hc_byte_perm_S (w[27], w[26], selector);
      w[34] = hc_byte_perm_S (w[26], w[25], selector);
      w[33] = hc_byte_perm_S (w[25], w[24], selector);
      w[32] = hc_byte_perm_S (w[24], w[23], selector);
      w[31] = hc_byte_perm_S (w[23], w[22], selector);
      w[30] = hc_byte_perm_S (w[22], w[21], selector);
      w[29] = hc_byte_perm_S (w[21], w[20], selector);
      w[28] = hc_byte_perm_S (w[20], w[19], selector);
      w[27] = hc_byte_perm_S (w[19], w[18], selector);
      w[26] = hc_byte_perm_S (w[18], w[17], selector);
      w[25] = hc_byte_perm_S (w[17], w[16], selector);
      w[24] = hc_byte_perm_S (w[16], w[15], selector);
      w[23] = hc_byte_perm_S (w[15], w[14], selector);
      w[22] = hc_byte_perm_S (w[14], w[13], selector);
      w[21] = hc_byte_perm_S (w[13], w[12], selector);
      w[20] = hc_byte_perm_S (w[12], w[11], selector);
      w[19] = hc_byte_perm_S (w[11], w[10], selector);
      w[18] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[17] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[16] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[15] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[14] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[13] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[12] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[11] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[10] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[ 9] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 8] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_byte_perm_S (w[54], w[53], selector);
      w[62] = hc_byte_perm_S (w[53], w[52], selector);
      w[61] = hc_byte_perm_S (w[52], w[51], selector);
      w[60] = hc_byte_perm_S (w[51], w[50], selector);
      w[59] = hc_byte_perm_S (w[50], w[49], selector);
      w[58] = hc_byte_perm_S (w[49], w[48], selector);
      w[57] = hc_byte_perm_S (w[48], w[47], selector);
      w[56] = hc_byte_perm_S (w[47], w[46], selector);
      w[55] = hc_byte_perm_S (w[46], w[45], selector);
      w[54] = hc_byte_perm_S (w[45], w[44], selector);
      w[53] = hc_byte_perm_S (w[44], w[43], selector);
      w[52] = hc_byte_perm_S (w[43], w[42], selector);
      w[51] = hc_byte_perm_S (w[42], w[41], selector);
      w[50] = hc_byte_perm_S (w[41], w[40], selector);
      w[49] = hc_byte_perm_S (w[40], w[39], selector);
      w[48] = hc_byte_perm_S (w[39], w[38], selector);
      w[47] = hc_byte_perm_S (w[38], w[37], selector);
      w[46] = hc_byte_perm_S (w[37], w[36], selector);
      w[45] = hc_byte_perm_S (w[36], w[35], selector);
      w[44] = hc_byte_perm_S (w[35], w[34], selector);
      w[43] = hc_byte_perm_S (w[34], w[33], selector);
      w[42] = hc_byte_perm_S (w[33], w[32], selector);
      w[41] = hc_byte_perm_S (w[32], w[31], selector);
      w[40] = hc_byte_perm_S (w[31], w[30], selector);
      w[39] = hc_byte_perm_S (w[30], w[29], selector);
      w[38] = hc_byte_perm_S (w[29], w[28], selector);
      w[37] = hc_byte_perm_S (w[28], w[27], selector);
      w[36] = hc_byte_perm_S (w[27], w[26], selector);
      w[35] = hc_byte_perm_S (w[26], w[25], selector);
      w[34] = hc_byte_perm_S (w[25], w[24], selector);
      w[33] = hc_byte_perm_S (w[24], w[23], selector);
      w[32] = hc_byte_perm_S (w[23], w[22], selector);
      w[31] = hc_byte_perm_S (w[22], w[21], selector);
      w[30] = hc_byte_perm_S (w[21], w[20], selector);
      w[29] = hc_byte_perm_S (w[20], w[19], selector);
      w[28] = hc_byte_perm_S (w[19], w[18], selector);
      w[27] = hc_byte_perm_S (w[18], w[17], selector);
      w[26] = hc_byte_perm_S (w[17], w[16], selector);
      w[25] = hc_byte_perm_S (w[16], w[15], selector);
      w[24] = hc_byte_perm_S (w[15], w[14], selector);
      w[23] = hc_byte_perm_S (w[14], w[13], selector);
      w[22] = hc_byte_perm_S (w[13], w[12], selector);
      w[21] = hc_byte_perm_S (w[12], w[11], selector);
      w[20] = hc_byte_perm_S (w[11], w[10], selector);
      w[19] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[18] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[17] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[16] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[15] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[14] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[13] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[12] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[11] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[10] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[ 9] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_byte_perm_S (w[53], w[52], selector);
      w[62] = hc_byte_perm_S (w[52], w[51], selector);
      w[61] = hc_byte_perm_S (w[51], w[50], selector);
      w[60] = hc_byte_perm_S (w[50], w[49], selector);
      w[59] = hc_byte_perm_S (w[49], w[48], selector);
      w[58] = hc_byte_perm_S (w[48], w[47], selector);
      w[57] = hc_byte_perm_S (w[47], w[46], selector);
      w[56] = hc_byte_perm_S (w[46], w[45], selector);
      w[55] = hc_byte_perm_S (w[45], w[44], selector);
      w[54] = hc_byte_perm_S (w[44], w[43], selector);
      w[53] = hc_byte_perm_S (w[43], w[42], selector);
      w[52] = hc_byte_perm_S (w[42], w[41], selector);
      w[51] = hc_byte_perm_S (w[41], w[40], selector);
      w[50] = hc_byte_perm_S (w[40], w[39], selector);
      w[49] = hc_byte_perm_S (w[39], w[38], selector);
      w[48] = hc_byte_perm_S (w[38], w[37], selector);
      w[47] = hc_byte_perm_S (w[37], w[36], selector);
      w[46] = hc_byte_perm_S (w[36], w[35], selector);
      w[45] = hc_byte_perm_S (w[35], w[34], selector);
      w[44] = hc_byte_perm_S (w[34], w[33], selector);
      w[43] = hc_byte_perm_S (w[33], w[32], selector);
      w[42] = hc_byte_perm_S (w[32], w[31], selector);
      w[41] = hc_byte_perm_S (w[31], w[30], selector);
      w[40] = hc_byte_perm_S (w[30], w[29], selector);
      w[39] = hc_byte_perm_S (w[29], w[28], selector);
      w[38] = hc_byte_perm_S (w[28], w[27], selector);
      w[37] = hc_byte_perm_S (w[27], w[26], selector);
      w[36] = hc_byte_perm_S (w[26], w[25], selector);
      w[35] = hc_byte_perm_S (w[25], w[24], selector);
      w[34] = hc_byte_perm_S (w[24], w[23], selector);
      w[33] = hc_byte_perm_S (w[23], w[22], selector);
      w[32] = hc_byte_perm_S (w[22], w[21], selector);
      w[31] = hc_byte_perm_S (w[21], w[20], selector);
      w[30] = hc_byte_perm_S (w[20], w[19], selector);
      w[29] = hc_byte_perm_S (w[19], w[18], selector);
      w[28] = hc_byte_perm_S (w[18], w[17], selector);
      w[27] = hc_byte_perm_S (w[17], w[16], selector);
      w[26] = hc_byte_perm_S (w[16], w[15], selector);
      w[25] = hc_byte_perm_S (w[15], w[14], selector);
      w[24] = hc_byte_perm_S (w[14], w[13], selector);
      w[23] = hc_byte_perm_S (w[13], w[12], selector);
      w[22] = hc_byte_perm_S (w[12], w[11], selector);
      w[21] = hc_byte_perm_S (w[11], w[10], selector);
      w[20] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[19] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[18] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[17] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[16] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[15] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[14] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[13] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[12] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[11] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[10] = hc_byte_perm_S (w[ 0],     0, selector);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_byte_perm_S (w[52], w[51], selector);
      w[62] = hc_byte_perm_S (w[51], w[50], selector);
      w[61] = hc_byte_perm_S (w[50], w[49], selector);
      w[60] = hc_byte_perm_S (w[49], w[48], selector);
      w[59] = hc_byte_perm_S (w[48], w[47], selector);
      w[58] = hc_byte_perm_S (w[47], w[46], selector);
      w[57] = hc_byte_perm_S (w[46], w[45], selector);
      w[56] = hc_byte_perm_S (w[45], w[44], selector);
      w[55] = hc_byte_perm_S (w[44], w[43], selector);
      w[54] = hc_byte_perm_S (w[43], w[42], selector);
      w[53] = hc_byte_perm_S (w[42], w[41], selector);
      w[52] = hc_byte_perm_S (w[41], w[40], selector);
      w[51] = hc_byte_perm_S (w[40], w[39], selector);
      w[50] = hc_byte_perm_S (w[39], w[38], selector);
      w[49] = hc_byte_perm_S (w[38], w[37], selector);
      w[48] = hc_byte_perm_S (w[37], w[36], selector);
      w[47] = hc_byte_perm_S (w[36], w[35], selector);
      w[46] = hc_byte_perm_S (w[35], w[34], selector);
      w[45] = hc_byte_perm_S (w[34], w[33], selector);
      w[44] = hc_byte_perm_S (w[33], w[32], selector);
      w[43] = hc_byte_perm_S (w[32], w[31], selector);
      w[42] = hc_byte_perm_S (w[31], w[30], selector);
      w[41] = hc_byte_perm_S (w[30], w[29], selector);
      w[40] = hc_byte_perm_S (w[29], w[28], selector);
      w[39] = hc_byte_perm_S (w[28], w[27], selector);
      w[38] = hc_byte_perm_S (w[27], w[26], selector);
      w[37] = hc_byte_perm_S (w[26], w[25], selector);
      w[36] = hc_byte_perm_S (w[25], w[24], selector);
      w[35] = hc_byte_perm_S (w[24], w[23], selector);
      w[34] = hc_byte_perm_S (w[23], w[22], selector);
      w[33] = hc_byte_perm_S (w[22], w[21], selector);
      w[32] = hc_byte_perm_S (w[21], w[20], selector);
      w[31] = hc_byte_perm_S (w[20], w[19], selector);
      w[30] = hc_byte_perm_S (w[19], w[18], selector);
      w[29] = hc_byte_perm_S (w[18], w[17], selector);
      w[28] = hc_byte_perm_S (w[17], w[16], selector);
      w[27] = hc_byte_perm_S (w[16], w[15], selector);
      w[26] = hc_byte_perm_S (w[15], w[14], selector);
      w[25] = hc_byte_perm_S (w[14], w[13], selector);
      w[24] = hc_byte_perm_S (w[13], w[12], selector);
      w[23] = hc_byte_perm_S (w[12], w[11], selector);
      w[22] = hc_byte_perm_S (w[11], w[10], selector);
      w[21] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[20] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[19] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[18] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[17] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[16] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[15] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[14] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[13] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[12] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[11] = hc_byte_perm_S (w[ 0],     0, selector);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_byte_perm_S (w[51], w[50], selector);
      w[62] = hc_byte_perm_S (w[50], w[49], selector);
      w[61] = hc_byte_perm_S (w[49], w[48], selector);
      w[60] = hc_byte_perm_S (w[48], w[47], selector);
      w[59] = hc_byte_perm_S (w[47], w[46], selector);
      w[58] = hc_byte_perm_S (w[46], w[45], selector);
      w[57] = hc_byte_perm_S (w[45], w[44], selector);
      w[56] = hc_byte_perm_S (w[44], w[43], selector);
      w[55] = hc_byte_perm_S (w[43], w[42], selector);
      w[54] = hc_byte_perm_S (w[42], w[41], selector);
      w[53] = hc_byte_perm_S (w[41], w[40], selector);
      w[52] = hc_byte_perm_S (w[40], w[39], selector);
      w[51] = hc_byte_perm_S (w[39], w[38], selector);
      w[50] = hc_byte_perm_S (w[38], w[37], selector);
      w[49] = hc_byte_perm_S (w[37], w[36], selector);
      w[48] = hc_byte_perm_S (w[36], w[35], selector);
      w[47] = hc_byte_perm_S (w[35], w[34], selector);
      w[46] = hc_byte_perm_S (w[34], w[33], selector);
      w[45] = hc_byte_perm_S (w[33], w[32], selector);
      w[44] = hc_byte_perm_S (w[32], w[31], selector);
      w[43] = hc_byte_perm_S (w[31], w[30], selector);
      w[42] = hc_byte_perm_S (w[30], w[29], selector);
      w[41] = hc_byte_perm_S (w[29], w[28], selector);
      w[40] = hc_byte_perm_S (w[28], w[27], selector);
      w[39] = hc_byte_perm_S (w[27], w[26], selector);
      w[38] = hc_byte_perm_S (w[26], w[25], selector);
      w[37] = hc_byte_perm_S (w[25], w[24], selector);
      w[36] = hc_byte_perm_S (w[24], w[23], selector);
      w[35] = hc_byte_perm_S (w[23], w[22], selector);
      w[34] = hc_byte_perm_S (w[22], w[21], selector);
      w[33] = hc_byte_perm_S (w[21], w[20], selector);
      w[32] = hc_byte_perm_S (w[20], w[19], selector);
      w[31] = hc_byte_perm_S (w[19], w[18], selector);
      w[30] = hc_byte_perm_S (w[18], w[17], selector);
      w[29] = hc_byte_perm_S (w[17], w[16], selector);
      w[28] = hc_byte_perm_S (w[16], w[15], selector);
      w[27] = hc_byte_perm_S (w[15], w[14], selector);
      w[26] = hc_byte_perm_S (w[14], w[13], selector);
      w[25] = hc_byte_perm_S (w[13], w[12], selector);
      w[24] = hc_byte_perm_S (w[12], w[11], selector);
      w[23] = hc_byte_perm_S (w[11], w[10], selector);
      w[22] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[21] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[20] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[19] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[18] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[17] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[16] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[15] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[14] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[13] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[12] = hc_byte_perm_S (w[ 0],     0, selector);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_byte_perm_S (w[50], w[49], selector);
      w[62] = hc_byte_perm_S (w[49], w[48], selector);
      w[61] = hc_byte_perm_S (w[48], w[47], selector);
      w[60] = hc_byte_perm_S (w[47], w[46], selector);
      w[59] = hc_byte_perm_S (w[46], w[45], selector);
      w[58] = hc_byte_perm_S (w[45], w[44], selector);
      w[57] = hc_byte_perm_S (w[44], w[43], selector);
      w[56] = hc_byte_perm_S (w[43], w[42], selector);
      w[55] = hc_byte_perm_S (w[42], w[41], selector);
      w[54] = hc_byte_perm_S (w[41], w[40], selector);
      w[53] = hc_byte_perm_S (w[40], w[39], selector);
      w[52] = hc_byte_perm_S (w[39], w[38], selector);
      w[51] = hc_byte_perm_S (w[38], w[37], selector);
      w[50] = hc_byte_perm_S (w[37], w[36], selector);
      w[49] = hc_byte_perm_S (w[36], w[35], selector);
      w[48] = hc_byte_perm_S (w[35], w[34], selector);
      w[47] = hc_byte_perm_S (w[34], w[33], selector);
      w[46] = hc_byte_perm_S (w[33], w[32], selector);
      w[45] = hc_byte_perm_S (w[32], w[31], selector);
      w[44] = hc_byte_perm_S (w[31], w[30], selector);
      w[43] = hc_byte_perm_S (w[30], w[29], selector);
      w[42] = hc_byte_perm_S (w[29], w[28], selector);
      w[41] = hc_byte_perm_S (w[28], w[27], selector);
      w[40] = hc_byte_perm_S (w[27], w[26], selector);
      w[39] = hc_byte_perm_S (w[26], w[25], selector);
      w[38] = hc_byte_perm_S (w[25], w[24], selector);
      w[37] = hc_byte_perm_S (w[24], w[23], selector);
      w[36] = hc_byte_perm_S (w[23], w[22], selector);
      w[35] = hc_byte_perm_S (w[22], w[21], selector);
      w[34] = hc_byte_perm_S (w[21], w[20], selector);
      w[33] = hc_byte_perm_S (w[20], w[19], selector);
      w[32] = hc_byte_perm_S (w[19], w[18], selector);
      w[31] = hc_byte_perm_S (w[18], w[17], selector);
      w[30] = hc_byte_perm_S (w[17], w[16], selector);
      w[29] = hc_byte_perm_S (w[16], w[15], selector);
      w[28] = hc_byte_perm_S (w[15], w[14], selector);
      w[27] = hc_byte_perm_S (w[14], w[13], selector);
      w[26] = hc_byte_perm_S (w[13], w[12], selector);
      w[25] = hc_byte_perm_S (w[12], w[11], selector);
      w[24] = hc_byte_perm_S (w[11], w[10], selector);
      w[23] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[22] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[21] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[20] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[19] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[18] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[17] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[16] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[15] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[14] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[13] = hc_byte_perm_S (w[ 0],     0, selector);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_byte_perm_S (w[49], w[48], selector);
      w[62] = hc_byte_perm_S (w[48], w[47], selector);
      w[61] = hc_byte_perm_S (w[47], w[46], selector);
      w[60] = hc_byte_perm_S (w[46], w[45], selector);
      w[59] = hc_byte_perm_S (w[45], w[44], selector);
      w[58] = hc_byte_perm_S (w[44], w[43], selector);
      w[57] = hc_byte_perm_S (w[43], w[42], selector);
      w[56] = hc_byte_perm_S (w[42], w[41], selector);
      w[55] = hc_byte_perm_S (w[41], w[40], selector);
      w[54] = hc_byte_perm_S (w[40], w[39], selector);
      w[53] = hc_byte_perm_S (w[39], w[38], selector);
      w[52] = hc_byte_perm_S (w[38], w[37], selector);
      w[51] = hc_byte_perm_S (w[37], w[36], selector);
      w[50] = hc_byte_perm_S (w[36], w[35], selector);
      w[49] = hc_byte_perm_S (w[35], w[34], selector);
      w[48] = hc_byte_perm_S (w[34], w[33], selector);
      w[47] = hc_byte_perm_S (w[33], w[32], selector);
      w[46] = hc_byte_perm_S (w[32], w[31], selector);
      w[45] = hc_byte_perm_S (w[31], w[30], selector);
      w[44] = hc_byte_perm_S (w[30], w[29], selector);
      w[43] = hc_byte_perm_S (w[29], w[28], selector);
      w[42] = hc_byte_perm_S (w[28], w[27], selector);
      w[41] = hc_byte_perm_S (w[27], w[26], selector);
      w[40] = hc_byte_perm_S (w[26], w[25], selector);
      w[39] = hc_byte_perm_S (w[25], w[24], selector);
      w[38] = hc_byte_perm_S (w[24], w[23], selector);
      w[37] = hc_byte_perm_S (w[23], w[22], selector);
      w[36] = hc_byte_perm_S (w[22], w[21], selector);
      w[35] = hc_byte_perm_S (w[21], w[20], selector);
      w[34] = hc_byte_perm_S (w[20], w[19], selector);
      w[33] = hc_byte_perm_S (w[19], w[18], selector);
      w[32] = hc_byte_perm_S (w[18], w[17], selector);
      w[31] = hc_byte_perm_S (w[17], w[16], selector);
      w[30] = hc_byte_perm_S (w[16], w[15], selector);
      w[29] = hc_byte_perm_S (w[15], w[14], selector);
      w[28] = hc_byte_perm_S (w[14], w[13], selector);
      w[27] = hc_byte_perm_S (w[13], w[12], selector);
      w[26] = hc_byte_perm_S (w[12], w[11], selector);
      w[25] = hc_byte_perm_S (w[11], w[10], selector);
      w[24] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[23] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[22] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[21] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[20] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[19] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[18] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[17] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[16] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[15] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[14] = hc_byte_perm_S (w[ 0],     0, selector);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_byte_perm_S (w[48], w[47], selector);
      w[62] = hc_byte_perm_S (w[47], w[46], selector);
      w[61] = hc_byte_perm_S (w[46], w[45], selector);
      w[60] = hc_byte_perm_S (w[45], w[44], selector);
      w[59] = hc_byte_perm_S (w[44], w[43], selector);
      w[58] = hc_byte_perm_S (w[43], w[42], selector);
      w[57] = hc_byte_perm_S (w[42], w[41], selector);
      w[56] = hc_byte_perm_S (w[41], w[40], selector);
      w[55] = hc_byte_perm_S (w[40], w[39], selector);
      w[54] = hc_byte_perm_S (w[39], w[38], selector);
      w[53] = hc_byte_perm_S (w[38], w[37], selector);
      w[52] = hc_byte_perm_S (w[37], w[36], selector);
      w[51] = hc_byte_perm_S (w[36], w[35], selector);
      w[50] = hc_byte_perm_S (w[35], w[34], selector);
      w[49] = hc_byte_perm_S (w[34], w[33], selector);
      w[48] = hc_byte_perm_S (w[33], w[32], selector);
      w[47] = hc_byte_perm_S (w[32], w[31], selector);
      w[46] = hc_byte_perm_S (w[31], w[30], selector);
      w[45] = hc_byte_perm_S (w[30], w[29], selector);
      w[44] = hc_byte_perm_S (w[29], w[28], selector);
      w[43] = hc_byte_perm_S (w[28], w[27], selector);
      w[42] = hc_byte_perm_S (w[27], w[26], selector);
      w[41] = hc_byte_perm_S (w[26], w[25], selector);
      w[40] = hc_byte_perm_S (w[25], w[24], selector);
      w[39] = hc_byte_perm_S (w[24], w[23], selector);
      w[38] = hc_byte_perm_S (w[23], w[22], selector);
      w[37] = hc_byte_perm_S (w[22], w[21], selector);
      w[36] = hc_byte_perm_S (w[21], w[20], selector);
      w[35] = hc_byte_perm_S (w[20], w[19], selector);
      w[34] = hc_byte_perm_S (w[19], w[18], selector);
      w[33] = hc_byte_perm_S (w[18], w[17], selector);
      w[32] = hc_byte_perm_S (w[17], w[16], selector);
      w[31] = hc_byte_perm_S (w[16], w[15], selector);
      w[30] = hc_byte_perm_S (w[15], w[14], selector);
      w[29] = hc_byte_perm_S (w[14], w[13], selector);
      w[28] = hc_byte_perm_S (w[13], w[12], selector);
      w[27] = hc_byte_perm_S (w[12], w[11], selector);
      w[26] = hc_byte_perm_S (w[11], w[10], selector);
      w[25] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[24] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[23] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[22] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[21] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[20] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[19] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[18] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[17] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[16] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[15] = hc_byte_perm_S (w[ 0],     0, selector);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_byte_perm_S (w[47], w[46], selector);
      w[62] = hc_byte_perm_S (w[46], w[45], selector);
      w[61] = hc_byte_perm_S (w[45], w[44], selector);
      w[60] = hc_byte_perm_S (w[44], w[43], selector);
      w[59] = hc_byte_perm_S (w[43], w[42], selector);
      w[58] = hc_byte_perm_S (w[42], w[41], selector);
      w[57] = hc_byte_perm_S (w[41], w[40], selector);
      w[56] = hc_byte_perm_S (w[40], w[39], selector);
      w[55] = hc_byte_perm_S (w[39], w[38], selector);
      w[54] = hc_byte_perm_S (w[38], w[37], selector);
      w[53] = hc_byte_perm_S (w[37], w[36], selector);
      w[52] = hc_byte_perm_S (w[36], w[35], selector);
      w[51] = hc_byte_perm_S (w[35], w[34], selector);
      w[50] = hc_byte_perm_S (w[34], w[33], selector);
      w[49] = hc_byte_perm_S (w[33], w[32], selector);
      w[48] = hc_byte_perm_S (w[32], w[31], selector);
      w[47] = hc_byte_perm_S (w[31], w[30], selector);
      w[46] = hc_byte_perm_S (w[30], w[29], selector);
      w[45] = hc_byte_perm_S (w[29], w[28], selector);
      w[44] = hc_byte_perm_S (w[28], w[27], selector);
      w[43] = hc_byte_perm_S (w[27], w[26], selector);
      w[42] = hc_byte_perm_S (w[26], w[25], selector);
      w[41] = hc_byte_perm_S (w[25], w[24], selector);
      w[40] = hc_byte_perm_S (w[24], w[23], selector);
      w[39] = hc_byte_perm_S (w[23], w[22], selector);
      w[38] = hc_byte_perm_S (w[22], w[21], selector);
      w[37] = hc_byte_perm_S (w[21], w[20], selector);
      w[36] = hc_byte_perm_S (w[20], w[19], selector);
      w[35] = hc_byte_perm_S (w[19], w[18], selector);
      w[34] = hc_byte_perm_S (w[18], w[17], selector);
      w[33] = hc_byte_perm_S (w[17], w[16], selector);
      w[32] = hc_byte_perm_S (w[16], w[15], selector);
      w[31] = hc_byte_perm_S (w[15], w[14], selector);
      w[30] = hc_byte_perm_S (w[14], w[13], selector);
      w[29] = hc_byte_perm_S (w[13], w[12], selector);
      w[28] = hc_byte_perm_S (w[12], w[11], selector);
      w[27] = hc_byte_perm_S (w[11], w[10], selector);
      w[26] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[25] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[24] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[23] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[22] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[21] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[20] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[19] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[18] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[17] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[16] = hc_byte_perm_S (w[ 0],     0, selector);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_byte_perm_S (w[46], w[45], selector);
      w[62] = hc_byte_perm_S (w[45], w[44], selector);
      w[61] = hc_byte_perm_S (w[44], w[43], selector);
      w[60] = hc_byte_perm_S (w[43], w[42], selector);
      w[59] = hc_byte_perm_S (w[42], w[41], selector);
      w[58] = hc_byte_perm_S (w[41], w[40], selector);
      w[57] = hc_byte_perm_S (w[40], w[39], selector);
      w[56] = hc_byte_perm_S (w[39], w[38], selector);
      w[55] = hc_byte_perm_S (w[38], w[37], selector);
      w[54] = hc_byte_perm_S (w[37], w[36], selector);
      w[53] = hc_byte_perm_S (w[36], w[35], selector);
      w[52] = hc_byte_perm_S (w[35], w[34], selector);
      w[51] = hc_byte_perm_S (w[34], w[33], selector);
      w[50] = hc_byte_perm_S (w[33], w[32], selector);
      w[49] = hc_byte_perm_S (w[32], w[31], selector);
      w[48] = hc_byte_perm_S (w[31], w[30], selector);
      w[47] = hc_byte_perm_S (w[30], w[29], selector);
      w[46] = hc_byte_perm_S (w[29], w[28], selector);
      w[45] = hc_byte_perm_S (w[28], w[27], selector);
      w[44] = hc_byte_perm_S (w[27], w[26], selector);
      w[43] = hc_byte_perm_S (w[26], w[25], selector);
      w[42] = hc_byte_perm_S (w[25], w[24], selector);
      w[41] = hc_byte_perm_S (w[24], w[23], selector);
      w[40] = hc_byte_perm_S (w[23], w[22], selector);
      w[39] = hc_byte_perm_S (w[22], w[21], selector);
      w[38] = hc_byte_perm_S (w[21], w[20], selector);
      w[37] = hc_byte_perm_S (w[20], w[19], selector);
      w[36] = hc_byte_perm_S (w[19], w[18], selector);
      w[35] = hc_byte_perm_S (w[18], w[17], selector);
      w[34] = hc_byte_perm_S (w[17], w[16], selector);
      w[33] = hc_byte_perm_S (w[16], w[15], selector);
      w[32] = hc_byte_perm_S (w[15], w[14], selector);
      w[31] = hc_byte_perm_S (w[14], w[13], selector);
      w[30] = hc_byte_perm_S (w[13], w[12], selector);
      w[29] = hc_byte_perm_S (w[12], w[11], selector);
      w[28] = hc_byte_perm_S (w[11], w[10], selector);
      w[27] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[26] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[25] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[24] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[23] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[22] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[21] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[20] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[19] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[18] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[17] = hc_byte_perm_S (w[ 0],     0, selector);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_byte_perm_S (w[45], w[44], selector);
      w[62] = hc_byte_perm_S (w[44], w[43], selector);
      w[61] = hc_byte_perm_S (w[43], w[42], selector);
      w[60] = hc_byte_perm_S (w[42], w[41], selector);
      w[59] = hc_byte_perm_S (w[41], w[40], selector);
      w[58] = hc_byte_perm_S (w[40], w[39], selector);
      w[57] = hc_byte_perm_S (w[39], w[38], selector);
      w[56] = hc_byte_perm_S (w[38], w[37], selector);
      w[55] = hc_byte_perm_S (w[37], w[36], selector);
      w[54] = hc_byte_perm_S (w[36], w[35], selector);
      w[53] = hc_byte_perm_S (w[35], w[34], selector);
      w[52] = hc_byte_perm_S (w[34], w[33], selector);
      w[51] = hc_byte_perm_S (w[33], w[32], selector);
      w[50] = hc_byte_perm_S (w[32], w[31], selector);
      w[49] = hc_byte_perm_S (w[31], w[30], selector);
      w[48] = hc_byte_perm_S (w[30], w[29], selector);
      w[47] = hc_byte_perm_S (w[29], w[28], selector);
      w[46] = hc_byte_perm_S (w[28], w[27], selector);
      w[45] = hc_byte_perm_S (w[27], w[26], selector);
      w[44] = hc_byte_perm_S (w[26], w[25], selector);
      w[43] = hc_byte_perm_S (w[25], w[24], selector);
      w[42] = hc_byte_perm_S (w[24], w[23], selector);
      w[41] = hc_byte_perm_S (w[23], w[22], selector);
      w[40] = hc_byte_perm_S (w[22], w[21], selector);
      w[39] = hc_byte_perm_S (w[21], w[20], selector);
      w[38] = hc_byte_perm_S (w[20], w[19], selector);
      w[37] = hc_byte_perm_S (w[19], w[18], selector);
      w[36] = hc_byte_perm_S (w[18], w[17], selector);
      w[35] = hc_byte_perm_S (w[17], w[16], selector);
      w[34] = hc_byte_perm_S (w[16], w[15], selector);
      w[33] = hc_byte_perm_S (w[15], w[14], selector);
      w[32] = hc_byte_perm_S (w[14], w[13], selector);
      w[31] = hc_byte_perm_S (w[13], w[12], selector);
      w[30] = hc_byte_perm_S (w[12], w[11], selector);
      w[29] = hc_byte_perm_S (w[11], w[10], selector);
      w[28] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[27] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[26] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[25] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[24] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[23] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[22] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[21] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[20] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[19] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[18] = hc_byte_perm_S (w[ 0],     0, selector);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_byte_perm_S (w[44], w[43], selector);
      w[62] = hc_byte_perm_S (w[43], w[42], selector);
      w[61] = hc_byte_perm_S (w[42], w[41], selector);
      w[60] = hc_byte_perm_S (w[41], w[40], selector);
      w[59] = hc_byte_perm_S (w[40], w[39], selector);
      w[58] = hc_byte_perm_S (w[39], w[38], selector);
      w[57] = hc_byte_perm_S (w[38], w[37], selector);
      w[56] = hc_byte_perm_S (w[37], w[36], selector);
      w[55] = hc_byte_perm_S (w[36], w[35], selector);
      w[54] = hc_byte_perm_S (w[35], w[34], selector);
      w[53] = hc_byte_perm_S (w[34], w[33], selector);
      w[52] = hc_byte_perm_S (w[33], w[32], selector);
      w[51] = hc_byte_perm_S (w[32], w[31], selector);
      w[50] = hc_byte_perm_S (w[31], w[30], selector);
      w[49] = hc_byte_perm_S (w[30], w[29], selector);
      w[48] = hc_byte_perm_S (w[29], w[28], selector);
      w[47] = hc_byte_perm_S (w[28], w[27], selector);
      w[46] = hc_byte_perm_S (w[27], w[26], selector);
      w[45] = hc_byte_perm_S (w[26], w[25], selector);
      w[44] = hc_byte_perm_S (w[25], w[24], selector);
      w[43] = hc_byte_perm_S (w[24], w[23], selector);
      w[42] = hc_byte_perm_S (w[23], w[22], selector);
      w[41] = hc_byte_perm_S (w[22], w[21], selector);
      w[40] = hc_byte_perm_S (w[21], w[20], selector);
      w[39] = hc_byte_perm_S (w[20], w[19], selector);
      w[38] = hc_byte_perm_S (w[19], w[18], selector);
      w[37] = hc_byte_perm_S (w[18], w[17], selector);
      w[36] = hc_byte_perm_S (w[17], w[16], selector);
      w[35] = hc_byte_perm_S (w[16], w[15], selector);
      w[34] = hc_byte_perm_S (w[15], w[14], selector);
      w[33] = hc_byte_perm_S (w[14], w[13], selector);
      w[32] = hc_byte_perm_S (w[13], w[12], selector);
      w[31] = hc_byte_perm_S (w[12], w[11], selector);
      w[30] = hc_byte_perm_S (w[11], w[10], selector);
      w[29] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[28] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[27] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[26] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[25] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[24] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[23] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[22] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[21] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[20] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[19] = hc_byte_perm_S (w[ 0],     0, selector);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_byte_perm_S (w[43], w[42], selector);
      w[62] = hc_byte_perm_S (w[42], w[41], selector);
      w[61] = hc_byte_perm_S (w[41], w[40], selector);
      w[60] = hc_byte_perm_S (w[40], w[39], selector);
      w[59] = hc_byte_perm_S (w[39], w[38], selector);
      w[58] = hc_byte_perm_S (w[38], w[37], selector);
      w[57] = hc_byte_perm_S (w[37], w[36], selector);
      w[56] = hc_byte_perm_S (w[36], w[35], selector);
      w[55] = hc_byte_perm_S (w[35], w[34], selector);
      w[54] = hc_byte_perm_S (w[34], w[33], selector);
      w[53] = hc_byte_perm_S (w[33], w[32], selector);
      w[52] = hc_byte_perm_S (w[32], w[31], selector);
      w[51] = hc_byte_perm_S (w[31], w[30], selector);
      w[50] = hc_byte_perm_S (w[30], w[29], selector);
      w[49] = hc_byte_perm_S (w[29], w[28], selector);
      w[48] = hc_byte_perm_S (w[28], w[27], selector);
      w[47] = hc_byte_perm_S (w[27], w[26], selector);
      w[46] = hc_byte_perm_S (w[26], w[25], selector);
      w[45] = hc_byte_perm_S (w[25], w[24], selector);
      w[44] = hc_byte_perm_S (w[24], w[23], selector);
      w[43] = hc_byte_perm_S (w[23], w[22], selector);
      w[42] = hc_byte_perm_S (w[22], w[21], selector);
      w[41] = hc_byte_perm_S (w[21], w[20], selector);
      w[40] = hc_byte_perm_S (w[20], w[19], selector);
      w[39] = hc_byte_perm_S (w[19], w[18], selector);
      w[38] = hc_byte_perm_S (w[18], w[17], selector);
      w[37] = hc_byte_perm_S (w[17], w[16], selector);
      w[36] = hc_byte_perm_S (w[16], w[15], selector);
      w[35] = hc_byte_perm_S (w[15], w[14], selector);
      w[34] = hc_byte_perm_S (w[14], w[13], selector);
      w[33] = hc_byte_perm_S (w[13], w[12], selector);
      w[32] = hc_byte_perm_S (w[12], w[11], selector);
      w[31] = hc_byte_perm_S (w[11], w[10], selector);
      w[30] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[29] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[28] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[27] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[26] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[25] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[24] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[23] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[22] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[21] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[20] = hc_byte_perm_S (w[ 0],     0, selector);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_byte_perm_S (w[42], w[41], selector);
      w[62] = hc_byte_perm_S (w[41], w[40], selector);
      w[61] = hc_byte_perm_S (w[40], w[39], selector);
      w[60] = hc_byte_perm_S (w[39], w[38], selector);
      w[59] = hc_byte_perm_S (w[38], w[37], selector);
      w[58] = hc_byte_perm_S (w[37], w[36], selector);
      w[57] = hc_byte_perm_S (w[36], w[35], selector);
      w[56] = hc_byte_perm_S (w[35], w[34], selector);
      w[55] = hc_byte_perm_S (w[34], w[33], selector);
      w[54] = hc_byte_perm_S (w[33], w[32], selector);
      w[53] = hc_byte_perm_S (w[32], w[31], selector);
      w[52] = hc_byte_perm_S (w[31], w[30], selector);
      w[51] = hc_byte_perm_S (w[30], w[29], selector);
      w[50] = hc_byte_perm_S (w[29], w[28], selector);
      w[49] = hc_byte_perm_S (w[28], w[27], selector);
      w[48] = hc_byte_perm_S (w[27], w[26], selector);
      w[47] = hc_byte_perm_S (w[26], w[25], selector);
      w[46] = hc_byte_perm_S (w[25], w[24], selector);
      w[45] = hc_byte_perm_S (w[24], w[23], selector);
      w[44] = hc_byte_perm_S (w[23], w[22], selector);
      w[43] = hc_byte_perm_S (w[22], w[21], selector);
      w[42] = hc_byte_perm_S (w[21], w[20], selector);
      w[41] = hc_byte_perm_S (w[20], w[19], selector);
      w[40] = hc_byte_perm_S (w[19], w[18], selector);
      w[39] = hc_byte_perm_S (w[18], w[17], selector);
      w[38] = hc_byte_perm_S (w[17], w[16], selector);
      w[37] = hc_byte_perm_S (w[16], w[15], selector);
      w[36] = hc_byte_perm_S (w[15], w[14], selector);
      w[35] = hc_byte_perm_S (w[14], w[13], selector);
      w[34] = hc_byte_perm_S (w[13], w[12], selector);
      w[33] = hc_byte_perm_S (w[12], w[11], selector);
      w[32] = hc_byte_perm_S (w[11], w[10], selector);
      w[31] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[30] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[29] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[28] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[27] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[26] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[25] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[24] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[23] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[22] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[21] = hc_byte_perm_S (w[ 0],     0, selector);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_byte_perm_S (w[41], w[40], selector);
      w[62] = hc_byte_perm_S (w[40], w[39], selector);
      w[61] = hc_byte_perm_S (w[39], w[38], selector);
      w[60] = hc_byte_perm_S (w[38], w[37], selector);
      w[59] = hc_byte_perm_S (w[37], w[36], selector);
      w[58] = hc_byte_perm_S (w[36], w[35], selector);
      w[57] = hc_byte_perm_S (w[35], w[34], selector);
      w[56] = hc_byte_perm_S (w[34], w[33], selector);
      w[55] = hc_byte_perm_S (w[33], w[32], selector);
      w[54] = hc_byte_perm_S (w[32], w[31], selector);
      w[53] = hc_byte_perm_S (w[31], w[30], selector);
      w[52] = hc_byte_perm_S (w[30], w[29], selector);
      w[51] = hc_byte_perm_S (w[29], w[28], selector);
      w[50] = hc_byte_perm_S (w[28], w[27], selector);
      w[49] = hc_byte_perm_S (w[27], w[26], selector);
      w[48] = hc_byte_perm_S (w[26], w[25], selector);
      w[47] = hc_byte_perm_S (w[25], w[24], selector);
      w[46] = hc_byte_perm_S (w[24], w[23], selector);
      w[45] = hc_byte_perm_S (w[23], w[22], selector);
      w[44] = hc_byte_perm_S (w[22], w[21], selector);
      w[43] = hc_byte_perm_S (w[21], w[20], selector);
      w[42] = hc_byte_perm_S (w[20], w[19], selector);
      w[41] = hc_byte_perm_S (w[19], w[18], selector);
      w[40] = hc_byte_perm_S (w[18], w[17], selector);
      w[39] = hc_byte_perm_S (w[17], w[16], selector);
      w[38] = hc_byte_perm_S (w[16], w[15], selector);
      w[37] = hc_byte_perm_S (w[15], w[14], selector);
      w[36] = hc_byte_perm_S (w[14], w[13], selector);
      w[35] = hc_byte_perm_S (w[13], w[12], selector);
      w[34] = hc_byte_perm_S (w[12], w[11], selector);
      w[33] = hc_byte_perm_S (w[11], w[10], selector);
      w[32] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[31] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[30] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[29] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[28] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[27] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[26] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[25] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[24] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[23] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[22] = hc_byte_perm_S (w[ 0],     0, selector);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_byte_perm_S (w[40], w[39], selector);
      w[62] = hc_byte_perm_S (w[39], w[38], selector);
      w[61] = hc_byte_perm_S (w[38], w[37], selector);
      w[60] = hc_byte_perm_S (w[37], w[36], selector);
      w[59] = hc_byte_perm_S (w[36], w[35], selector);
      w[58] = hc_byte_perm_S (w[35], w[34], selector);
      w[57] = hc_byte_perm_S (w[34], w[33], selector);
      w[56] = hc_byte_perm_S (w[33], w[32], selector);
      w[55] = hc_byte_perm_S (w[32], w[31], selector);
      w[54] = hc_byte_perm_S (w[31], w[30], selector);
      w[53] = hc_byte_perm_S (w[30], w[29], selector);
      w[52] = hc_byte_perm_S (w[29], w[28], selector);
      w[51] = hc_byte_perm_S (w[28], w[27], selector);
      w[50] = hc_byte_perm_S (w[27], w[26], selector);
      w[49] = hc_byte_perm_S (w[26], w[25], selector);
      w[48] = hc_byte_perm_S (w[25], w[24], selector);
      w[47] = hc_byte_perm_S (w[24], w[23], selector);
      w[46] = hc_byte_perm_S (w[23], w[22], selector);
      w[45] = hc_byte_perm_S (w[22], w[21], selector);
      w[44] = hc_byte_perm_S (w[21], w[20], selector);
      w[43] = hc_byte_perm_S (w[20], w[19], selector);
      w[42] = hc_byte_perm_S (w[19], w[18], selector);
      w[41] = hc_byte_perm_S (w[18], w[17], selector);
      w[40] = hc_byte_perm_S (w[17], w[16], selector);
      w[39] = hc_byte_perm_S (w[16], w[15], selector);
      w[38] = hc_byte_perm_S (w[15], w[14], selector);
      w[37] = hc_byte_perm_S (w[14], w[13], selector);
      w[36] = hc_byte_perm_S (w[13], w[12], selector);
      w[35] = hc_byte_perm_S (w[12], w[11], selector);
      w[34] = hc_byte_perm_S (w[11], w[10], selector);
      w[33] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[32] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[31] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[30] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[29] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[28] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[27] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[26] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[25] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[24] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[23] = hc_byte_perm_S (w[ 0],     0, selector);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_byte_perm_S (w[39], w[38], selector);
      w[62] = hc_byte_perm_S (w[38], w[37], selector);
      w[61] = hc_byte_perm_S (w[37], w[36], selector);
      w[60] = hc_byte_perm_S (w[36], w[35], selector);
      w[59] = hc_byte_perm_S (w[35], w[34], selector);
      w[58] = hc_byte_perm_S (w[34], w[33], selector);
      w[57] = hc_byte_perm_S (w[33], w[32], selector);
      w[56] = hc_byte_perm_S (w[32], w[31], selector);
      w[55] = hc_byte_perm_S (w[31], w[30], selector);
      w[54] = hc_byte_perm_S (w[30], w[29], selector);
      w[53] = hc_byte_perm_S (w[29], w[28], selector);
      w[52] = hc_byte_perm_S (w[28], w[27], selector);
      w[51] = hc_byte_perm_S (w[27], w[26], selector);
      w[50] = hc_byte_perm_S (w[26], w[25], selector);
      w[49] = hc_byte_perm_S (w[25], w[24], selector);
      w[48] = hc_byte_perm_S (w[24], w[23], selector);
      w[47] = hc_byte_perm_S (w[23], w[22], selector);
      w[46] = hc_byte_perm_S (w[22], w[21], selector);
      w[45] = hc_byte_perm_S (w[21], w[20], selector);
      w[44] = hc_byte_perm_S (w[20], w[19], selector);
      w[43] = hc_byte_perm_S (w[19], w[18], selector);
      w[42] = hc_byte_perm_S (w[18], w[17], selector);
      w[41] = hc_byte_perm_S (w[17], w[16], selector);
      w[40] = hc_byte_perm_S (w[16], w[15], selector);
      w[39] = hc_byte_perm_S (w[15], w[14], selector);
      w[38] = hc_byte_perm_S (w[14], w[13], selector);
      w[37] = hc_byte_perm_S (w[13], w[12], selector);
      w[36] = hc_byte_perm_S (w[12], w[11], selector);
      w[35] = hc_byte_perm_S (w[11], w[10], selector);
      w[34] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[33] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[32] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[31] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[30] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[29] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[28] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[27] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[26] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[25] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[24] = hc_byte_perm_S (w[ 0],     0, selector);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_byte_perm_S (w[38], w[37], selector);
      w[62] = hc_byte_perm_S (w[37], w[36], selector);
      w[61] = hc_byte_perm_S (w[36], w[35], selector);
      w[60] = hc_byte_perm_S (w[35], w[34], selector);
      w[59] = hc_byte_perm_S (w[34], w[33], selector);
      w[58] = hc_byte_perm_S (w[33], w[32], selector);
      w[57] = hc_byte_perm_S (w[32], w[31], selector);
      w[56] = hc_byte_perm_S (w[31], w[30], selector);
      w[55] = hc_byte_perm_S (w[30], w[29], selector);
      w[54] = hc_byte_perm_S (w[29], w[28], selector);
      w[53] = hc_byte_perm_S (w[28], w[27], selector);
      w[52] = hc_byte_perm_S (w[27], w[26], selector);
      w[51] = hc_byte_perm_S (w[26], w[25], selector);
      w[50] = hc_byte_perm_S (w[25], w[24], selector);
      w[49] = hc_byte_perm_S (w[24], w[23], selector);
      w[48] = hc_byte_perm_S (w[23], w[22], selector);
      w[47] = hc_byte_perm_S (w[22], w[21], selector);
      w[46] = hc_byte_perm_S (w[21], w[20], selector);
      w[45] = hc_byte_perm_S (w[20], w[19], selector);
      w[44] = hc_byte_perm_S (w[19], w[18], selector);
      w[43] = hc_byte_perm_S (w[18], w[17], selector);
      w[42] = hc_byte_perm_S (w[17], w[16], selector);
      w[41] = hc_byte_perm_S (w[16], w[15], selector);
      w[40] = hc_byte_perm_S (w[15], w[14], selector);
      w[39] = hc_byte_perm_S (w[14], w[13], selector);
      w[38] = hc_byte_perm_S (w[13], w[12], selector);
      w[37] = hc_byte_perm_S (w[12], w[11], selector);
      w[36] = hc_byte_perm_S (w[11], w[10], selector);
      w[35] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[34] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[33] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[32] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[31] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[30] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[29] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[28] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[27] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[26] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[25] = hc_byte_perm_S (w[ 0],     0, selector);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_byte_perm_S (w[37], w[36], selector);
      w[62] = hc_byte_perm_S (w[36], w[35], selector);
      w[61] = hc_byte_perm_S (w[35], w[34], selector);
      w[60] = hc_byte_perm_S (w[34], w[33], selector);
      w[59] = hc_byte_perm_S (w[33], w[32], selector);
      w[58] = hc_byte_perm_S (w[32], w[31], selector);
      w[57] = hc_byte_perm_S (w[31], w[30], selector);
      w[56] = hc_byte_perm_S (w[30], w[29], selector);
      w[55] = hc_byte_perm_S (w[29], w[28], selector);
      w[54] = hc_byte_perm_S (w[28], w[27], selector);
      w[53] = hc_byte_perm_S (w[27], w[26], selector);
      w[52] = hc_byte_perm_S (w[26], w[25], selector);
      w[51] = hc_byte_perm_S (w[25], w[24], selector);
      w[50] = hc_byte_perm_S (w[24], w[23], selector);
      w[49] = hc_byte_perm_S (w[23], w[22], selector);
      w[48] = hc_byte_perm_S (w[22], w[21], selector);
      w[47] = hc_byte_perm_S (w[21], w[20], selector);
      w[46] = hc_byte_perm_S (w[20], w[19], selector);
      w[45] = hc_byte_perm_S (w[19], w[18], selector);
      w[44] = hc_byte_perm_S (w[18], w[17], selector);
      w[43] = hc_byte_perm_S (w[17], w[16], selector);
      w[42] = hc_byte_perm_S (w[16], w[15], selector);
      w[41] = hc_byte_perm_S (w[15], w[14], selector);
      w[40] = hc_byte_perm_S (w[14], w[13], selector);
      w[39] = hc_byte_perm_S (w[13], w[12], selector);
      w[38] = hc_byte_perm_S (w[12], w[11], selector);
      w[37] = hc_byte_perm_S (w[11], w[10], selector);
      w[36] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[35] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[34] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[33] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[32] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[31] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[30] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[29] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[28] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[27] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[26] = hc_byte_perm_S (w[ 0],     0, selector);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_byte_perm_S (w[36], w[35], selector);
      w[62] = hc_byte_perm_S (w[35], w[34], selector);
      w[61] = hc_byte_perm_S (w[34], w[33], selector);
      w[60] = hc_byte_perm_S (w[33], w[32], selector);
      w[59] = hc_byte_perm_S (w[32], w[31], selector);
      w[58] = hc_byte_perm_S (w[31], w[30], selector);
      w[57] = hc_byte_perm_S (w[30], w[29], selector);
      w[56] = hc_byte_perm_S (w[29], w[28], selector);
      w[55] = hc_byte_perm_S (w[28], w[27], selector);
      w[54] = hc_byte_perm_S (w[27], w[26], selector);
      w[53] = hc_byte_perm_S (w[26], w[25], selector);
      w[52] = hc_byte_perm_S (w[25], w[24], selector);
      w[51] = hc_byte_perm_S (w[24], w[23], selector);
      w[50] = hc_byte_perm_S (w[23], w[22], selector);
      w[49] = hc_byte_perm_S (w[22], w[21], selector);
      w[48] = hc_byte_perm_S (w[21], w[20], selector);
      w[47] = hc_byte_perm_S (w[20], w[19], selector);
      w[46] = hc_byte_perm_S (w[19], w[18], selector);
      w[45] = hc_byte_perm_S (w[18], w[17], selector);
      w[44] = hc_byte_perm_S (w[17], w[16], selector);
      w[43] = hc_byte_perm_S (w[16], w[15], selector);
      w[42] = hc_byte_perm_S (w[15], w[14], selector);
      w[41] = hc_byte_perm_S (w[14], w[13], selector);
      w[40] = hc_byte_perm_S (w[13], w[12], selector);
      w[39] = hc_byte_perm_S (w[12], w[11], selector);
      w[38] = hc_byte_perm_S (w[11], w[10], selector);
      w[37] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[36] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[35] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[34] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[33] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[32] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[31] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[30] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[29] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[28] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[27] = hc_byte_perm_S (w[ 0],     0, selector);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_byte_perm_S (w[35], w[34], selector);
      w[62] = hc_byte_perm_S (w[34], w[33], selector);
      w[61] = hc_byte_perm_S (w[33], w[32], selector);
      w[60] = hc_byte_perm_S (w[32], w[31], selector);
      w[59] = hc_byte_perm_S (w[31], w[30], selector);
      w[58] = hc_byte_perm_S (w[30], w[29], selector);
      w[57] = hc_byte_perm_S (w[29], w[28], selector);
      w[56] = hc_byte_perm_S (w[28], w[27], selector);
      w[55] = hc_byte_perm_S (w[27], w[26], selector);
      w[54] = hc_byte_perm_S (w[26], w[25], selector);
      w[53] = hc_byte_perm_S (w[25], w[24], selector);
      w[52] = hc_byte_perm_S (w[24], w[23], selector);
      w[51] = hc_byte_perm_S (w[23], w[22], selector);
      w[50] = hc_byte_perm_S (w[22], w[21], selector);
      w[49] = hc_byte_perm_S (w[21], w[20], selector);
      w[48] = hc_byte_perm_S (w[20], w[19], selector);
      w[47] = hc_byte_perm_S (w[19], w[18], selector);
      w[46] = hc_byte_perm_S (w[18], w[17], selector);
      w[45] = hc_byte_perm_S (w[17], w[16], selector);
      w[44] = hc_byte_perm_S (w[16], w[15], selector);
      w[43] = hc_byte_perm_S (w[15], w[14], selector);
      w[42] = hc_byte_perm_S (w[14], w[13], selector);
      w[41] = hc_byte_perm_S (w[13], w[12], selector);
      w[40] = hc_byte_perm_S (w[12], w[11], selector);
      w[39] = hc_byte_perm_S (w[11], w[10], selector);
      w[38] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[37] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[36] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[35] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[34] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[33] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[32] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[31] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[30] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[29] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[28] = hc_byte_perm_S (w[ 0],     0, selector);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_byte_perm_S (w[34], w[33], selector);
      w[62] = hc_byte_perm_S (w[33], w[32], selector);
      w[61] = hc_byte_perm_S (w[32], w[31], selector);
      w[60] = hc_byte_perm_S (w[31], w[30], selector);
      w[59] = hc_byte_perm_S (w[30], w[29], selector);
      w[58] = hc_byte_perm_S (w[29], w[28], selector);
      w[57] = hc_byte_perm_S (w[28], w[27], selector);
      w[56] = hc_byte_perm_S (w[27], w[26], selector);
      w[55] = hc_byte_perm_S (w[26], w[25], selector);
      w[54] = hc_byte_perm_S (w[25], w[24], selector);
      w[53] = hc_byte_perm_S (w[24], w[23], selector);
      w[52] = hc_byte_perm_S (w[23], w[22], selector);
      w[51] = hc_byte_perm_S (w[22], w[21], selector);
      w[50] = hc_byte_perm_S (w[21], w[20], selector);
      w[49] = hc_byte_perm_S (w[20], w[19], selector);
      w[48] = hc_byte_perm_S (w[19], w[18], selector);
      w[47] = hc_byte_perm_S (w[18], w[17], selector);
      w[46] = hc_byte_perm_S (w[17], w[16], selector);
      w[45] = hc_byte_perm_S (w[16], w[15], selector);
      w[44] = hc_byte_perm_S (w[15], w[14], selector);
      w[43] = hc_byte_perm_S (w[14], w[13], selector);
      w[42] = hc_byte_perm_S (w[13], w[12], selector);
      w[41] = hc_byte_perm_S (w[12], w[11], selector);
      w[40] = hc_byte_perm_S (w[11], w[10], selector);
      w[39] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[38] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[37] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[36] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[35] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[34] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[33] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[32] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[31] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[30] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[29] = hc_byte_perm_S (w[ 0],     0, selector);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_byte_perm_S (w[33], w[32], selector);
      w[62] = hc_byte_perm_S (w[32], w[31], selector);
      w[61] = hc_byte_perm_S (w[31], w[30], selector);
      w[60] = hc_byte_perm_S (w[30], w[29], selector);
      w[59] = hc_byte_perm_S (w[29], w[28], selector);
      w[58] = hc_byte_perm_S (w[28], w[27], selector);
      w[57] = hc_byte_perm_S (w[27], w[26], selector);
      w[56] = hc_byte_perm_S (w[26], w[25], selector);
      w[55] = hc_byte_perm_S (w[25], w[24], selector);
      w[54] = hc_byte_perm_S (w[24], w[23], selector);
      w[53] = hc_byte_perm_S (w[23], w[22], selector);
      w[52] = hc_byte_perm_S (w[22], w[21], selector);
      w[51] = hc_byte_perm_S (w[21], w[20], selector);
      w[50] = hc_byte_perm_S (w[20], w[19], selector);
      w[49] = hc_byte_perm_S (w[19], w[18], selector);
      w[48] = hc_byte_perm_S (w[18], w[17], selector);
      w[47] = hc_byte_perm_S (w[17], w[16], selector);
      w[46] = hc_byte_perm_S (w[16], w[15], selector);
      w[45] = hc_byte_perm_S (w[15], w[14], selector);
      w[44] = hc_byte_perm_S (w[14], w[13], selector);
      w[43] = hc_byte_perm_S (w[13], w[12], selector);
      w[42] = hc_byte_perm_S (w[12], w[11], selector);
      w[41] = hc_byte_perm_S (w[11], w[10], selector);
      w[40] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[39] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[38] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[37] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[36] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[35] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[34] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[33] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[32] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[31] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[30] = hc_byte_perm_S (w[ 0],     0, selector);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_byte_perm_S (w[32], w[31], selector);
      w[62] = hc_byte_perm_S (w[31], w[30], selector);
      w[61] = hc_byte_perm_S (w[30], w[29], selector);
      w[60] = hc_byte_perm_S (w[29], w[28], selector);
      w[59] = hc_byte_perm_S (w[28], w[27], selector);
      w[58] = hc_byte_perm_S (w[27], w[26], selector);
      w[57] = hc_byte_perm_S (w[26], w[25], selector);
      w[56] = hc_byte_perm_S (w[25], w[24], selector);
      w[55] = hc_byte_perm_S (w[24], w[23], selector);
      w[54] = hc_byte_perm_S (w[23], w[22], selector);
      w[53] = hc_byte_perm_S (w[22], w[21], selector);
      w[52] = hc_byte_perm_S (w[21], w[20], selector);
      w[51] = hc_byte_perm_S (w[20], w[19], selector);
      w[50] = hc_byte_perm_S (w[19], w[18], selector);
      w[49] = hc_byte_perm_S (w[18], w[17], selector);
      w[48] = hc_byte_perm_S (w[17], w[16], selector);
      w[47] = hc_byte_perm_S (w[16], w[15], selector);
      w[46] = hc_byte_perm_S (w[15], w[14], selector);
      w[45] = hc_byte_perm_S (w[14], w[13], selector);
      w[44] = hc_byte_perm_S (w[13], w[12], selector);
      w[43] = hc_byte_perm_S (w[12], w[11], selector);
      w[42] = hc_byte_perm_S (w[11], w[10], selector);
      w[41] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[40] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[39] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[38] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[37] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[36] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[35] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[34] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[33] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[32] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[31] = hc_byte_perm_S (w[ 0],     0, selector);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_byte_perm_S (w[31], w[30], selector);
      w[62] = hc_byte_perm_S (w[30], w[29], selector);
      w[61] = hc_byte_perm_S (w[29], w[28], selector);
      w[60] = hc_byte_perm_S (w[28], w[27], selector);
      w[59] = hc_byte_perm_S (w[27], w[26], selector);
      w[58] = hc_byte_perm_S (w[26], w[25], selector);
      w[57] = hc_byte_perm_S (w[25], w[24], selector);
      w[56] = hc_byte_perm_S (w[24], w[23], selector);
      w[55] = hc_byte_perm_S (w[23], w[22], selector);
      w[54] = hc_byte_perm_S (w[22], w[21], selector);
      w[53] = hc_byte_perm_S (w[21], w[20], selector);
      w[52] = hc_byte_perm_S (w[20], w[19], selector);
      w[51] = hc_byte_perm_S (w[19], w[18], selector);
      w[50] = hc_byte_perm_S (w[18], w[17], selector);
      w[49] = hc_byte_perm_S (w[17], w[16], selector);
      w[48] = hc_byte_perm_S (w[16], w[15], selector);
      w[47] = hc_byte_perm_S (w[15], w[14], selector);
      w[46] = hc_byte_perm_S (w[14], w[13], selector);
      w[45] = hc_byte_perm_S (w[13], w[12], selector);
      w[44] = hc_byte_perm_S (w[12], w[11], selector);
      w[43] = hc_byte_perm_S (w[11], w[10], selector);
      w[42] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[41] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[40] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[39] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[38] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[37] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[36] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[35] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[34] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[33] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[32] = hc_byte_perm_S (w[ 0],     0, selector);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_byte_perm_S (w[30], w[29], selector);
      w[62] = hc_byte_perm_S (w[29], w[28], selector);
      w[61] = hc_byte_perm_S (w[28], w[27], selector);
      w[60] = hc_byte_perm_S (w[27], w[26], selector);
      w[59] = hc_byte_perm_S (w[26], w[25], selector);
      w[58] = hc_byte_perm_S (w[25], w[24], selector);
      w[57] = hc_byte_perm_S (w[24], w[23], selector);
      w[56] = hc_byte_perm_S (w[23], w[22], selector);
      w[55] = hc_byte_perm_S (w[22], w[21], selector);
      w[54] = hc_byte_perm_S (w[21], w[20], selector);
      w[53] = hc_byte_perm_S (w[20], w[19], selector);
      w[52] = hc_byte_perm_S (w[19], w[18], selector);
      w[51] = hc_byte_perm_S (w[18], w[17], selector);
      w[50] = hc_byte_perm_S (w[17], w[16], selector);
      w[49] = hc_byte_perm_S (w[16], w[15], selector);
      w[48] = hc_byte_perm_S (w[15], w[14], selector);
      w[47] = hc_byte_perm_S (w[14], w[13], selector);
      w[46] = hc_byte_perm_S (w[13], w[12], selector);
      w[45] = hc_byte_perm_S (w[12], w[11], selector);
      w[44] = hc_byte_perm_S (w[11], w[10], selector);
      w[43] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[42] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[41] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[40] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[39] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[38] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[37] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[36] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[35] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[34] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[33] = hc_byte_perm_S (w[ 0],     0, selector);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_byte_perm_S (w[29], w[28], selector);
      w[62] = hc_byte_perm_S (w[28], w[27], selector);
      w[61] = hc_byte_perm_S (w[27], w[26], selector);
      w[60] = hc_byte_perm_S (w[26], w[25], selector);
      w[59] = hc_byte_perm_S (w[25], w[24], selector);
      w[58] = hc_byte_perm_S (w[24], w[23], selector);
      w[57] = hc_byte_perm_S (w[23], w[22], selector);
      w[56] = hc_byte_perm_S (w[22], w[21], selector);
      w[55] = hc_byte_perm_S (w[21], w[20], selector);
      w[54] = hc_byte_perm_S (w[20], w[19], selector);
      w[53] = hc_byte_perm_S (w[19], w[18], selector);
      w[52] = hc_byte_perm_S (w[18], w[17], selector);
      w[51] = hc_byte_perm_S (w[17], w[16], selector);
      w[50] = hc_byte_perm_S (w[16], w[15], selector);
      w[49] = hc_byte_perm_S (w[15], w[14], selector);
      w[48] = hc_byte_perm_S (w[14], w[13], selector);
      w[47] = hc_byte_perm_S (w[13], w[12], selector);
      w[46] = hc_byte_perm_S (w[12], w[11], selector);
      w[45] = hc_byte_perm_S (w[11], w[10], selector);
      w[44] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[43] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[42] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[41] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[40] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[39] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[38] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[37] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[36] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[35] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[34] = hc_byte_perm_S (w[ 0],     0, selector);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_byte_perm_S (w[28], w[27], selector);
      w[62] = hc_byte_perm_S (w[27], w[26], selector);
      w[61] = hc_byte_perm_S (w[26], w[25], selector);
      w[60] = hc_byte_perm_S (w[25], w[24], selector);
      w[59] = hc_byte_perm_S (w[24], w[23], selector);
      w[58] = hc_byte_perm_S (w[23], w[22], selector);
      w[57] = hc_byte_perm_S (w[22], w[21], selector);
      w[56] = hc_byte_perm_S (w[21], w[20], selector);
      w[55] = hc_byte_perm_S (w[20], w[19], selector);
      w[54] = hc_byte_perm_S (w[19], w[18], selector);
      w[53] = hc_byte_perm_S (w[18], w[17], selector);
      w[52] = hc_byte_perm_S (w[17], w[16], selector);
      w[51] = hc_byte_perm_S (w[16], w[15], selector);
      w[50] = hc_byte_perm_S (w[15], w[14], selector);
      w[49] = hc_byte_perm_S (w[14], w[13], selector);
      w[48] = hc_byte_perm_S (w[13], w[12], selector);
      w[47] = hc_byte_perm_S (w[12], w[11], selector);
      w[46] = hc_byte_perm_S (w[11], w[10], selector);
      w[45] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[44] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[43] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[42] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[41] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[40] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[39] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[38] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[37] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[36] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[35] = hc_byte_perm_S (w[ 0],     0, selector);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_byte_perm_S (w[27], w[26], selector);
      w[62] = hc_byte_perm_S (w[26], w[25], selector);
      w[61] = hc_byte_perm_S (w[25], w[24], selector);
      w[60] = hc_byte_perm_S (w[24], w[23], selector);
      w[59] = hc_byte_perm_S (w[23], w[22], selector);
      w[58] = hc_byte_perm_S (w[22], w[21], selector);
      w[57] = hc_byte_perm_S (w[21], w[20], selector);
      w[56] = hc_byte_perm_S (w[20], w[19], selector);
      w[55] = hc_byte_perm_S (w[19], w[18], selector);
      w[54] = hc_byte_perm_S (w[18], w[17], selector);
      w[53] = hc_byte_perm_S (w[17], w[16], selector);
      w[52] = hc_byte_perm_S (w[16], w[15], selector);
      w[51] = hc_byte_perm_S (w[15], w[14], selector);
      w[50] = hc_byte_perm_S (w[14], w[13], selector);
      w[49] = hc_byte_perm_S (w[13], w[12], selector);
      w[48] = hc_byte_perm_S (w[12], w[11], selector);
      w[47] = hc_byte_perm_S (w[11], w[10], selector);
      w[46] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[45] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[44] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[43] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[42] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[41] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[40] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[39] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[38] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[37] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[36] = hc_byte_perm_S (w[ 0],     0, selector);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_byte_perm_S (w[26], w[25], selector);
      w[62] = hc_byte_perm_S (w[25], w[24], selector);
      w[61] = hc_byte_perm_S (w[24], w[23], selector);
      w[60] = hc_byte_perm_S (w[23], w[22], selector);
      w[59] = hc_byte_perm_S (w[22], w[21], selector);
      w[58] = hc_byte_perm_S (w[21], w[20], selector);
      w[57] = hc_byte_perm_S (w[20], w[19], selector);
      w[56] = hc_byte_perm_S (w[19], w[18], selector);
      w[55] = hc_byte_perm_S (w[18], w[17], selector);
      w[54] = hc_byte_perm_S (w[17], w[16], selector);
      w[53] = hc_byte_perm_S (w[16], w[15], selector);
      w[52] = hc_byte_perm_S (w[15], w[14], selector);
      w[51] = hc_byte_perm_S (w[14], w[13], selector);
      w[50] = hc_byte_perm_S (w[13], w[12], selector);
      w[49] = hc_byte_perm_S (w[12], w[11], selector);
      w[48] = hc_byte_perm_S (w[11], w[10], selector);
      w[47] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[46] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[45] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[44] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[43] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[42] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[41] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[40] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[39] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[38] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[37] = hc_byte_perm_S (w[ 0],     0, selector);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_byte_perm_S (w[25], w[24], selector);
      w[62] = hc_byte_perm_S (w[24], w[23], selector);
      w[61] = hc_byte_perm_S (w[23], w[22], selector);
      w[60] = hc_byte_perm_S (w[22], w[21], selector);
      w[59] = hc_byte_perm_S (w[21], w[20], selector);
      w[58] = hc_byte_perm_S (w[20], w[19], selector);
      w[57] = hc_byte_perm_S (w[19], w[18], selector);
      w[56] = hc_byte_perm_S (w[18], w[17], selector);
      w[55] = hc_byte_perm_S (w[17], w[16], selector);
      w[54] = hc_byte_perm_S (w[16], w[15], selector);
      w[53] = hc_byte_perm_S (w[15], w[14], selector);
      w[52] = hc_byte_perm_S (w[14], w[13], selector);
      w[51] = hc_byte_perm_S (w[13], w[12], selector);
      w[50] = hc_byte_perm_S (w[12], w[11], selector);
      w[49] = hc_byte_perm_S (w[11], w[10], selector);
      w[48] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[47] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[46] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[45] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[44] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[43] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[42] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[41] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[40] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[39] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[38] = hc_byte_perm_S (w[ 0],     0, selector);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_byte_perm_S (w[24], w[23], selector);
      w[62] = hc_byte_perm_S (w[23], w[22], selector);
      w[61] = hc_byte_perm_S (w[22], w[21], selector);
      w[60] = hc_byte_perm_S (w[21], w[20], selector);
      w[59] = hc_byte_perm_S (w[20], w[19], selector);
      w[58] = hc_byte_perm_S (w[19], w[18], selector);
      w[57] = hc_byte_perm_S (w[18], w[17], selector);
      w[56] = hc_byte_perm_S (w[17], w[16], selector);
      w[55] = hc_byte_perm_S (w[16], w[15], selector);
      w[54] = hc_byte_perm_S (w[15], w[14], selector);
      w[53] = hc_byte_perm_S (w[14], w[13], selector);
      w[52] = hc_byte_perm_S (w[13], w[12], selector);
      w[51] = hc_byte_perm_S (w[12], w[11], selector);
      w[50] = hc_byte_perm_S (w[11], w[10], selector);
      w[49] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[48] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[47] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[46] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[45] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[44] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[43] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[42] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[41] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[40] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[39] = hc_byte_perm_S (w[ 0],     0, selector);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_byte_perm_S (w[23], w[22], selector);
      w[62] = hc_byte_perm_S (w[22], w[21], selector);
      w[61] = hc_byte_perm_S (w[21], w[20], selector);
      w[60] = hc_byte_perm_S (w[20], w[19], selector);
      w[59] = hc_byte_perm_S (w[19], w[18], selector);
      w[58] = hc_byte_perm_S (w[18], w[17], selector);
      w[57] = hc_byte_perm_S (w[17], w[16], selector);
      w[56] = hc_byte_perm_S (w[16], w[15], selector);
      w[55] = hc_byte_perm_S (w[15], w[14], selector);
      w[54] = hc_byte_perm_S (w[14], w[13], selector);
      w[53] = hc_byte_perm_S (w[13], w[12], selector);
      w[52] = hc_byte_perm_S (w[12], w[11], selector);
      w[51] = hc_byte_perm_S (w[11], w[10], selector);
      w[50] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[49] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[48] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[47] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[46] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[45] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[44] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[43] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[42] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[41] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[40] = hc_byte_perm_S (w[ 0],     0, selector);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_byte_perm_S (w[22], w[21], selector);
      w[62] = hc_byte_perm_S (w[21], w[20], selector);
      w[61] = hc_byte_perm_S (w[20], w[19], selector);
      w[60] = hc_byte_perm_S (w[19], w[18], selector);
      w[59] = hc_byte_perm_S (w[18], w[17], selector);
      w[58] = hc_byte_perm_S (w[17], w[16], selector);
      w[57] = hc_byte_perm_S (w[16], w[15], selector);
      w[56] = hc_byte_perm_S (w[15], w[14], selector);
      w[55] = hc_byte_perm_S (w[14], w[13], selector);
      w[54] = hc_byte_perm_S (w[13], w[12], selector);
      w[53] = hc_byte_perm_S (w[12], w[11], selector);
      w[52] = hc_byte_perm_S (w[11], w[10], selector);
      w[51] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[50] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[49] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[48] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[47] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[46] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[45] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[44] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[43] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[42] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[41] = hc_byte_perm_S (w[ 0],     0, selector);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_byte_perm_S (w[21], w[20], selector);
      w[62] = hc_byte_perm_S (w[20], w[19], selector);
      w[61] = hc_byte_perm_S (w[19], w[18], selector);
      w[60] = hc_byte_perm_S (w[18], w[17], selector);
      w[59] = hc_byte_perm_S (w[17], w[16], selector);
      w[58] = hc_byte_perm_S (w[16], w[15], selector);
      w[57] = hc_byte_perm_S (w[15], w[14], selector);
      w[56] = hc_byte_perm_S (w[14], w[13], selector);
      w[55] = hc_byte_perm_S (w[13], w[12], selector);
      w[54] = hc_byte_perm_S (w[12], w[11], selector);
      w[53] = hc_byte_perm_S (w[11], w[10], selector);
      w[52] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[51] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[50] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[49] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[48] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[47] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[46] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[45] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[44] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[43] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[42] = hc_byte_perm_S (w[ 0],     0, selector);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_byte_perm_S (w[20], w[19], selector);
      w[62] = hc_byte_perm_S (w[19], w[18], selector);
      w[61] = hc_byte_perm_S (w[18], w[17], selector);
      w[60] = hc_byte_perm_S (w[17], w[16], selector);
      w[59] = hc_byte_perm_S (w[16], w[15], selector);
      w[58] = hc_byte_perm_S (w[15], w[14], selector);
      w[57] = hc_byte_perm_S (w[14], w[13], selector);
      w[56] = hc_byte_perm_S (w[13], w[12], selector);
      w[55] = hc_byte_perm_S (w[12], w[11], selector);
      w[54] = hc_byte_perm_S (w[11], w[10], selector);
      w[53] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[52] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[51] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[50] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[49] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[48] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[47] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[46] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[45] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[44] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[43] = hc_byte_perm_S (w[ 0],     0, selector);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_byte_perm_S (w[19], w[18], selector);
      w[62] = hc_byte_perm_S (w[18], w[17], selector);
      w[61] = hc_byte_perm_S (w[17], w[16], selector);
      w[60] = hc_byte_perm_S (w[16], w[15], selector);
      w[59] = hc_byte_perm_S (w[15], w[14], selector);
      w[58] = hc_byte_perm_S (w[14], w[13], selector);
      w[57] = hc_byte_perm_S (w[13], w[12], selector);
      w[56] = hc_byte_perm_S (w[12], w[11], selector);
      w[55] = hc_byte_perm_S (w[11], w[10], selector);
      w[54] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[53] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[52] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[51] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[50] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[49] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[48] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[47] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[46] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[45] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[44] = hc_byte_perm_S (w[ 0],     0, selector);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_byte_perm_S (w[18], w[17], selector);
      w[62] = hc_byte_perm_S (w[17], w[16], selector);
      w[61] = hc_byte_perm_S (w[16], w[15], selector);
      w[60] = hc_byte_perm_S (w[15], w[14], selector);
      w[59] = hc_byte_perm_S (w[14], w[13], selector);
      w[58] = hc_byte_perm_S (w[13], w[12], selector);
      w[57] = hc_byte_perm_S (w[12], w[11], selector);
      w[56] = hc_byte_perm_S (w[11], w[10], selector);
      w[55] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[54] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[53] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[52] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[51] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[50] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[49] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[48] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[47] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[46] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[45] = hc_byte_perm_S (w[ 0],     0, selector);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_byte_perm_S (w[17], w[16], selector);
      w[62] = hc_byte_perm_S (w[16], w[15], selector);
      w[61] = hc_byte_perm_S (w[15], w[14], selector);
      w[60] = hc_byte_perm_S (w[14], w[13], selector);
      w[59] = hc_byte_perm_S (w[13], w[12], selector);
      w[58] = hc_byte_perm_S (w[12], w[11], selector);
      w[57] = hc_byte_perm_S (w[11], w[10], selector);
      w[56] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[55] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[54] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[53] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[52] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[51] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[50] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[49] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[48] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[47] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[46] = hc_byte_perm_S (w[ 0],     0, selector);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_byte_perm_S (w[16], w[15], selector);
      w[62] = hc_byte_perm_S (w[15], w[14], selector);
      w[61] = hc_byte_perm_S (w[14], w[13], selector);
      w[60] = hc_byte_perm_S (w[13], w[12], selector);
      w[59] = hc_byte_perm_S (w[12], w[11], selector);
      w[58] = hc_byte_perm_S (w[11], w[10], selector);
      w[57] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[56] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[55] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[54] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[53] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[52] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[51] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[50] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[49] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[48] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[47] = hc_byte_perm_S (w[ 0],     0, selector);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_byte_perm_S (w[15], w[14], selector);
      w[62] = hc_byte_perm_S (w[14], w[13], selector);
      w[61] = hc_byte_perm_S (w[13], w[12], selector);
      w[60] = hc_byte_perm_S (w[12], w[11], selector);
      w[59] = hc_byte_perm_S (w[11], w[10], selector);
      w[58] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[57] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[56] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[55] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[54] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[53] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[52] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[51] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[50] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[49] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[48] = hc_byte_perm_S (w[ 0],     0, selector);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_byte_perm_S (w[14], w[13], selector);
      w[62] = hc_byte_perm_S (w[13], w[12], selector);
      w[61] = hc_byte_perm_S (w[12], w[11], selector);
      w[60] = hc_byte_perm_S (w[11], w[10], selector);
      w[59] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[58] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[57] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[56] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[55] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[54] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[53] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[52] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[51] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[50] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[49] = hc_byte_perm_S (w[ 0],     0, selector);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_byte_perm_S (w[13], w[12], selector);
      w[62] = hc_byte_perm_S (w[12], w[11], selector);
      w[61] = hc_byte_perm_S (w[11], w[10], selector);
      w[60] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[59] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[58] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[57] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[56] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[55] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[54] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[53] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[52] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[51] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[50] = hc_byte_perm_S (w[ 0],     0, selector);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_byte_perm_S (w[12], w[11], selector);
      w[62] = hc_byte_perm_S (w[11], w[10], selector);
      w[61] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[60] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[59] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[58] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[57] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[56] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[55] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[54] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[53] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[52] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[51] = hc_byte_perm_S (w[ 0],     0, selector);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_byte_perm_S (w[11], w[10], selector);
      w[62] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[61] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[60] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[59] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[58] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[57] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[56] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[55] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[54] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[53] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[52] = hc_byte_perm_S (w[ 0],     0, selector);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_byte_perm_S (w[10], w[ 9], selector);
      w[62] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[61] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[60] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[59] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[58] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[57] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[56] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[55] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[54] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[53] = hc_byte_perm_S (w[ 0],     0, selector);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_byte_perm_S (w[ 9], w[ 8], selector);
      w[62] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[61] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[60] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[59] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[58] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[57] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[56] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[55] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[54] = hc_byte_perm_S (w[ 0],     0, selector);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_byte_perm_S (w[ 8], w[ 7], selector);
      w[62] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[61] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[60] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[59] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[58] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[57] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[56] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[55] = hc_byte_perm_S (w[ 0],     0, selector);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_byte_perm_S (w[ 7], w[ 6], selector);
      w[62] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[61] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[60] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[59] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[58] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[57] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[56] = hc_byte_perm_S (w[ 0],     0, selector);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_byte_perm_S (w[ 6], w[ 5], selector);
      w[62] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[61] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[60] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[59] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[58] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[57] = hc_byte_perm_S (w[ 0],     0, selector);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_byte_perm_S (w[ 5], w[ 4], selector);
      w[62] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[61] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[60] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[59] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[58] = hc_byte_perm_S (w[ 0],     0, selector);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_byte_perm_S (w[ 4], w[ 3], selector);
      w[62] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[61] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[60] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[59] = hc_byte_perm_S (w[ 0],     0, selector);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_byte_perm_S (w[ 3], w[ 2], selector);
      w[62] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[61] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[60] = hc_byte_perm_S (w[ 0],     0, selector);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_byte_perm_S (w[ 2], w[ 1], selector);
      w[62] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[61] = hc_byte_perm_S (w[ 0],     0, selector);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_byte_perm_S (w[ 1], w[ 0], selector);
      w[62] = hc_byte_perm_S (w[ 0],     0, selector);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_byte_perm_S (w[ 0],     0, selector);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif
}

/**
 * vector functions on scalar types (for inner loop usage)
 */

/*
 * PACKVS2: copy component .s<e> of vn[0] and vn[1] into sn[0] and sn[1].
 *
 *   sn  destination, indexable with [0..1]
 *   vn  source, indexable with [0..1]; each element must have a .s<e> member
 *   e   component suffix pasted after ".s" (e.g. 0 selects .s0)
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon. sn and vn are
 * parenthesized so that pointer expressions can be passed.
 */
#define PACKVS2(sn,vn,e)    \
  do                        \
  {                         \
    (sn)[0] = (vn)[0].s##e; \
    (sn)[1] = (vn)[1].s##e; \
  } while (0)

/*
 * PACKSV2: store sn[0] and sn[1] into component .s<e> of vn[0] and vn[1]
 * (the inverse direction of PACKVS2).
 *
 *   sn  source, indexable with [0..1]
 *   vn  destination, indexable with [0..1]; each element must have a .s<e> member
 *   e   component suffix pasted after ".s" (e.g. 0 selects .s0)
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon. sn and vn are
 * parenthesized so that pointer expressions can be passed.
 */
#define PACKSV2(sn,vn,e)    \
  do                        \
  {                         \
    (vn)[0].s##e = (sn)[0]; \
    (vn)[1].s##e = (sn)[1]; \
  } while (0)

/*
 * PACKVS24: apply PACKVS4 to two buffer pairs, (s0 <- v0) and (s1 <- v1),
 * extracting component .s<e> from four elements of each vN.
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon.
 */
#define PACKVS24(s0,s1,v0,v1,e) \
  do                            \
  {                             \
    PACKVS4 ((s0), (v0), e);    \
    PACKVS4 ((s1), (v1), e);    \
  } while (0)

/*
 * PACKSV24: apply PACKSV4 to two buffer pairs, (v0 <- s0) and (v1 <- s1),
 * storing into component .s<e> of four elements of each vN.
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon.
 */
#define PACKSV24(s0,s1,v0,v1,e) \
  do                            \
  {                             \
    PACKSV4 ((s0), (v0), e);    \
    PACKSV4 ((s1), (v1), e);    \
  } while (0)

/*
 * PACKVS4: copy component .s<e> of vn[0..3] into sn[0..3].
 *
 *   sn  destination, indexable with [0..3]
 *   vn  source, indexable with [0..3]; each element must have a .s<e> member
 *   e   component suffix pasted after ".s" (e.g. 0 selects .s0)
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon. sn and vn are
 * parenthesized so that pointer expressions can be passed.
 */
#define PACKVS4(sn,vn,e)    \
  do                        \
  {                         \
    (sn)[0] = (vn)[0].s##e; \
    (sn)[1] = (vn)[1].s##e; \
    (sn)[2] = (vn)[2].s##e; \
    (sn)[3] = (vn)[3].s##e; \
  } while (0)

/*
 * PACKSV4: store sn[0..3] into component .s<e> of vn[0..3]
 * (the inverse direction of PACKVS4).
 *
 *   sn  source, indexable with [0..3]
 *   vn  destination, indexable with [0..3]; each element must have a .s<e> member
 *   e   component suffix pasted after ".s" (e.g. 0 selects .s0)
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon. sn and vn are
 * parenthesized so that pointer expressions can be passed.
 */
#define PACKSV4(sn,vn,e)    \
  do                        \
  {                         \
    (vn)[0].s##e = (sn)[0]; \
    (vn)[1].s##e = (sn)[1]; \
    (vn)[2].s##e = (sn)[2]; \
    (vn)[3].s##e = (sn)[3]; \
  } while (0)

/*
 * PACKVS44: apply PACKVS4 to four buffer pairs (sN <- vN, N = 0..3),
 * extracting component .s<e> from four elements of each vN.
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon.
 */
#define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                        \
  {                                         \
    PACKVS4 ((s0), (v0), e);                \
    PACKVS4 ((s1), (v1), e);                \
    PACKVS4 ((s2), (v2), e);                \
    PACKVS4 ((s3), (v3), e);                \
  } while (0)

/*
 * PACKSV44: apply PACKSV4 to four buffer pairs (vN <- sN, N = 0..3),
 * storing into component .s<e> of four elements of each vN.
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon.
 */
#define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                        \
  {                                         \
    PACKSV4 ((s0), (v0), e);                \
    PACKSV4 ((s1), (v1), e);                \
    PACKSV4 ((s2), (v2), e);                \
    PACKSV4 ((s3), (v3), e);                \
  } while (0)

/*
 * PACKVS84: apply PACKVS4 to eight buffer pairs (sN <- vN, N = 0..7),
 * extracting component .s<e> from four elements of each vN.
 *
 * Expands to a single statement, so it is safe under an unbraced if/else or
 * loop; the caller supplies the trailing semicolon.
 */
#define PACKVS84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \
  do                                                                \
  {                                                                 \
    PACKVS4 ((s0), (v0), e);                                        \
    PACKVS4 ((s1), (v1), e);                                        \
    PACKVS4 ((s2), (v2), e);                                        \
    PACKVS4 ((s3), (v3), e);                                        \
    PACKVS4 ((s4), (v4), e);                                        \
    PACKVS4 ((s5), (v5), e);                                        \
    PACKVS4 ((s6), (v6), e);                                        \
    PACKVS4 ((s7), (v7), e);                                        \
  } while (0)

#define PACKSV84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \
  PACKSV4 (s0, v0, e);                                              \
  PACKSV4 (s1, v1, e);                                              \
  PACKSV4 (s2, v2, e);                                              \
  PACKSV4 (s3, v3, e);                                              \
  PACKSV4 (s4, v4, e);                                              \
  PACKSV4 (s5, v5, e);                                              \
  PACKSV4 (s6, v6, e);                                              \
  PACKSV4 (s7, v7, e);

/**
 * Vector front end for switch_buffer_by_offset_le_S ().
 *
 * VECT_SIZE == 1: the arguments are forwarded to the scalar routine as is.
 * VECT_SIZE == 2, 4, 8, 16: for each lane, in ascending order, that lane of
 * w0..w3 is copied into scalar scratch arrays (PACKVS44), the scalar routine
 * is run on the scratch copy with the same lane of `offset`, and the result
 * is written back into the lane (PACKSV44).
 * Any other VECT_SIZE: the buffers are left untouched.
 */
DECLSPEC void switch_buffer_by_offset_le_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32x offset)
{
  #if VECT_SIZE == 1

  switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset);

  #else

  // scalar scratch copy of one lane of w0..w3
  u32 x0[4];
  u32 x1[4];
  u32 x2[4];
  u32 x3[4];

  // unpack lane n, process it, pack it back (local helper, undefined below)
  #define SWITCH_BUFFER_LE_VV_LANE(n)                             \
    PACKVS44 (x0, x1, x2, x3, w0, w1, w2, w3, n);                 \
    switch_buffer_by_offset_le_S (x0, x1, x2, x3, offset.s##n);   \
    PACKSV44 (x0, x1, x2, x3, w0, w1, w2, w3, n)

  #if VECT_SIZE == 2 || VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  SWITCH_BUFFER_LE_VV_LANE (0);
  SWITCH_BUFFER_LE_VV_LANE (1);

  #endif

  #if VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  SWITCH_BUFFER_LE_VV_LANE (2);
  SWITCH_BUFFER_LE_VV_LANE (3);

  #endif

  #if VECT_SIZE == 8 || VECT_SIZE == 16

  SWITCH_BUFFER_LE_VV_LANE (4);
  SWITCH_BUFFER_LE_VV_LANE (5);
  SWITCH_BUFFER_LE_VV_LANE (6);
  SWITCH_BUFFER_LE_VV_LANE (7);

  #endif

  #if VECT_SIZE == 16

  SWITCH_BUFFER_LE_VV_LANE (8);
  SWITCH_BUFFER_LE_VV_LANE (9);
  SWITCH_BUFFER_LE_VV_LANE (a);
  SWITCH_BUFFER_LE_VV_LANE (b);
  SWITCH_BUFFER_LE_VV_LANE (c);
  SWITCH_BUFFER_LE_VV_LANE (d);
  SWITCH_BUFFER_LE_VV_LANE (e);
  SWITCH_BUFFER_LE_VV_LANE (f);

  #endif

  #undef SWITCH_BUFFER_LE_VV_LANE

  #endif
}

/**
 * Vector front end for switch_buffer_by_offset_8x4_le_S ().
 *
 * VECT_SIZE == 1: the arguments are forwarded to the scalar routine as is.
 * VECT_SIZE == 2, 4, 8, 16: for each lane, in ascending order, that lane of
 * w0..w7 is copied into scalar scratch arrays (PACKVS84), the scalar routine
 * is run on the scratch copy with the same lane of `offset`, and the result
 * is written back into the lane (PACKSV84).
 * Any other VECT_SIZE: the buffers are left untouched.
 */
DECLSPEC void switch_buffer_by_offset_8x4_le_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32x offset)
{
  #if VECT_SIZE == 1

  switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, offset);

  #else

  // scalar scratch copy of one lane of w0..w7
  u32 x0[4];
  u32 x1[4];
  u32 x2[4];
  u32 x3[4];
  u32 x4[4];
  u32 x5[4];
  u32 x6[4];
  u32 x7[4];

  // unpack lane n, process it, pack it back (local helper, undefined below)
  #define SWITCH_BUFFER_8X4_LE_VV_LANE(n)                                               \
    PACKVS84 (x0, x1, x2, x3, x4, x5, x6, x7, w0, w1, w2, w3, w4, w5, w6, w7, n);       \
    switch_buffer_by_offset_8x4_le_S (x0, x1, x2, x3, x4, x5, x6, x7, offset.s##n);     \
    PACKSV84 (x0, x1, x2, x3, x4, x5, x6, x7, w0, w1, w2, w3, w4, w5, w6, w7, n)

  #if VECT_SIZE == 2 || VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  SWITCH_BUFFER_8X4_LE_VV_LANE (0);
  SWITCH_BUFFER_8X4_LE_VV_LANE (1);

  #endif

  #if VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  SWITCH_BUFFER_8X4_LE_VV_LANE (2);
  SWITCH_BUFFER_8X4_LE_VV_LANE (3);

  #endif

  #if VECT_SIZE == 8 || VECT_SIZE == 16

  SWITCH_BUFFER_8X4_LE_VV_LANE (4);
  SWITCH_BUFFER_8X4_LE_VV_LANE (5);
  SWITCH_BUFFER_8X4_LE_VV_LANE (6);
  SWITCH_BUFFER_8X4_LE_VV_LANE (7);

  #endif

  #if VECT_SIZE == 16

  SWITCH_BUFFER_8X4_LE_VV_LANE (8);
  SWITCH_BUFFER_8X4_LE_VV_LANE (9);
  SWITCH_BUFFER_8X4_LE_VV_LANE (a);
  SWITCH_BUFFER_8X4_LE_VV_LANE (b);
  SWITCH_BUFFER_8X4_LE_VV_LANE (c);
  SWITCH_BUFFER_8X4_LE_VV_LANE (d);
  SWITCH_BUFFER_8X4_LE_VV_LANE (e);
  SWITCH_BUFFER_8X4_LE_VV_LANE (f);

  #endif

  #undef SWITCH_BUFFER_8X4_LE_VV_LANE

  #endif
}

/**
 * Vector front end for append_0x01_2x4_S ().
 *
 * VECT_SIZE == 1: the arguments are forwarded to the scalar routine as is.
 * VECT_SIZE == 2, 4, 8, 16: for each lane, in ascending order, that lane of
 * w0..w1 is copied into scalar scratch arrays (PACKVS24), the scalar routine
 * is run on the scratch copy with the same lane of `offset`, and the result
 * is written back into the lane (PACKSV24).
 * Any other VECT_SIZE: the buffers are left untouched.
 */
DECLSPEC void append_0x01_2x4_VV (u32x *w0, u32x *w1, const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x01_2x4_S (w0, w1, offset);

  #else

  // scalar scratch copy of one lane of w0..w1
  u32 x0[4];
  u32 x1[4];

  // unpack lane n, process it, pack it back (local helper, undefined below)
  #define APPEND_0X01_2X4_VV_LANE(n)                \
    PACKVS24 (x0, x1, w0, w1, n);                   \
    append_0x01_2x4_S (x0, x1, offset.s##n);        \
    PACKSV24 (x0, x1, w0, w1, n)

  #if VECT_SIZE == 2 || VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X01_2X4_VV_LANE (0);
  APPEND_0X01_2X4_VV_LANE (1);

  #endif

  #if VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X01_2X4_VV_LANE (2);
  APPEND_0X01_2X4_VV_LANE (3);

  #endif

  #if VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X01_2X4_VV_LANE (4);
  APPEND_0X01_2X4_VV_LANE (5);
  APPEND_0X01_2X4_VV_LANE (6);
  APPEND_0X01_2X4_VV_LANE (7);

  #endif

  #if VECT_SIZE == 16

  APPEND_0X01_2X4_VV_LANE (8);
  APPEND_0X01_2X4_VV_LANE (9);
  APPEND_0X01_2X4_VV_LANE (a);
  APPEND_0X01_2X4_VV_LANE (b);
  APPEND_0X01_2X4_VV_LANE (c);
  APPEND_0X01_2X4_VV_LANE (d);
  APPEND_0X01_2X4_VV_LANE (e);
  APPEND_0X01_2X4_VV_LANE (f);

  #endif

  #undef APPEND_0X01_2X4_VV_LANE

  #endif
}

/**
 * Vector front end for append_0x01_4x4_S ().
 *
 * VECT_SIZE == 1: the arguments are forwarded to the scalar routine as is.
 * VECT_SIZE == 2, 4, 8, 16: for each lane, in ascending order, that lane of
 * w0..w3 is copied into scalar scratch arrays (PACKVS44), the scalar routine
 * is run on the scratch copy with the same lane of `offset`, and the result
 * is written back into the lane (PACKSV44).
 * Any other VECT_SIZE: the buffers are left untouched.
 */
DECLSPEC void append_0x01_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x01_4x4_S (w0, w1, w2, w3, offset);

  #else

  // scalar scratch copy of one lane of w0..w3
  u32 x0[4];
  u32 x1[4];
  u32 x2[4];
  u32 x3[4];

  // unpack lane n, process it, pack it back (local helper, undefined below)
  #define APPEND_0X01_4X4_VV_LANE(n)                    \
    PACKVS44 (x0, x1, x2, x3, w0, w1, w2, w3, n);       \
    append_0x01_4x4_S (x0, x1, x2, x3, offset.s##n);    \
    PACKSV44 (x0, x1, x2, x3, w0, w1, w2, w3, n)

  #if VECT_SIZE == 2 || VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X01_4X4_VV_LANE (0);
  APPEND_0X01_4X4_VV_LANE (1);

  #endif

  #if VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X01_4X4_VV_LANE (2);
  APPEND_0X01_4X4_VV_LANE (3);

  #endif

  #if VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X01_4X4_VV_LANE (4);
  APPEND_0X01_4X4_VV_LANE (5);
  APPEND_0X01_4X4_VV_LANE (6);
  APPEND_0X01_4X4_VV_LANE (7);

  #endif

  #if VECT_SIZE == 16

  APPEND_0X01_4X4_VV_LANE (8);
  APPEND_0X01_4X4_VV_LANE (9);
  APPEND_0X01_4X4_VV_LANE (a);
  APPEND_0X01_4X4_VV_LANE (b);
  APPEND_0X01_4X4_VV_LANE (c);
  APPEND_0X01_4X4_VV_LANE (d);
  APPEND_0X01_4X4_VV_LANE (e);
  APPEND_0X01_4X4_VV_LANE (f);

  #endif

  #undef APPEND_0X01_4X4_VV_LANE

  #endif
}

/**
 * Vector front end for append_0x06_2x4_S ().
 *
 * VECT_SIZE == 1: the arguments are forwarded to the scalar routine as is.
 * VECT_SIZE == 2, 4, 8, 16: for each lane, in ascending order, that lane of
 * w0..w1 is copied into scalar scratch arrays (PACKVS24), the scalar routine
 * is run on the scratch copy with the same lane of `offset`, and the result
 * is written back into the lane (PACKSV24).
 * Any other VECT_SIZE: the buffers are left untouched.
 */
DECLSPEC void append_0x06_2x4_VV (u32x *w0, u32x *w1, const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x06_2x4_S (w0, w1, offset);

  #else

  // scalar scratch copy of one lane of w0..w1
  u32 x0[4];
  u32 x1[4];

  // unpack lane n, process it, pack it back (local helper, undefined below)
  #define APPEND_0X06_2X4_VV_LANE(n)                \
    PACKVS24 (x0, x1, w0, w1, n);                   \
    append_0x06_2x4_S (x0, x1, offset.s##n);        \
    PACKSV24 (x0, x1, w0, w1, n)

  #if VECT_SIZE == 2 || VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X06_2X4_VV_LANE (0);
  APPEND_0X06_2X4_VV_LANE (1);

  #endif

  #if VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X06_2X4_VV_LANE (2);
  APPEND_0X06_2X4_VV_LANE (3);

  #endif

  #if VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X06_2X4_VV_LANE (4);
  APPEND_0X06_2X4_VV_LANE (5);
  APPEND_0X06_2X4_VV_LANE (6);
  APPEND_0X06_2X4_VV_LANE (7);

  #endif

  #if VECT_SIZE == 16

  APPEND_0X06_2X4_VV_LANE (8);
  APPEND_0X06_2X4_VV_LANE (9);
  APPEND_0X06_2X4_VV_LANE (a);
  APPEND_0X06_2X4_VV_LANE (b);
  APPEND_0X06_2X4_VV_LANE (c);
  APPEND_0X06_2X4_VV_LANE (d);
  APPEND_0X06_2X4_VV_LANE (e);
  APPEND_0X06_2X4_VV_LANE (f);

  #endif

  #undef APPEND_0X06_2X4_VV_LANE

  #endif
}

/**
 * Vector front end for append_0x80_2x4_S ().
 *
 * VECT_SIZE == 1: the arguments are forwarded to the scalar routine as is.
 * VECT_SIZE == 2, 4, 8, 16: for each lane, in ascending order, that lane of
 * w0..w1 is copied into scalar scratch arrays (PACKVS24), the scalar routine
 * is run on the scratch copy with the same lane of `offset`, and the result
 * is written back into the lane (PACKSV24).
 * Any other VECT_SIZE: the buffers are left untouched.
 */
DECLSPEC void append_0x80_2x4_VV (u32x *w0, u32x *w1, const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x80_2x4_S (w0, w1, offset);

  #else

  // scalar scratch copy of one lane of w0..w1
  u32 x0[4];
  u32 x1[4];

  // unpack lane n, process it, pack it back (local helper, undefined below)
  #define APPEND_0X80_2X4_VV_LANE(n)                \
    PACKVS24 (x0, x1, w0, w1, n);                   \
    append_0x80_2x4_S (x0, x1, offset.s##n);        \
    PACKSV24 (x0, x1, w0, w1, n)

  #if VECT_SIZE == 2 || VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X80_2X4_VV_LANE (0);
  APPEND_0X80_2X4_VV_LANE (1);

  #endif

  #if VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X80_2X4_VV_LANE (2);
  APPEND_0X80_2X4_VV_LANE (3);

  #endif

  #if VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X80_2X4_VV_LANE (4);
  APPEND_0X80_2X4_VV_LANE (5);
  APPEND_0X80_2X4_VV_LANE (6);
  APPEND_0X80_2X4_VV_LANE (7);

  #endif

  #if VECT_SIZE == 16

  APPEND_0X80_2X4_VV_LANE (8);
  APPEND_0X80_2X4_VV_LANE (9);
  APPEND_0X80_2X4_VV_LANE (a);
  APPEND_0X80_2X4_VV_LANE (b);
  APPEND_0X80_2X4_VV_LANE (c);
  APPEND_0X80_2X4_VV_LANE (d);
  APPEND_0X80_2X4_VV_LANE (e);
  APPEND_0X80_2X4_VV_LANE (f);

  #endif

  #undef APPEND_0X80_2X4_VV_LANE

  #endif
}

/**
 * Vector front end for append_0x80_4x4_S ().
 *
 * VECT_SIZE == 1: the arguments are forwarded to the scalar routine as is.
 * VECT_SIZE == 2, 4, 8, 16: for each lane, in ascending order, that lane of
 * w0..w3 is copied into scalar scratch arrays (PACKVS44), the scalar routine
 * is run on the scratch copy with the same lane of `offset`, and the result
 * is written back into the lane (PACKSV44).
 * Any other VECT_SIZE: the buffers are left untouched.
 */
DECLSPEC void append_0x80_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x80_4x4_S (w0, w1, w2, w3, offset);

  #else

  // scalar scratch copy of one lane of w0..w3
  u32 x0[4];
  u32 x1[4];
  u32 x2[4];
  u32 x3[4];

  // unpack lane n, process it, pack it back (local helper, undefined below)
  #define APPEND_0X80_4X4_VV_LANE(n)                    \
    PACKVS44 (x0, x1, x2, x3, w0, w1, w2, w3, n);       \
    append_0x80_4x4_S (x0, x1, x2, x3, offset.s##n);    \
    PACKSV44 (x0, x1, x2, x3, w0, w1, w2, w3, n)

  #if VECT_SIZE == 2 || VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X80_4X4_VV_LANE (0);
  APPEND_0X80_4X4_VV_LANE (1);

  #endif

  #if VECT_SIZE == 4 || VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X80_4X4_VV_LANE (2);
  APPEND_0X80_4X4_VV_LANE (3);

  #endif

  #if VECT_SIZE == 8 || VECT_SIZE == 16

  APPEND_0X80_4X4_VV_LANE (4);
  APPEND_0X80_4X4_VV_LANE (5);
  APPEND_0X80_4X4_VV_LANE (6);
  APPEND_0X80_4X4_VV_LANE (7);

  #endif

  #if VECT_SIZE == 16

  APPEND_0X80_4X4_VV_LANE (8);
  APPEND_0X80_4X4_VV_LANE (9);
  APPEND_0X80_4X4_VV_LANE (a);
  APPEND_0X80_4X4_VV_LANE (b);
  APPEND_0X80_4X4_VV_LANE (c);
  APPEND_0X80_4X4_VV_LANE (d);
  APPEND_0X80_4X4_VV_LANE (e);
  APPEND_0X80_4X4_VV_LANE (f);

  #endif

  #undef APPEND_0X80_4X4_VV_LANE

  #endif
}

/**
 * Expand one entry of the compressed candidate buffer into a pw_t.
 *
 *   pws_idx  : index records (off / cnt / len), addressed by gid
 *   pws_comp : packed u32 words the index records point into
 *   pw       : destination; words 0..63 of pw->i are rewritten and pw_len is set
 *   gid      : which index record to expand
 */
DECLSPEC void gpu_decompress_entry (__global pw_idx_t *pws_idx, __global u32 *pws_comp, pw_t *pw, const u64 gid)
{
  const u32 src_off   = pws_idx[gid].off;
  const u32 src_words = pws_idx[gid].cnt;
  const u32 src_len   = pws_idx[gid].len;

  // wipe the word buffer first, so every word past the copied ones reads as zero

  #pragma unroll
  for (u32 w = 0; w < 64; w += 1)
  {
    pw->i[w] = 0;
  }

  // pull in src_words consecutive words, starting at src_off

  for (u32 w = 0; w < src_words; w += 1)
  {
    pw->i[w] = pws_comp[src_off + w];
  }

  pw->pw_len = src_len;
}

/**
 * Kernel: each work-item expands the compressed entry at its global id
 * and stores the result in pws_buf at the same index.
 * Work-items whose global id is >= gid_max do nothing.
 */
__kernel void gpu_decompress (__global pw_idx_t * restrict pws_idx, __global u32 * restrict pws_comp, __global pw_t * restrict pws_buf, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid < gid_max)
  {
    // build the entry in a private pw_t, then write it out in one struct store

    pw_t expanded;

    gpu_decompress_entry (pws_idx, pws_comp, &expanded, gid);

    pws_buf[gid] = expanded;
  }
}

/**
 * Kernel: each work-item below gid_max stores one uint4 built from `value`
 * into buf at its global id.
 *
 * buf is indexed in uint4 units, so gid_max counts uint4 elements, not bytes.
 */
__kernel void gpu_memset (__global uint4 * restrict buf, const u32 value, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid < gid_max)
  {
    // scalar-to-vector cast: all four components take `value`

    const uint4 fill = (uint4) (value);

    buf[gid] = fill;
  }
}

/**
 * Kernel: fill buf[gid] with a synthetic pw_t that depends only on the
 * work-item's global id.
 *
 *   - word 0 = 0x5c5c5c5c ^ l32, word 1 = 0x36363636 ^ h32
 *   - words 2..63 = 0
 *   - pw_len = 7 (fixed)
 *
 * Work-items whose global id is >= gid_max do nothing.
 *
 * NOTE(review): the name suggests this seeds buffers for autotuning - confirm
 * against the host-side caller.
 */
__kernel void gpu_atinit (__global pw_t * restrict buf, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  // presumably the low / high 32-bit halves of gid - helpers are defined elsewhere, verify there

  const u32 l32 = l32_from_64_S (gid);
  const u32 h32 = h32_from_64_S (gid);

  pw_t pw;

  // only the first two words carry gid-dependent data, so entries differ per work-item

  pw.i[ 0] = 0x5c5c5c5c ^ l32;
  pw.i[ 1] = 0x36363636 ^ h32;
  pw.i[ 2] = 0;
  pw.i[ 3] = 0;
  pw.i[ 4] = 0;
  pw.i[ 5] = 0;
  pw.i[ 6] = 0;
  pw.i[ 7] = 0;
  pw.i[ 8] = 0;
  pw.i[ 9] = 0;
  pw.i[10] = 0;
  pw.i[11] = 0;
  pw.i[12] = 0;
  pw.i[13] = 0;
  pw.i[14] = 0;
  pw.i[15] = 0;
  pw.i[16] = 0;
  pw.i[17] = 0;
  pw.i[18] = 0;
  pw.i[19] = 0;
  pw.i[20] = 0;
  pw.i[21] = 0;
  pw.i[22] = 0;
  pw.i[23] = 0;
  pw.i[24] = 0;
  pw.i[25] = 0;
  pw.i[26] = 0;
  pw.i[27] = 0;
  pw.i[28] = 0;
  pw.i[29] = 0;
  pw.i[30] = 0;
  pw.i[31] = 0;
  pw.i[32] = 0;
  pw.i[33] = 0;
  pw.i[34] = 0;
  pw.i[35] = 0;
  pw.i[36] = 0;
  pw.i[37] = 0;
  pw.i[38] = 0;
  pw.i[39] = 0;
  pw.i[40] = 0;
  pw.i[41] = 0;
  pw.i[42] = 0;
  pw.i[43] = 0;
  pw.i[44] = 0;
  pw.i[45] = 0;
  pw.i[46] = 0;
  pw.i[47] = 0;
  pw.i[48] = 0;
  pw.i[49] = 0;
  pw.i[50] = 0;
  pw.i[51] = 0;
  pw.i[52] = 0;
  pw.i[53] = 0;
  pw.i[54] = 0;
  pw.i[55] = 0;
  pw.i[56] = 0;
  pw.i[57] = 0;
  pw.i[58] = 0;
  pw.i[59] = 0;
  pw.i[60] = 0;
  pw.i[61] = 0;
  pw.i[62] = 0;
  pw.i[63] = 0; // the stores are written out one by one on purpose: yep that's faster (than a loop, per the original author)

  // a gid-dependent length was tried and left disabled in favour of the fixed length below:
  //pw.pw_len = 1 + (l32 & 15);
  pw.pw_len = 7; // some algorithms are very sensitive to this (example: 12500)

  // single struct store of the assembled entry

  buf[gid] = pw;
}