From 3a23b275e5308677957388684dceb4311cae6bac Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 17 Feb 2018 14:24:29 +0100 Subject: [PATCH] Improved c_append_helper[] handling --- OpenCL/inc_common.cl | 1630 +++++------------------------------- OpenCL/inc_rp_optimized.cl | 85 +- 2 files changed, 259 insertions(+), 1456 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 58f4c11ea..ce5978024 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -3,72 +3,24 @@ * License.....: MIT */ -__constant u32a c_append_helper[64][16] = +__constant u32a c_append_helper_mini[16][4] = { - { 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000, 0x00000000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000ff }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ff00 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00ff0000 }, - { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff000000 }, + { 0x000000ff, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000ff00, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00ff0000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xff000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0x000000ff, 0x00000000, 0x00000000 }, + { 0x00000000, 0x0000ff00, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00ff0000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xff000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0x000000ff, 0x00000000 }, + { 0x00000000, 0x00000000, 0x0000ff00, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00ff0000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xff000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0x000000ff }, + { 0x00000000, 0x00000000, 0x00000000, 0x0000ff00 }, + { 0x00000000, 0x00000000, 0x00000000, 0x00ff0000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xff000000 }, }; /** @@ -444,839 +396,124 @@ DECLSPEC void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) #endif } +DECLSPEC void append_helper_1x4 (u32x r[4], const u32 v, const u32 m[4]) +{ + r[0] |= v & m[0]; + r[1] |= v & m[1]; + r[2] |= v & m[2]; + r[3] |= v & m[3]; +} + DECLSPEC void append_0x80_1x4 (u32x w0[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][0]; - w0[1] |= 0x80808080 & c_append_helper[offset][1]; - w0[2] |= 0x80808080 & c_append_helper[offset][2]; - w0[3] |= 0x80808080 & c_append_helper[offset][3]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + append_helper_1x4 (w0, 0x80808080, v); } DECLSPEC void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][0]; - w0[1] |= 0x80808080 & c_append_helper[offset][1]; - w0[2] |= 0x80808080 & c_append_helper[offset][2]; - w0[3] |= 0x80808080 & c_append_helper[offset][3]; - w1[0] |= 0x80808080 & c_append_helper[offset][4]; - w1[1] |= 0x80808080 & c_append_helper[offset][5]; - w1[2] |= 0x80808080 & c_append_helper[offset][6]; - w1[3] |= 0x80808080 & c_append_helper[offset][7]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4 (w0, 0x80808080, v); break; + case 1: append_helper_1x4 (w1, 0x80808080, v); break; + } } DECLSPEC void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][ 0]; - w0[1] |= 0x80808080 & c_append_helper[offset][ 1]; - w0[2] |= 0x80808080 & c_append_helper[offset][ 2]; - w0[3] |= 0x80808080 & c_append_helper[offset][ 3]; - w1[0] |= 0x80808080 & c_append_helper[offset][ 4]; - w1[1] |= 0x80808080 & c_append_helper[offset][ 5]; - w1[2] |= 0x80808080 & c_append_helper[offset][ 6]; - w1[3] |= 0x80808080 & c_append_helper[offset][ 7]; - w2[0] |= 0x80808080 & c_append_helper[offset][ 8]; - w2[1] |= 0x80808080 & c_append_helper[offset][ 9]; - w2[2] |= 0x80808080 & c_append_helper[offset][10]; - w2[3] |= 0x80808080 & c_append_helper[offset][11]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4 (w0, 0x80808080, v); break; + case 1: append_helper_1x4 (w1, 0x80808080, v); break; + case 2: append_helper_1x4 (w2, 0x80808080, v); break; + } } DECLSPEC void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][ 0]; - w0[1] |= 0x80808080 & c_append_helper[offset][ 1]; - w0[2] |= 0x80808080 & c_append_helper[offset][ 2]; - w0[3] |= 0x80808080 & c_append_helper[offset][ 3]; - w1[0] |= 0x80808080 & c_append_helper[offset][ 4]; - w1[1] |= 0x80808080 & c_append_helper[offset][ 5]; - w1[2] |= 0x80808080 & c_append_helper[offset][ 6]; - w1[3] |= 0x80808080 & c_append_helper[offset][ 7]; - w2[0] |= 0x80808080 & c_append_helper[offset][ 8]; - w2[1] |= 0x80808080 & c_append_helper[offset][ 9]; - w2[2] |= 0x80808080 & c_append_helper[offset][10]; - w2[3] |= 0x80808080 & c_append_helper[offset][11]; - w3[0] |= 0x80808080 & c_append_helper[offset][12]; - w3[1] |= 0x80808080 & c_append_helper[offset][13]; - w3[2] |= 0x80808080 & c_append_helper[offset][14]; - w3[3] |= 0x80808080 & c_append_helper[offset][15]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4 (w0, 0x80808080, v); break; + case 1: append_helper_1x4 (w1, 0x80808080, v); break; + case 2: append_helper_1x4 (w2, 0x80808080, v); break; + case 3: append_helper_1x4 (w3, 0x80808080, v); break; + } } DECLSPEC void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { - switch (offset) + const u32 v[4] = { - case 0: - w0[0] = w0[0] | 0x80; - break; - - case 1: - w0[0] = w0[0] | 0x8000; - break; - - case 2: - w0[0] = w0[0] | 0x800000; - break; - - case 3: - w0[0] = w0[0] | 0x80000000; - break; - - case 4: - w0[1] = w0[1] | 0x80; - break; - - case 5: - w0[1] = w0[1] | 0x8000; - break; - - case 6: - w0[1] = w0[1] | 0x800000; - break; - - case 7: - w0[1] = w0[1] | 0x80000000; - break; - - case 8: - w0[2] = w0[2] | 0x80; - break; - - case 9: - w0[2] = w0[2] | 0x8000; - break; - - case 10: - w0[2] = w0[2] | 0x800000; - break; - - case 11: - w0[2] = w0[2] | 0x80000000; - break; - - case 12: - w0[3] = w0[3] | 0x80; - break; - - case 13: - w0[3] = w0[3] | 0x8000; - break; - - case 14: - w0[3] = w0[3] | 0x800000; - break; - - case 15: - w0[3] = w0[3] | 0x80000000; - break; - - case 16: - w1[0] = w1[0] | 0x80; - break; - - case 17: - w1[0] = w1[0] | 0x8000; - break; - - case 18: - w1[0] = w1[0] | 0x800000; - break; - - case 19: - w1[0] = w1[0] | 0x80000000; - break; - - case 20: - w1[1] = w1[1] | 0x80; - break; - - case 21: - w1[1] = w1[1] | 0x8000; - break; - - case 22: - w1[1] = w1[1] | 0x800000; - break; - - case 23: - w1[1] = w1[1] | 0x80000000; - break; - - case 24: - w1[2] = w1[2] | 0x80; - break; - - case 25: - w1[2] = w1[2] | 0x8000; - break; - - case 26: - w1[2] = w1[2] | 0x800000; - break; - - case 27: - w1[2] = w1[2] | 0x80000000; - break; - - case 28: - w1[3] = w1[3] | 0x80; - break; - - case 29: - w1[3] = w1[3] | 0x8000; - break; - - case 30: - w1[3] = w1[3] | 0x800000; - break; - - case 31: - w1[3] = w1[3] | 0x80000000; - break; - - case 32: - w2[0] = w2[0] | 0x80; - break; - - case 33: - w2[0] = w2[0] | 0x8000; - break; - - case 34: - w2[0] = w2[0] | 0x800000; - break; - - case 35: - w2[0] = w2[0] | 0x80000000; - break; - - case 36: - w2[1] = w2[1] | 0x80; - break; - - case 37: - w2[1] = w2[1] | 0x8000; - break; - - case 38: - w2[1] = w2[1] | 0x800000; - break; - - case 39: - w2[1] = w2[1] | 0x80000000; - break; - - case 40: - w2[2] = w2[2] | 0x80; - break; - - case 41: - w2[2] = w2[2] | 0x8000; - break; - - case 42: - w2[2] = w2[2] | 0x800000; - break; - - case 43: - w2[2] = w2[2] | 0x80000000; - break; - - case 44: - w2[3] = w2[3] | 0x80; - break; - - case 45: - w2[3] = w2[3] | 0x8000; - break; - - case 46: - w2[3] = w2[3] | 0x800000; - break; - - case 47: - w2[3] = w2[3] | 0x80000000; - break; - - case 48: - w3[0] = w3[0] | 0x80; - break; - - case 49: - w3[0] = w3[0] | 0x8000; - break; - - case 50: - w3[0] = w3[0] | 0x800000; - break; - - case 51: - w3[0] = w3[0] | 0x80000000; - break; - - case 52: - w3[1] = w3[1] | 0x80; - break; - - case 53: - w3[1] = w3[1] | 0x8000; - break; - - case 54: - w3[1] = w3[1] | 0x800000; - break; - - case 55: - w3[1] = w3[1] | 0x80000000; - break; - - case 56: - w3[2] = w3[2] | 0x80; - break; - - case 57: - w3[2] = w3[2] | 0x8000; - break; - - case 58: - w3[2] = w3[2] | 0x800000; - break; - - case 59: - w3[2] = w3[2] | 0x80000000; - break; - - case 60: - w3[3] = w3[3] | 0x80; - break; - - case 61: - w3[3] = w3[3] | 0x8000; - break; - - case 62: - w3[3] = w3[3] | 0x800000; - break; - - case 63: - w3[3] = w3[3] | 0x80000000; - break; - - case 64: - w4[0] = w4[0] | 0x80; - break; - - case 65: - w4[0] = w4[0] | 0x8000; - break; - - case 66: - w4[0] = w4[0] | 0x800000; - break; - - case 67: - w4[0] = w4[0] | 0x80000000; - break; - - case 68: - w4[1] = w4[1] | 0x80; - break; - - case 69: - w4[1] = w4[1] | 0x8000; - break; - - case 70: - w4[1] = w4[1] | 0x800000; - break; - - case 71: - w4[1] = w4[1] | 0x80000000; - break; - - case 72: - w4[2] = w4[2] | 0x80; - break; - - case 73: - w4[2] = w4[2] | 0x8000; - break; - - case 74: - w4[2] = w4[2] | 0x800000; - break; - - case 75: - w4[2] = w4[2] | 0x80000000; - break; - - case 76: - w4[3] = w4[3] | 0x80; - break; - - case 77: - w4[3] = w4[3] | 0x8000; - break; - - case 78: - w4[3] = w4[3] | 0x800000; - break; - - case 79: - w4[3] = w4[3] | 0x80000000; - break; - - case 80: - w5[0] = w5[0] | 0x80; - break; - - case 81: - w5[0] = w5[0] | 0x8000; - break; - - case 82: - w5[0] = w5[0] | 0x800000; - break; - - case 83: - w5[0] = w5[0] | 0x80000000; - break; - - case 84: - w5[1] = w5[1] | 0x80; - break; - - case 85: - w5[1] = w5[1] | 0x8000; - break; - - case 86: - w5[1] = w5[1] | 0x800000; - break; - - case 87: - w5[1] = w5[1] | 0x80000000; - break; - - case 88: - w5[2] = w5[2] | 0x80; - break; - - case 89: - w5[2] = w5[2] | 0x8000; - break; - - case 90: - w5[2] = w5[2] | 0x800000; - break; - - case 91: - w5[2] = w5[2] | 0x80000000; - break; - - case 92: - w5[3] = w5[3] | 0x80; - break; - - case 93: - w5[3] = w5[3] | 0x8000; - break; - - case 94: - w5[3] = w5[3] | 0x800000; - break; - - case 95: - w5[3] = w5[3] | 0x80000000; - break; - - case 96: - w6[0] = w6[0] | 0x80; - break; - - case 97: - w6[0] = w6[0] | 0x8000; - break; - - case 98: - w6[0] = w6[0] | 0x800000; - break; - - case 99: - w6[0] = w6[0] | 0x80000000; - break; - - case 100: - w6[1] = w6[1] | 0x80; - break; - - case 101: - w6[1] = w6[1] | 0x8000; - break; - - case 102: - w6[1] = w6[1] | 0x800000; - break; - - case 103: - w6[1] = w6[1] | 0x80000000; - break; - - case 104: - w6[2] = w6[2] | 0x80; - break; - - case 105: - w6[2] = w6[2] | 0x8000; - break; - - case 106: - w6[2] = w6[2] | 0x800000; - break; - - case 107: - w6[2] = w6[2] | 0x80000000; - break; - - case 108: - w6[3] = w6[3] | 0x80; - break; - - case 109: - w6[3] = w6[3] | 0x8000; - break; - - case 110: - w6[3] = w6[3] | 0x800000; - break; - - case 111: - w6[3] = w6[3] | 0x80000000; - break; - - case 112: - w7[0] = w7[0] | 0x80; - break; - - case 113: - w7[0] = w7[0] | 0x8000; - break; - - case 114: - w7[0] = w7[0] | 0x800000; - break; - - case 115: - w7[0] = w7[0] | 0x80000000; - break; - - case 116: - w7[1] = w7[1] | 0x80; - break; - - case 117: - w7[1] = w7[1] | 0x8000; - break; - - case 118: - w7[1] = w7[1] | 0x800000; - break; - - case 119: - w7[1] = w7[1] | 0x80000000; - break; - - case 120: - w7[2] = w7[2] | 0x80; - break; - - case 121: - w7[2] = w7[2] | 0x8000; - break; - - case 122: - w7[2] = w7[2] | 0x800000; - break; - - case 123: - w7[2] = w7[2] | 0x80000000; - break; - - case 124: - w7[3] = w7[3] | 0x80; - break; - - case 125: - w7[3] = w7[3] | 0x8000; - break; - - case 126: - w7[3] = w7[3] | 0x800000; - break; - - case 127: - w7[3] = w7[3] | 0x80000000; - break; + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4 (w0, 0x80808080, v); break; + case 1: append_helper_1x4 (w1, 0x80808080, v); break; + case 2: append_helper_1x4 (w2, 0x80808080, v); break; + case 3: append_helper_1x4 (w3, 0x80808080, v); break; + case 4: append_helper_1x4 (w4, 0x80808080, v); break; + case 5: append_helper_1x4 (w5, 0x80808080, v); break; + case 6: append_helper_1x4 (w6, 0x80808080, v); break; + case 7: append_helper_1x4 (w7, 0x80808080, v); break; } } DECLSPEC void append_0x80_1x16 (u32x w[16], const u32 offset) { - switch (offset) + const u32 v[4] = { - case 0: - w[ 0] = 0x80; - break; + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; - case 1: - w[ 0] = w[ 0] | 0x8000; - break; - - case 2: - w[ 0] = w[ 0] | 0x800000; - break; - - case 3: - w[ 0] = w[ 0] | 0x80000000; - break; - - case 4: - w[ 1] = 0x80; - break; - - case 5: - w[ 1] = w[ 1] | 0x8000; - break; - - case 6: - w[ 1] = w[ 1] | 0x800000; - break; - - case 7: - w[ 1] = w[ 1] | 0x80000000; - break; - - case 8: - w[ 2] = 0x80; - break; - - case 9: - w[ 2] = w[ 2] | 0x8000; - break; - - case 10: - w[ 2] = w[ 2] | 0x800000; - break; - - case 11: - w[ 2] = w[ 2] | 0x80000000; - break; - - case 12: - w[ 3] = 0x80; - break; - - case 13: - w[ 3] = w[ 3] | 0x8000; - break; - - case 14: - w[ 3] = w[ 3] | 0x800000; - break; - - case 15: - w[ 3] = w[ 3] | 0x80000000; - break; - - case 16: - w[ 4] = 0x80; - break; - - case 17: - w[ 4] = w[ 4] | 0x8000; - break; - - case 18: - w[ 4] = w[ 4] | 0x800000; - break; - - case 19: - w[ 4] = w[ 4] | 0x80000000; - break; - - case 20: - w[ 5] = 0x80; - break; - - case 21: - w[ 5] = w[ 5] | 0x8000; - break; - - case 22: - w[ 5] = w[ 5] | 0x800000; - break; - - case 23: - w[ 5] = w[ 5] | 0x80000000; - break; - - case 24: - w[ 6] = 0x80; - break; - - case 25: - w[ 6] = w[ 6] | 0x8000; - break; - - case 26: - w[ 6] = w[ 6] | 0x800000; - break; - - case 27: - w[ 6] = w[ 6] | 0x80000000; - break; - - case 28: - w[ 7] = 0x80; - break; - - case 29: - w[ 7] = w[ 7] | 0x8000; - break; - - case 30: - w[ 7] = w[ 7] | 0x800000; - break; - - case 31: - w[ 7] = w[ 7] | 0x80000000; - break; - - case 32: - w[ 8] = 0x80; - break; - - case 33: - w[ 8] = w[ 8] | 0x8000; - break; - - case 34: - w[ 8] = w[ 8] | 0x800000; - break; - - case 35: - w[ 8] = w[ 8] | 0x80000000; - break; - - case 36: - w[ 9] = 0x80; - break; - - case 37: - w[ 9] = w[ 9] | 0x8000; - break; - - case 38: - w[ 9] = w[ 9] | 0x800000; - break; - - case 39: - w[ 9] = w[ 9] | 0x80000000; - break; - - case 40: - w[10] = 0x80; - break; - - case 41: - w[10] = w[10] | 0x8000; - break; - - case 42: - w[10] = w[10] | 0x800000; - break; - - case 43: - w[10] = w[10] | 0x80000000; - break; - - case 44: - w[11] = 0x80; - break; - - case 45: - w[11] = w[11] | 0x8000; - break; - - case 46: - w[11] = w[11] | 0x800000; - break; - - case 47: - w[11] = w[11] | 0x80000000; - break; - - case 48: - w[12] = 0x80; - break; - - case 49: - w[12] = w[12] | 0x8000; - break; - - case 50: - w[12] = w[12] | 0x800000; - break; - - case 51: - w[12] = w[12] | 0x80000000; - break; - - case 52: - w[13] = 0x80; - break; - - case 53: - w[13] = w[13] | 0x8000; - break; - - case 54: - w[13] = w[13] | 0x800000; - break; - - case 55: - w[13] = w[13] | 0x80000000; - break; - - case 56: - w[14] = 0x80; - break; - - case 57: - w[14] = w[14] | 0x8000; - break; - - case 58: - w[14] = w[14] | 0x800000; - break; - - case 59: - w[14] = w[14] | 0x80000000; - break; - - case 60: - w[15] = 0x80; - break; - - case 61: - w[15] = w[15] | 0x8000; - break; - - case 62: - w[15] = w[15] | 0x800000; - break; - - case 63: - w[15] = w[15] | 0x80000000; - break; + switch (offset / 16) + { + case 0: append_helper_1x4 (w + 0, 0x80808080, v); break; + case 1: append_helper_1x4 (w + 4, 0x80808080, v); break; + case 2: append_helper_1x4 (w + 8, 0x80808080, v); break; + case 3: append_helper_1x4 (w + 12, 0x80808080, v); break; + case 4: append_helper_1x4 (w + 16, 0x80808080, v); break; + case 5: append_helper_1x4 (w + 20, 0x80808080, v); break; + case 6: append_helper_1x4 (w + 24, 0x80808080, v); break; + case 7: append_helper_1x4 (w + 28, 0x80808080, v); break; } } @@ -31726,589 +30963,118 @@ DECLSPEC void truncate_block_16x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[ } } +DECLSPEC void append_helper_1x4_S (u32 r[4], const u32 v, const u32 m[4]) +{ + r[0] |= v & m[0]; + r[1] |= v & m[1]; + r[2] |= v & m[2]; + r[3] |= v & m[3]; +} + DECLSPEC void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { - w0[0] |= 0x01010101 & c_append_helper[offset][0]; - w0[1] |= 0x01010101 & c_append_helper[offset][1]; - w0[2] |= 0x01010101 & c_append_helper[offset][2]; - w0[3] |= 0x01010101 & c_append_helper[offset][3]; - w1[0] |= 0x01010101 & c_append_helper[offset][4]; - w1[1] |= 0x01010101 & c_append_helper[offset][5]; - w1[2] |= 0x01010101 & c_append_helper[offset][6]; - w1[3] |= 0x01010101 & c_append_helper[offset][7]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4_S (w0, 0x01010101, v); break; + case 1: append_helper_1x4_S (w1, 0x01010101, v); break; + } } DECLSPEC void append_0x80_1x4_S (u32 w0[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][0]; - w0[1] |= 0x80808080 & c_append_helper[offset][1]; - w0[2] |= 0x80808080 & c_append_helper[offset][2]; - w0[3] |= 0x80808080 & c_append_helper[offset][3]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + append_helper_1x4_S (w0, 0x80808080, v); } DECLSPEC void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][0]; - w0[1] |= 0x80808080 & c_append_helper[offset][1]; - w0[2] |= 0x80808080 & c_append_helper[offset][2]; - w0[3] |= 0x80808080 & c_append_helper[offset][3]; - w1[0] |= 0x80808080 & c_append_helper[offset][4]; - w1[1] |= 0x80808080 & c_append_helper[offset][5]; - w1[2] |= 0x80808080 & c_append_helper[offset][6]; - w1[3] |= 0x80808080 & c_append_helper[offset][7]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4_S (w0, 0x80808080, v); break; + case 1: append_helper_1x4_S (w1, 0x80808080, v); break; + } } DECLSPEC void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][ 0]; - w0[1] |= 0x80808080 & c_append_helper[offset][ 1]; - w0[2] |= 0x80808080 & c_append_helper[offset][ 2]; - w0[3] |= 0x80808080 & c_append_helper[offset][ 3]; - w1[0] |= 0x80808080 & c_append_helper[offset][ 4]; - w1[1] |= 0x80808080 & c_append_helper[offset][ 5]; - w1[2] |= 0x80808080 & c_append_helper[offset][ 6]; - w1[3] |= 0x80808080 & c_append_helper[offset][ 7]; - w2[0] |= 0x80808080 & c_append_helper[offset][ 8]; - w2[1] |= 0x80808080 & c_append_helper[offset][ 9]; - w2[2] |= 0x80808080 & c_append_helper[offset][10]; - w2[3] |= 0x80808080 & c_append_helper[offset][11]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4_S (w0, 0x80808080, v); break; + case 1: append_helper_1x4_S (w1, 0x80808080, v); break; + case 2: append_helper_1x4_S (w2, 0x80808080, v); break; + } } DECLSPEC void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - w0[0] |= 0x80808080 & c_append_helper[offset][ 0]; - w0[1] |= 0x80808080 & c_append_helper[offset][ 1]; - w0[2] |= 0x80808080 & c_append_helper[offset][ 2]; - w0[3] |= 0x80808080 & c_append_helper[offset][ 3]; - w1[0] |= 0x80808080 & c_append_helper[offset][ 4]; - w1[1] |= 0x80808080 & c_append_helper[offset][ 5]; - w1[2] |= 0x80808080 & c_append_helper[offset][ 6]; - w1[3] |= 0x80808080 & c_append_helper[offset][ 7]; - w2[0] |= 0x80808080 & c_append_helper[offset][ 8]; - w2[1] |= 0x80808080 & c_append_helper[offset][ 9]; - w2[2] |= 0x80808080 & c_append_helper[offset][10]; - w2[3] |= 0x80808080 & c_append_helper[offset][11]; - w3[0] |= 0x80808080 & c_append_helper[offset][12]; - w3[1] |= 0x80808080 & c_append_helper[offset][13]; - w3[2] |= 0x80808080 & c_append_helper[offset][14]; - w3[3] |= 0x80808080 & c_append_helper[offset][15]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4_S (w0, 0x80808080, v); break; + case 1: append_helper_1x4_S (w1, 0x80808080, v); break; + case 2: append_helper_1x4_S (w2, 0x80808080, v); break; + case 3: append_helper_1x4_S (w3, 0x80808080, v); break; + } } DECLSPEC void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { - switch (offset) + const u32 v[4] = { - case 0: - w0[0] = w0[0] | 0x80; - break; - - case 1: - w0[0] = w0[0] | 0x8000; - break; - - case 2: - w0[0] = w0[0] | 0x800000; - break; - - case 3: - w0[0] = w0[0] | 0x80000000; - break; - - case 4: - w0[1] = w0[1] | 0x80; - break; - - case 5: - w0[1] = w0[1] | 0x8000; - break; - - case 6: - w0[1] = w0[1] | 0x800000; - break; - - case 7: - w0[1] = w0[1] | 0x80000000; - break; - - case 8: - w0[2] = w0[2] | 0x80; - break; - - case 9: - w0[2] = w0[2] | 0x8000; - break; - - case 10: - w0[2] = w0[2] | 0x800000; - break; - - case 11: - w0[2] = w0[2] | 0x80000000; - break; - - case 12: - w0[3] = w0[3] | 0x80; - break; - - case 13: - w0[3] = w0[3] | 0x8000; - break; - - case 14: - w0[3] = w0[3] | 0x800000; - break; - - case 15: - w0[3] = w0[3] | 0x80000000; - break; - - case 16: - w1[0] = w1[0] | 0x80; - break; - - case 17: - w1[0] = w1[0] | 0x8000; - break; - - case 18: - w1[0] = w1[0] | 0x800000; - break; - - case 19: - w1[0] = w1[0] | 0x80000000; - break; - - case 20: - w1[1] = w1[1] | 0x80; - break; - - case 21: - w1[1] = w1[1] | 0x8000; - break; - - case 22: - w1[1] = w1[1] | 0x800000; - break; - - case 23: - w1[1] = w1[1] | 0x80000000; - break; - - case 24: - w1[2] = w1[2] | 0x80; - break; - - case 25: - w1[2] = w1[2] | 0x8000; - break; - - case 26: - w1[2] = w1[2] | 0x800000; - break; - - case 27: - w1[2] = w1[2] | 0x80000000; - break; - - case 28: - w1[3] = w1[3] | 0x80; - break; - - case 29: - w1[3] = w1[3] | 0x8000; - break; - - case 30: - w1[3] = w1[3] | 0x800000; - break; - - case 31: - w1[3] = w1[3] | 0x80000000; - break; - - case 32: - w2[0] = w2[0] | 0x80; - break; - - case 33: - w2[0] = w2[0] | 0x8000; - break; - - case 34: - w2[0] = w2[0] | 0x800000; - break; - - case 35: - w2[0] = w2[0] | 0x80000000; - break; - - case 36: - w2[1] = w2[1] | 0x80; - break; - - case 37: - w2[1] = w2[1] | 0x8000; - break; - - case 38: - w2[1] = w2[1] | 0x800000; - break; - - case 39: - w2[1] = w2[1] | 0x80000000; - break; - - case 40: - w2[2] = w2[2] | 0x80; - break; - - case 41: - w2[2] = w2[2] | 0x8000; - break; - - case 42: - w2[2] = w2[2] | 0x800000; - break; - - case 43: - w2[2] = w2[2] | 0x80000000; - break; - - case 44: - w2[3] = w2[3] | 0x80; - break; - - case 45: - w2[3] = w2[3] | 0x8000; - break; - - case 46: - w2[3] = w2[3] | 0x800000; - break; - - case 47: - w2[3] = w2[3] | 0x80000000; - break; - - case 48: - w3[0] = w3[0] | 0x80; - break; - - case 49: - w3[0] = w3[0] | 0x8000; - break; - - case 50: - w3[0] = w3[0] | 0x800000; - break; - - case 51: - w3[0] = w3[0] | 0x80000000; - break; - - case 52: - w3[1] = w3[1] | 0x80; - break; - - case 53: - w3[1] = w3[1] | 0x8000; - break; - - case 54: - w3[1] = w3[1] | 0x800000; - break; - - case 55: - w3[1] = w3[1] | 0x80000000; - break; - - case 56: - w3[2] = w3[2] | 0x80; - break; - - case 57: - w3[2] = w3[2] | 0x8000; - break; - - case 58: - w3[2] = w3[2] | 0x800000; - break; - - case 59: - w3[2] = w3[2] | 0x80000000; - break; - - case 60: - w3[3] = w3[3] | 0x80; - break; - - case 61: - w3[3] = w3[3] | 0x8000; - break; - - case 62: - w3[3] = w3[3] | 0x800000; - break; - - case 63: - w3[3] = w3[3] | 0x80000000; - break; - - case 64: - w4[0] = w4[0] | 0x80; - break; - - case 65: - w4[0] = w4[0] | 0x8000; - break; - - case 66: - w4[0] = w4[0] | 0x800000; - break; - - case 67: - w4[0] = w4[0] | 0x80000000; - break; - - case 68: - w4[1] = w4[1] | 0x80; - break; - - case 69: - w4[1] = w4[1] | 0x8000; - break; - - case 70: - w4[1] = w4[1] | 0x800000; - break; - - case 71: - w4[1] = w4[1] | 0x80000000; - break; - - case 72: - w4[2] = w4[2] | 0x80; - break; - - case 73: - w4[2] = w4[2] | 0x8000; - break; - - case 74: - w4[2] = w4[2] | 0x800000; - break; - - case 75: - w4[2] = w4[2] | 0x80000000; - break; - - case 76: - w4[3] = w4[3] | 0x80; - break; - - case 77: - w4[3] = w4[3] | 0x8000; - break; - - case 78: - w4[3] = w4[3] | 0x800000; - break; - - case 79: - w4[3] = w4[3] | 0x80000000; - break; - - case 80: - w5[0] = w5[0] | 0x80; - break; - - case 81: - w5[0] = w5[0] | 0x8000; - break; - - case 82: - w5[0] = w5[0] | 0x800000; - break; - - case 83: - w5[0] = w5[0] | 0x80000000; - break; - - case 84: - w5[1] = w5[1] | 0x80; - break; - - case 85: - w5[1] = w5[1] | 0x8000; - break; - - case 86: - w5[1] = w5[1] | 0x800000; - break; - - case 87: - w5[1] = w5[1] | 0x80000000; - break; - - case 88: - w5[2] = w5[2] | 0x80; - break; - - case 89: - w5[2] = w5[2] | 0x8000; - break; - - case 90: - w5[2] = w5[2] | 0x800000; - break; - - case 91: - w5[2] = w5[2] | 0x80000000; - break; - - case 92: - w5[3] = w5[3] | 0x80; - break; - - case 93: - w5[3] = w5[3] | 0x8000; - break; - - case 94: - w5[3] = w5[3] | 0x800000; - break; - - case 95: - w5[3] = w5[3] | 0x80000000; - break; - - case 96: - w6[0] = w6[0] | 0x80; - break; - - case 97: - w6[0] = w6[0] | 0x8000; - break; - - case 98: - w6[0] = w6[0] | 0x800000; - break; - - case 99: - w6[0] = w6[0] | 0x80000000; - break; - - case 100: - w6[1] = w6[1] | 0x80; - break; - - case 101: - w6[1] = w6[1] | 0x8000; - break; - - case 102: - w6[1] = w6[1] | 0x800000; - break; - - case 103: - w6[1] = w6[1] | 0x80000000; - break; - - case 104: - w6[2] = w6[2] | 0x80; - break; - - case 105: - w6[2] = w6[2] | 0x8000; - break; - - case 106: - w6[2] = w6[2] | 0x800000; - break; - - case 107: - w6[2] = w6[2] | 0x80000000; - break; - - case 108: - w6[3] = w6[3] | 0x80; - break; - - case 109: - w6[3] = w6[3] | 0x8000; - break; - - case 110: - w6[3] = w6[3] | 0x800000; - break; - - case 111: - w6[3] = w6[3] | 0x80000000; - break; - - case 112: - w7[0] = w7[0] | 0x80; - break; - - case 113: - w7[0] = w7[0] | 0x8000; - break; - - case 114: - w7[0] = w7[0] | 0x800000; - break; - - case 115: - w7[0] = w7[0] | 0x80000000; - break; - - case 116: - w7[1] = w7[1] | 0x80; - break; - - case 117: - w7[1] = w7[1] | 0x8000; - break; - - case 118: - w7[1] = w7[1] | 0x800000; - break; - - case 119: - w7[1] = w7[1] | 0x80000000; - break; - - case 120: - w7[2] = w7[2] | 0x80; - break; - - case 121: - w7[2] = w7[2] | 0x8000; - break; - - case 122: - w7[2] = w7[2] | 0x800000; - break; - - case 123: - w7[2] = w7[2] | 0x80000000; - break; - - case 124: - w7[3] = w7[3] | 0x80; - break; - - case 125: - w7[3] = w7[3] | 0x8000; - break; - - case 126: - w7[3] = w7[3] | 0x800000; - break; - - case 127: - w7[3] = w7[3] | 0x80000000; - break; + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4_S (w0, 0x80808080, v); break; + case 1: append_helper_1x4_S (w1, 0x80808080, v); break; + case 2: append_helper_1x4_S (w2, 0x80808080, v); break; + case 3: append_helper_1x4_S (w3, 0x80808080, v); break; + case 4: append_helper_1x4_S (w4, 0x80808080, v); break; + case 5: append_helper_1x4_S (w5, 0x80808080, v); break; + case 6: append_helper_1x4_S (w6, 0x80808080, v); break; + case 7: append_helper_1x4_S (w7, 0x80808080, v); break; } } diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl index e47e2ded3..c79009cf9 100644 --- a/OpenCL/inc_rp_optimized.cl +++ b/OpenCL/inc_rp_optimized.cl @@ -758,14 +758,19 @@ void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0 | value << 16 | value << 24; - buf0[0] |= tmp & c_append_helper[offset][0]; - buf0[1] |= tmp & c_append_helper[offset][1]; - buf0[2] |= tmp & c_append_helper[offset][2]; - buf0[3] |= tmp & c_append_helper[offset][3]; - buf1[0] |= tmp & c_append_helper[offset][4]; - buf1[1] |= tmp & c_append_helper[offset][5]; - buf1[2] |= tmp & c_append_helper[offset][6]; - buf1[3] |= tmp & c_append_helper[offset][7]; + const u32 v[4] = + { + c_append_helper_mini[offset & 0xf][0], + c_append_helper_mini[offset & 0xf][1], + c_append_helper_mini[offset & 0xf][2], + c_append_helper_mini[offset & 0xf][3] + }; + + switch (offset / 16) + { + case 0: append_helper_1x4 (buf0, tmp, v); break; + case 1: append_helper_1x4 (buf1, tmp, v); break; + } } void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) @@ -1295,14 +1300,30 @@ u32 rule_op_mangle_rotate_right (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u u32 tmp = 0; - tmp |= buf0[0] & c_append_helper[in_len1][0]; - tmp |= buf0[1] & c_append_helper[in_len1][1]; - tmp |= buf0[2] & c_append_helper[in_len1][2]; - tmp |= buf0[3] & c_append_helper[in_len1][3]; - tmp |= buf1[0] & c_append_helper[in_len1][4]; - tmp |= buf1[1] & c_append_helper[in_len1][5]; - tmp |= buf1[2] & c_append_helper[in_len1][6]; - tmp |= buf1[3] & c_append_helper[in_len1][7]; + const u32 v[4] = + { + c_append_helper_mini[in_len1 & 0xf][0], + c_append_helper_mini[in_len1 & 0xf][1], + c_append_helper_mini[in_len1 & 0xf][2], + c_append_helper_mini[in_len1 & 0xf][3] + }; + + switch (in_len1 / 16) + { + case 0: + tmp |= buf0[0] & v[0]; + tmp |= buf0[1] & v[1]; + tmp |= buf0[2] & v[2]; + tmp |= buf0[3] & v[3]; + break; + + case 1: + tmp |= buf1[0] & v[0]; + tmp |= buf1[1] & v[1]; + tmp |= buf1[2] & v[2]; + tmp |= buf1[3] & v[3]; + break; + } tmp = (tmp >> sh) & 0xff; @@ -1807,14 +1828,30 @@ u32 rule_op_mangle_dupechar_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 tmp = 0; - tmp |= buf0[0] & c_append_helper[in_len1][0]; - tmp |= buf0[1] & c_append_helper[in_len1][1]; - tmp |= buf0[2] & c_append_helper[in_len1][2]; - tmp |= buf0[3] & c_append_helper[in_len1][3]; - tmp |= buf1[0] & c_append_helper[in_len1][4]; - tmp |= buf1[1] & c_append_helper[in_len1][5]; - tmp |= buf1[2] & c_append_helper[in_len1][6]; - tmp |= buf1[3] & c_append_helper[in_len1][7]; + const u32 v[4] = + { + c_append_helper_mini[in_len1 & 0xf][0], + c_append_helper_mini[in_len1 & 0xf][1], + c_append_helper_mini[in_len1 & 0xf][2], + c_append_helper_mini[in_len1 & 0xf][3] + }; + + switch (in_len1 / 16) + { + case 0: + tmp |= buf0[0] & v[0]; + tmp |= buf0[1] & v[1]; + tmp |= buf0[2] & v[2]; + tmp |= buf0[3] & v[3]; + break; + + case 1: + tmp |= buf1[0] & v[0]; + tmp |= buf1[1] & v[1]; + tmp |= buf1[2] & v[2]; + tmp |= buf1[3] & v[3]; + break; + } tmp = (tmp >> sh) & 0xff;