Update hipDeviceAttribute_t for ROCm 6.x

Add hipDeviceProp_t and bindings for hipGetDeviceProperties(), which is required to retrieve gcnArchName[]. Add gcnArchName to hc_device_param_t and use it to select the correct --gpu-architecture value for a specific device when using hiprtc. Include sm_major and sm_minor for CUDA and gcnArchName for HIP in the kernel filename hash. Update nvrtc_options[] and hiprtc_options[] to avoid unused variables, eliminating the use of --restrict as a placeholder and preventing nvrtc from aborting. Add check_file_suffix() and remove_file_suffix() helper functions.
2025-07-24 07:28:37 +00:00 · 2025-06-18 18:29:47 +02:00 · 2025-06-18 18:29:47 +02:00 · c033873e4b
commit c033873e4b
parent 13245b5563
6 changed files with 335 additions and 90 deletions
--- a/include/ext_hip.h
+++ b/include/ext_hip.h
@ -165,38 +165,43 @@ typedef enum __HIP_NODISCARD hipError_t {

 #undef __HIP_NODISCARD

+
+/**
+ * hipDeviceAttribute_t
+ * Number of hipDeviceAttributeUnused* placeholder entries: 5
+ */
 typedef enum hipDeviceAttribute_t {
    hipDeviceAttributeCudaCompatibleBegin = 0,

    hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled.
    hipDeviceAttributeAccessPolicyMaxWindowSize,        ///< Cuda only. The maximum size of the window policy in bytes.
-    hipDeviceAttributeAsyncEngineCount,                 ///< Cuda only. Asynchronous engines number.
+    hipDeviceAttributeAsyncEngineCount,                 ///< Asynchronous engines number.
    hipDeviceAttributeCanMapHostMemory,                 ///< Whether host memory can be mapped into device address space
-    hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Cuda only. Device can access host registered memory
+    hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Device can access host registered memory
                                                        ///< at the same virtual address as the CPU
    hipDeviceAttributeClockRate,                        ///< Peak clock frequency in kilohertz.
    hipDeviceAttributeComputeMode,                      ///< Compute mode that device is currently in.
-    hipDeviceAttributeComputePreemptionSupported,       ///< Cuda only. Device supports Compute Preemption.
+    hipDeviceAttributeComputePreemptionSupported,       ///< Device supports Compute Preemption.
    hipDeviceAttributeConcurrentKernels,                ///< Device can possibly execute multiple kernels concurrently.
    hipDeviceAttributeConcurrentManagedAccess,          ///< Device can coherently access managed memory concurrently with the CPU
    hipDeviceAttributeCooperativeLaunch,                ///< Support cooperative launch
    hipDeviceAttributeCooperativeMultiDeviceLaunch,     ///< Support cooperative launch on multiple devices
-    hipDeviceAttributeDeviceOverlap,                    ///< Cuda only. Device can concurrently copy memory and execute a kernel.
+    hipDeviceAttributeDeviceOverlap,                    ///< Device can concurrently copy memory and execute a kernel.
                                                        ///< Deprecated. Use instead asyncEngineCount.
    hipDeviceAttributeDirectManagedMemAccessFromHost,   ///< Host can directly access managed memory on
                                                        ///< the device without migration
-    hipDeviceAttributeGlobalL1CacheSupported,           ///< Cuda only. Device supports caching globals in L1
-    hipDeviceAttributeHostNativeAtomicSupported,        ///< Cuda only. Link between the device and the host supports native atomic operations
+    hipDeviceAttributeGlobalL1CacheSupported,           ///< Device supports caching globals in L1
+    hipDeviceAttributeHostNativeAtomicSupported,        ///< Link between the device and the host supports native atomic operations
    hipDeviceAttributeIntegrated,                       ///< Device is integrated GPU
    hipDeviceAttributeIsMultiGpuBoard,                  ///< Multiple GPU devices.
    hipDeviceAttributeKernelExecTimeout,                ///< Run time limit for kernels executed on the device
    hipDeviceAttributeL2CacheSize,                      ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
    hipDeviceAttributeLocalL1CacheSupported,            ///< caching locals in L1 is supported
-    hipDeviceAttributeLuid,                             ///< Cuda only. 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms
-    hipDeviceAttributeLuidDeviceNodeMask,               ///< Cuda only. Luid device node mask. Undefined on TCC and non-Windows platforms
+    hipDeviceAttributeLuid,                             ///< 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms
+    hipDeviceAttributeLuidDeviceNodeMask,               ///< Luid device node mask. Undefined on TCC and non-Windows platforms
    hipDeviceAttributeComputeCapabilityMajor,           ///< Major compute capability version number.
    hipDeviceAttributeManagedMemory,                    ///< Device supports allocating managed memory on this system
-    hipDeviceAttributeMaxBlocksPerMultiProcessor,       ///< Cuda only. Max block size per multiprocessor
+    hipDeviceAttributeMaxBlocksPerMultiProcessor,       ///< Max number of blocks per multiprocessor
    hipDeviceAttributeMaxBlockDimX,                     ///< Max block size in width.
    hipDeviceAttributeMaxBlockDimY,                     ///< Max block size in height.
    hipDeviceAttributeMaxBlockDimZ,                     ///< Max block size in depth.
@ -211,22 +216,22 @@ typedef enum hipDeviceAttribute_t {
    hipDeviceAttributeMaxSurfaceCubemap,                ///< Cuda only. Maximum dimensions of Cubemap surface.
    hipDeviceAttributeMaxSurfaceCubemapLayered,         ///< Cuda only. Maximum dimension of Cubemap layered surface.
    hipDeviceAttributeMaxTexture1DWidth,                ///< Maximum size of 1D texture.
-    hipDeviceAttributeMaxTexture1DLayered,              ///< Cuda only. Maximum dimensions of 1D layered texture.
+    hipDeviceAttributeMaxTexture1DLayered,              ///< Maximum dimensions of 1D layered texture.
    hipDeviceAttributeMaxTexture1DLinear,               ///< Maximum number of elements allocatable in a 1D linear texture.
                                                        ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda.
-    hipDeviceAttributeMaxTexture1DMipmap,               ///< Cuda only. Maximum size of 1D mipmapped texture.
+    hipDeviceAttributeMaxTexture1DMipmap,               ///< Maximum size of 1D mipmapped texture.
    hipDeviceAttributeMaxTexture2DWidth,                ///< Maximum dimension width of 2D texture.
-    hipDeviceAttributeMaxTexture2DHeight,               ///< Maximum dimension height of 2D texture.
-    hipDeviceAttributeMaxTexture2DGather,               ///< Cuda only. Maximum dimensions of 2D texture if gather operations  performed.
-    hipDeviceAttributeMaxTexture2DLayered,              ///< Cuda only. Maximum dimensions of 2D layered texture.
-    hipDeviceAttributeMaxTexture2DLinear,               ///< Cuda only. Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory.
-    hipDeviceAttributeMaxTexture2DMipmap,               ///< Cuda only. Maximum dimensions of 2D mipmapped texture.
+    hipDeviceAttributeMaxTexture2DHeight,               ///< Maximum dimension height of 2D texture.
+    hipDeviceAttributeMaxTexture2DGather,               ///< Maximum dimensions of 2D texture if gather operations  performed.
+    hipDeviceAttributeMaxTexture2DLayered,              ///< Maximum dimensions of 2D layered texture.
+    hipDeviceAttributeMaxTexture2DLinear,               ///< Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory.
+    hipDeviceAttributeMaxTexture2DMipmap,               ///< Maximum dimensions of 2D mipmapped texture.
    hipDeviceAttributeMaxTexture3DWidth,                ///< Maximum dimension width of 3D texture.
    hipDeviceAttributeMaxTexture3DHeight,               ///< Maximum dimension height of 3D texture.
    hipDeviceAttributeMaxTexture3DDepth,                ///< Maximum dimension depth of 3D texture.
-    hipDeviceAttributeMaxTexture3DAlt,                  ///< Cuda only. Maximum dimensions of alternate 3D texture.
-    hipDeviceAttributeMaxTextureCubemap,                ///< Cuda only. Maximum dimensions of Cubemap texture
-    hipDeviceAttributeMaxTextureCubemapLayered,         ///< Cuda only. Maximum dimensions of Cubemap layered texture.
+    hipDeviceAttributeMaxTexture3DAlt,                  ///< Maximum dimensions of alternate 3D texture.
+    hipDeviceAttributeMaxTextureCubemap,                ///< Maximum dimensions of Cubemap texture
+    hipDeviceAttributeMaxTextureCubemapLayered,         ///< Maximum dimensions of Cubemap layered texture.
    hipDeviceAttributeMaxThreadsDim,                    ///< Maximum dimension of a block
    hipDeviceAttributeMaxThreadsPerBlock,               ///< Maximum number of threads per block.
    hipDeviceAttributeMaxThreadsPerMultiProcessor,      ///< Maximum resident threads per multiprocessor.
@ -234,45 +239,47 @@ typedef enum hipDeviceAttribute_t {
    hipDeviceAttributeMemoryBusWidth,                   ///< Global memory bus width in bits.
    hipDeviceAttributeMemoryClockRate,                  ///< Peak memory clock frequency in kilohertz.
    hipDeviceAttributeComputeCapabilityMinor,           ///< Minor compute capability version number.
-    hipDeviceAttributeMultiGpuBoardGroupID,             ///< Cuda only. Unique ID of device group on the same multi-GPU board
+    hipDeviceAttributeMultiGpuBoardGroupID,             ///< Unique ID of device group on the same multi-GPU board
    hipDeviceAttributeMultiprocessorCount,              ///< Number of multiprocessors on the device.
-    hipDeviceAttributeName,                             ///< Device name.
+    hipDeviceAttributeUnused1,                          ///< Previously hipDeviceAttributeName
    hipDeviceAttributePageableMemoryAccess,             ///< Device supports coherently accessing pageable memory
                                                        ///< without calling hipHostRegister on it
    hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables
    hipDeviceAttributePciBusId,                         ///< PCI Bus ID.
    hipDeviceAttributePciDeviceId,                      ///< PCI Device ID.
    hipDeviceAttributePciDomainID,                      ///< PCI Domain ID.
-    hipDeviceAttributePersistingL2CacheMaxSize,         ///< Cuda11 only. Maximum l2 persisting lines capacity in bytes
+    hipDeviceAttributePersistingL2CacheMaxSize,         ///< Maximum l2 persisting lines capacity in bytes
    hipDeviceAttributeMaxRegistersPerBlock,             ///< 32-bit registers available to a thread block. This number is shared
                                                        ///< by all thread blocks simultaneously resident on a multiprocessor.
    hipDeviceAttributeMaxRegistersPerMultiprocessor,    ///< 32-bit registers available per block.
-    hipDeviceAttributeReservedSharedMemPerBlock,        ///< Cuda11 only. Shared memory reserved by CUDA driver per block.
+    hipDeviceAttributeReservedSharedMemPerBlock,        ///< Shared memory reserved by CUDA driver per block.
    hipDeviceAttributeMaxSharedMemoryPerBlock,          ///< Maximum shared memory available per block in bytes.
-    hipDeviceAttributeSharedMemPerBlockOptin,           ///< Cuda only. Maximum shared memory per block usable by special opt in.
-    hipDeviceAttributeSharedMemPerMultiprocessor,       ///< Cuda only. Shared memory available per multiprocessor.
+    hipDeviceAttributeSharedMemPerBlockOptin,           ///< Maximum shared memory per block usable by special opt in.
+    hipDeviceAttributeSharedMemPerMultiprocessor,       ///< Shared memory available per multiprocessor.
    hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision.
-    hipDeviceAttributeStreamPrioritiesSupported,        ///< Cuda only. Whether to support stream priorities.
-    hipDeviceAttributeSurfaceAlignment,                 ///< Cuda only. Alignment requirement for surfaces
+    hipDeviceAttributeStreamPrioritiesSupported,        ///< Whether to support stream priorities.
+    hipDeviceAttributeSurfaceAlignment,                 ///< Alignment requirement for surfaces
    hipDeviceAttributeTccDriver,                        ///< Cuda only. Whether device is a Tesla device using TCC driver
    hipDeviceAttributeTextureAlignment,                 ///< Alignment requirement for textures
    hipDeviceAttributeTexturePitchAlignment,            ///< Pitch alignment requirement for 2D texture references bound to pitched memory;
    hipDeviceAttributeTotalConstantMemory,              ///< Constant memory size in bytes.
-    hipDeviceAttributeTotalGlobalMem,                   ///< Global memory available on device.
+    hipDeviceAttributeTotalGlobalMem,                   ///< Global memory available on device.
    hipDeviceAttributeUnifiedAddressing,                ///< Cuda only. An unified address space shared with the host.
-    hipDeviceAttributeUuid,                             ///< Cuda only. Unique ID in 16 byte.
+    hipDeviceAttributeUnused2,                          ///< Previously hipDeviceAttributeUuid
    hipDeviceAttributeWarpSize,                         ///< Warp size in threads.
    hipDeviceAttributeMemoryPoolsSupported,             ///< Device supports HIP Stream Ordered Memory Allocator
    hipDeviceAttributeVirtualMemoryManagementSupported, ///< Device supports HIP virtual memory management
+    hipDeviceAttributeHostRegisterSupported,            ///< Can device support host memory registration via hipHostRegister
+    hipDeviceAttributeMemoryPoolSupportedHandleTypes,   ///< Supported handle mask for HIP Stream Ordered Memory Allocator

    hipDeviceAttributeCudaCompatibleEnd = 9999,
    hipDeviceAttributeAmdSpecificBegin = 10000,

    hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin,  ///< Frequency in khz of the timer used by the device-side "clock*"
-    hipDeviceAttributeArch,                                     ///< Device architecture
+    hipDeviceAttributeUnused3,                                  ///< Previously hipDeviceAttributeArch
    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,         ///< Maximum Shared Memory PerMultiprocessor.
-    hipDeviceAttributeGcnArch,                                  ///< Device gcn architecture
-    hipDeviceAttributeGcnArchName,                              ///< Device gcnArch name in 256 bytes
+    hipDeviceAttributeUnused4,                                  ///< Previously hipDeviceAttributeGcnArch
+    hipDeviceAttributeUnused5,                                  ///< Previously hipDeviceAttributeGcnArchName
    hipDeviceAttributeHdpMemFlushCntl,                          ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
    hipDeviceAttributeHdpRegFlushCntl,                          ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
    hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,      ///< Supports cooperative launch on multiple
@ -298,6 +305,190 @@ typedef enum hipDeviceAttribute_t {
    // Extended attributes for vendors
 } hipDeviceAttribute_t;

+/**
+ * hipDeviceArch_t
+ * Architectural feature flags of a device; every member is a one-bit bit-field (set = feature available).
+ */
+typedef struct {
+    // 32-bit Atomics
+    unsigned hasGlobalInt32Atomics : 1;     ///< 32-bit integer atomics for global memory.
+    unsigned hasGlobalFloatAtomicExch : 1;  ///< 32-bit float atomic exch for global memory.
+    unsigned hasSharedInt32Atomics : 1;     ///< 32-bit integer atomics for shared memory.
+    unsigned hasSharedFloatAtomicExch : 1;  ///< 32-bit float atomic exch for shared memory.
+    unsigned hasFloatAtomicAdd : 1;  ///< 32-bit float atomic add in global and shared memory.
+
+    // 64-bit Atomics
+    unsigned hasGlobalInt64Atomics : 1;  ///< 64-bit integer atomics for global memory.
+    unsigned hasSharedInt64Atomics : 1;  ///< 64-bit integer atomics for shared memory.
+
+    // Doubles
+    unsigned hasDoubles : 1;  ///< Double-precision floating point.
+
+    // Warp cross-lane operations
+    unsigned hasWarpVote : 1;     ///< Warp vote instructions (__any, __all).
+    unsigned hasWarpBallot : 1;   ///< Warp ballot instructions (__ballot).
+    unsigned hasWarpShuffle : 1;  ///< Warp shuffle operations. (__shfl_*).
+    unsigned hasFunnelShift : 1;  ///< Funnel two words into one with shift&mask caps.
+
+    // Sync
+    unsigned hasThreadFenceSystem : 1;  ///< __threadfence_system.
+    unsigned hasSyncThreadsExt : 1;     ///< __syncthreads_count, syncthreads_and, syncthreads_or.
+
+    // Misc
+    unsigned hasSurfaceFuncs : 1;        ///< Surface functions.
+    unsigned has3dGrid : 1;              ///< Grid and group dims are 3D (rather than 2D).
+    unsigned hasDynamicParallelism : 1;  ///< Dynamic parallelism.
+} hipDeviceArch_t;
+
+typedef struct hipUUID_t {
+    char bytes[16];  ///< Raw UUID value, 16 bytes.
+} hipUUID;
+
+
+/**
+ * hipDeviceProp_t
+ * Device properties filled in by hipGetDeviceProperties(). NOTE(review): presumably must mirror the HIP runtime's layout - do not reorder or resize members.
+ */
+typedef struct hipDeviceProp_t {
+    char name[256];                   ///< Device name.
+    hipUUID uuid;                     ///< UUID of a device
+    char luid[8];                     ///< 8-byte unique identifier. Only valid on windows
+    unsigned int luidDeviceNodeMask;  ///< LUID node mask
+    size_t totalGlobalMem;            ///< Size of global memory region (in bytes).
+    size_t sharedMemPerBlock;         ///< Size of shared memory per block (in bytes).
+    int regsPerBlock;                 ///< Registers per block.
+    int warpSize;                     ///< Warp size.
+    size_t memPitch;                  ///< Maximum pitch in bytes allowed by memory copies
+                                      ///< pitched memory
+    int maxThreadsPerBlock;           ///< Max work items per work group or workgroup max size.
+    int maxThreadsDim[3];             ///< Max number of threads in each dimension (XYZ) of a block.
+    int maxGridSize[3];               ///< Max grid dimensions (XYZ).
+    int clockRate;                    ///< Max clock frequency of the multiProcessors in khz.
+    size_t totalConstMem;             ///< Size of shared constant memory region on the device
+                                      ///< (in bytes).
+    int major;  ///< Major compute capability.  On HCC, this is an approximation and features may
+                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+                ///< feature caps.
+    int minor;  ///< Minor compute capability.  On HCC, this is an approximation and features may
+                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+                ///< feature caps.
+    size_t textureAlignment;       ///< Alignment requirement for textures
+    size_t texturePitchAlignment;  ///< Pitch alignment requirement for texture references bound to pitched memory
+    int deviceOverlap;             ///< Deprecated. Use asyncEngineCount instead
+    int multiProcessorCount;       ///< Number of multi-processors (compute units).
+    int kernelExecTimeoutEnabled;  ///< Run time limit for kernels executed on the device
+    int integrated;                ///< APU vs dGPU
+    int canMapHostMemory;          ///< Check whether HIP can map host memory
+    int computeMode;               ///< Compute mode.
+    int maxTexture1D;              ///< Maximum number of elements in 1D images
+    int maxTexture1DMipmap;        ///< Maximum 1D mipmap texture size
+    int maxTexture1DLinear;        ///< Maximum size for 1D textures bound to linear memory
+    int maxTexture2D[2];  ///< Maximum dimensions (width, height) of 2D images, in image elements
+    int maxTexture2DMipmap[2];  ///< Maximum number of elements in 2D array mipmap of images
+    int maxTexture2DLinear[3];  ///< Maximum 2D tex dimensions if tex are bound to pitched memory
+    int maxTexture2DGather[2];  ///< Maximum 2D tex dimensions if gather has to be performed
+    int maxTexture3D[3];  ///< Maximum dimensions (width, height, depth) of 3D images, in image
+                          ///< elements
+    int maxTexture3DAlt[3];           ///< Maximum alternate 3D texture dims
+    int maxTextureCubemap;            ///< Maximum cubemap texture dims
+    int maxTexture1DLayered[2];       ///< Maximum number of elements in 1D array images
+    int maxTexture2DLayered[3];       ///< Maximum number of elements in 2D array images
+    int maxTextureCubemapLayered[2];  ///< Maximum cubemaps layered texture dims
+    int maxSurface1D;                 ///< Maximum 1D surface size
+    int maxSurface2D[2];              ///< Maximum 2D surface size
+    int maxSurface3D[3];              ///< Maximum 3D surface size
+    int maxSurface1DLayered[2];       ///< Maximum 1D layered surface size
+    int maxSurface2DLayered[3];       ///< Maximum 2D layered surface size
+    int maxSurfaceCubemap;            ///< Maximum cubemap surface size
+    int maxSurfaceCubemapLayered[2];  ///< Maximum cubemap layered surface size
+    size_t surfaceAlignment;          ///< Alignment requirement for surface
+    int concurrentKernels;         ///< Device can possibly execute multiple kernels concurrently.
+    int ECCEnabled;                ///< Device has ECC support enabled
+    int pciBusID;                  ///< PCI Bus ID.
+    int pciDeviceID;               ///< PCI Device ID.
+    int pciDomainID;               ///< PCI Domain ID
+    int tccDriver;                 ///< 1:If device is Tesla device using TCC driver, else 0
+    int asyncEngineCount;          ///< Number of async engines
+    int unifiedAddressing;         ///< Does device and host share unified address space
+    int memoryClockRate;           ///< Max global memory clock frequency in khz.
+    int memoryBusWidth;            ///< Global memory bus width in bits.
+    int l2CacheSize;               ///< L2 cache size.
+    int persistingL2CacheMaxSize;  ///< Device's max L2 persisting lines in bytes
+    int maxThreadsPerMultiProcessor;    ///< Maximum resident threads per multi-processor.
+    int streamPrioritiesSupported;      ///< Device supports stream priority
+    int globalL1CacheSupported;         ///< Indicates globals are cached in L1
+    int localL1CacheSupported;          ///< Locals are cached in L1
+    size_t sharedMemPerMultiprocessor;  ///< Amount of shared memory available per multiprocessor.
+    int regsPerMultiprocessor;          ///< registers available per multiprocessor
+    int managedMemory;         ///< Device supports allocating managed memory on this system
+    int isMultiGpuBoard;       ///< 1 if device is on a multi-GPU board, 0 if not.
+    int multiGpuBoardGroupID;  ///< Unique identifier for a group of devices on same multiboard GPU
+    int hostNativeAtomicSupported;         ///< Link between host and device supports native atomics
+    int singleToDoublePrecisionPerfRatio;  ///< Deprecated. CUDA only.
+    int pageableMemoryAccess;              ///< Device supports coherently accessing pageable memory
+                                           ///< without calling hipHostRegister on it
+    int concurrentManagedAccess;  ///< Device can coherently access managed memory concurrently with
+                                  ///< the CPU
+    int computePreemptionSupported;         ///< Is compute preemption supported on the device
+    int canUseHostPointerForRegisteredMem;  ///< Device can access host registered memory with same
+                                            ///< address as the host
+    int cooperativeLaunch;                  ///< HIP device supports cooperative launch
+    int cooperativeMultiDeviceLaunch;       ///< HIP device supports cooperative launch on multiple
+                                            ///< devices
+    size_t
+        sharedMemPerBlockOptin;  ///< Per device max shared mem per block usable by special opt in
+    int pageableMemoryAccessUsesHostPageTables;  ///< Device accesses pageable memory via the host's
+                                                 ///< page tables
+    int directManagedMemAccessFromHost;  ///< Host can directly access managed memory on the device
+                                         ///< without migration
+    int maxBlocksPerMultiProcessor;      ///< Max number of blocks on CU
+    int accessPolicyMaxWindowSize;       ///< Max value of access policy window
+    size_t reservedSharedMemPerBlock;    ///< Shared memory reserved by driver per block
+    int hostRegisterSupported;           ///< Device supports hipHostRegister
+    int sparseHipArraySupported;         ///< Indicates if device supports sparse hip arrays
+    int hostRegisterReadOnlySupported;   ///< Device supports using the hipHostRegisterReadOnly flag
+                                         ///< with hipHostRegister
+    int timelineSemaphoreInteropSupported;  ///< Indicates external timeline semaphore support
+    int memoryPoolsSupported;  ///< Indicates if device supports hipMallocAsync and hipMemPool APIs
+    int gpuDirectRDMASupported;                    ///< Indicates device support of RDMA APIs
+    unsigned int gpuDirectRDMAFlushWritesOptions;  ///< Bitmask to be interpreted according to
+                                                   ///< hipFlushGPUDirectRDMAWritesOptions
+    int gpuDirectRDMAWritesOrdering;               ///< value of hipGPUDirectRDMAWritesOrdering
+    unsigned int
+        memoryPoolSupportedHandleTypes;  ///< Bitmask of handle types support with mempool based IPC
+    int deferredMappingHipArraySupported;  ///< Device supports deferred mapping HIP arrays and HIP
+                                           ///< mipmapped arrays
+    int ipcEventSupported;                 ///< Device supports IPC events
+    int clusterLaunch;                     ///< Device supports cluster launch
+    int unifiedFunctionPointers;           ///< Indicates device supports unified function pointers
+    int reserved[63];                      ///< CUDA Reserved.
+
+    int hipReserved[32];  ///< Reserved for adding new entries for HIP/CUDA.
+
+    /* HIP Only struct members */
+    char gcnArchName[256];                    ///< AMD GCN Arch Name. HIP Only.
+    size_t maxSharedMemoryPerMultiProcessor;  ///< Maximum Shared Memory Per CU. HIP Only.
+    int clockInstructionRate;  ///< Frequency in khz of the timer used by the device-side "clock*"
+                               ///< instructions.  New for HIP.
+    hipDeviceArch_t arch;      ///< Architectural feature flags.  New for HIP.
+    unsigned int* hdpMemFlushCntl;            ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+    unsigned int* hdpRegFlushCntl;            ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+    int cooperativeMultiDeviceUnmatchedFunc;  ///< HIP device supports cooperative launch on
+                                              ///< multiple
+                                              /// devices with unmatched functions
+    int cooperativeMultiDeviceUnmatchedGridDim;    ///< HIP device supports cooperative launch on
+                                                   ///< multiple
+                                                   /// devices with unmatched grid dimensions
+    int cooperativeMultiDeviceUnmatchedBlockDim;   ///< HIP device supports cooperative launch on
+                                                   ///< multiple
+                                                   /// devices with unmatched block dimensions
+    int cooperativeMultiDeviceUnmatchedSharedMem;  ///< HIP device supports cooperative launch on
+                                                   ///< multiple
+                                                   /// devices with unmatched shared memories
+    int isLargeBar;                                ///< 1: if it is a large PCI bar device, else 0
+    int asicRevision;                              ///< Revision of the GPU in this device
+} hipDeviceProp_t;
+
 //Flags that can be used with hipStreamCreateWithFlags.
 /** Default stream creation flags. These are used with hipStreamCreate().*/
 #define hipStreamDefault  0x00
@ -410,6 +601,7 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPRUNTIMEGETVERSION)      (int *);
 typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE)           (hipStream_t *, unsigned int);
 typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMDESTROY)          (hipStream_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMSYNCHRONIZE)      (hipStream_t);
+typedef hipError_t (HIP_API_CALL *HIP_HIPGETDEVICEPROPERTIES)    (hipDeviceProp_t *, hipDevice_t);

 typedef struct hc_hip_lib
 {
@ -453,6 +645,7 @@ typedef struct hc_hip_lib
  HIP_HIPSTREAMCREATE           hipStreamCreate;
  HIP_HIPSTREAMDESTROY          hipStreamDestroy;
  HIP_HIPSTREAMSYNCHRONIZE      hipStreamSynchronize;
+  HIP_HIPGETDEVICEPROPERTIES    hipGetDeviceProperties;

 } hc_hip_lib_t;

@ -498,5 +691,6 @@ int hc_hipRuntimeGetVersion    (void *hashcat_ctx, int *runtimeVersion);
 int hc_hipStreamCreate         (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags);
 int hc_hipStreamDestroy        (void *hashcat_ctx, hipStream_t hStream);
 int hc_hipStreamSynchronize    (void *hashcat_ctx, hipStream_t hStream);
+int hc_hipGetDeviceProperties  (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev);

 #endif // HC_EXT_HIP_H
--- a/include/shared.h
+++ b/include/shared.h
@ -114,4 +114,7 @@ bool is_apple_silicon (void);

 char *file_to_buffer (const char *filename);

+bool check_file_suffix (const char *file, const char *suffix);
+bool remove_file_suffix (char *file, const char *suffix);
+
 #endif // HC_SHARED_H
--- a/include/types.h
+++ b/include/types.h
@ -1235,6 +1235,7 @@ typedef struct hc_device_param

  int     sm_major;
  int     sm_minor;
+  char   *gcnArchName;
  u32     kernel_exec_timeout;

  u32     kernel_preferred_wgs_multiple;
--- a/src/backend.c
+++ b/src/backend.c
@ -408,7 +408,7 @@ static bool cuda_test_instruction (hashcat_ctx_t *hashcat_ctx, const int sm_majo
  nvrtc_options[0] = "--restrict";
  nvrtc_options[1] = "--gpu-architecture";

-  hc_asprintf (&nvrtc_options[2], "compute_%d%d", sm_major, sm_minor);
+  hc_asprintf (&nvrtc_options[2], "compute_%d", (sm_major * 10) + sm_minor); // uses this function's own sm_major/sm_minor parameters, e.g. 8.6 -> "compute_86"

  nvrtc_options[3] = NULL;

@ -5804,6 +5804,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)

      device_param->hip_warp_size = hip_warp_size;

+      // gcnArchName
+
+      hipDeviceProp_t prop;
+
+      if (hc_hipGetDeviceProperties (hashcat_ctx, &prop, hip_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      device_param->gcnArchName = strdup (prop.gcnArchName);
+
      // sm_minor, sm_major

      int sm_major = 0;
@ -8962,61 +8975,47 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p

      if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], kernel_name, 0, NULL, NULL) == -1) return false;

-      char **nvrtc_options = (char **) hccalloc (12 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
+      char **nvrtc_options = (char **) hccalloc (16 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
+
+      int nvrtc_options_idx = 0;

      if (backend_ctx->nvrtc_driver_version >= 12000)
      {
-        nvrtc_options[0] = "--std=c++14";
-      }
-      else
-      {
-        // some placeholder
-        nvrtc_options[0] = "--restrict";
+        nvrtc_options[nvrtc_options_idx++] = "--std=c++14";
      }

-      nvrtc_options[1] = "--restrict";
-      nvrtc_options[2] = "--device-as-default-execution-space";
-      nvrtc_options[3] = "--gpu-architecture";
+      nvrtc_options[nvrtc_options_idx++] = "--restrict";
+      nvrtc_options[nvrtc_options_idx++] = "--device-as-default-execution-space";
+      nvrtc_options[nvrtc_options_idx++] = "--gpu-architecture";

-      hc_asprintf (&nvrtc_options[4], "compute_%d%d", device_param->sm_major, device_param->sm_minor);
+      hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "compute_%d", (device_param->sm_major * 10) + device_param->sm_minor);

      if (backend_ctx->nvrtc_driver_version >= 12010)
      {
-        nvrtc_options[5] = "--split-compile";
+        nvrtc_options[nvrtc_options_idx++] = "--split-compile";

-        hc_asprintf (&nvrtc_options[6], "%d", 0);
-      }
-      else
-      {
-        // some placeholder
-        nvrtc_options[5] = "--restrict";
-        nvrtc_options[6] = "--restrict";
+        hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "%d", 0);
      }

      if (backend_ctx->nvrtc_driver_version >= 12040)
      {
-        nvrtc_options[7] = "--minimal";
-      }
-      else
-      {
-        // some placeholder
-        nvrtc_options[7] = "--restrict";
+        nvrtc_options[nvrtc_options_idx++] = "--minimal";
      }

      // untested on windows, but it should work
      #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__)
-      hc_asprintf (&nvrtc_options[8], "-D INCLUDE_PATH=%s", "OpenCL");
+      hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D INCLUDE_PATH=%s", "OpenCL");
      #else
-      hc_asprintf (&nvrtc_options[8], "-D INCLUDE_PATH=%s", folder_config->cpath_real);
+      hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D INCLUDE_PATH=%s", folder_config->cpath_real);
      #endif

-      hc_asprintf (&nvrtc_options[9], "-D XM2S(x)=#x");
-      hc_asprintf (&nvrtc_options[10], "-D M2S(x)=XM2S(x)");
-      hc_asprintf (&nvrtc_options[11], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
+      hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D XM2S(x)=#x");
+      hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D M2S(x)=XM2S(x)");
+      hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);

      char *nvrtc_options_string = hcstrdup (build_options_buf);

-      const int num_options = 12 + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + 12);
+      const int num_options = nvrtc_options_idx + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + nvrtc_options_idx);

      const int rc_nvrtcCompileProgram = hc_nvrtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) nvrtc_options);

@ -9238,42 +9237,27 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p

      if (hc_hiprtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], kernel_name, 0, NULL, NULL) == -1) return false;

-      char **hiprtc_options = (char **) hccalloc (8 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
+      char **hiprtc_options = (char **) hccalloc (16 + strlen (build_options_buf) + 1, sizeof (char *)); // ...

-      //hiprtc_options[0] = "--restrict";
-      //hiprtc_options[1] = "--device-as-default-execution-space";
-      //hiprtc_options[2] = "--gpu-architecture";
+      int hiprtc_options_idx = 0;

-      hc_asprintf (&hiprtc_options[0], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
-
-      /* 4.3 linux
-      hiprtc_options[1] = "-I";
-      hiprtc_options[2] = "/opt/rocm/hip/bin/include";
-      hiprtc_options[3] = "-I";
-      hiprtc_options[4] = "/opt/rocm/include";
-      hiprtc_options[5] = "-I";
-      */
-
-      hiprtc_options[1] = "";
-      hiprtc_options[2] = "";
-      hiprtc_options[3] = "";
-      hiprtc_options[4] = "";
-      hiprtc_options[5] = "";
+      hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
+      hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-architecture=%s", device_param->gcnArchName);

      // untested but it should work
      #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__)
-      hc_asprintf (&hiprtc_options[5], "-D INCLUDE_PATH=%s/OpenCL/", folder_config->cwd);
+      hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D INCLUDE_PATH=%s/OpenCL/", folder_config->cwd);
      // ugly, but required since HIPRTC is changing the current working folder to the temporary compile folder
      #else
-      hc_asprintf (&hiprtc_options[5], "-D INCLUDE_PATH=%s", folder_config->cpath_real);
+      hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D INCLUDE_PATH=%s", folder_config->cpath_real);
      #endif

-      hc_asprintf (&hiprtc_options[6], "-D XM2S(x)=#x");
-      hc_asprintf (&hiprtc_options[7], "-D M2S(x)=XM2S(x)");
+      hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D XM2S(x)=#x");
+      hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D M2S(x)=XM2S(x)");

      char *hiprtc_options_string = hcstrdup (build_options_buf);

-      const int num_options = 8 + hiprtc_make_options_array_from_string (hiprtc_options_string, hiprtc_options + 8);
+      const int num_options = hiprtc_options_idx + hiprtc_make_options_array_from_string (hiprtc_options_string, hiprtc_options + hiprtc_options_idx);

      const int rc_hiprtcCompileProgram = hc_hiprtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) hiprtc_options);

@ -10674,11 +10658,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

    char device_name_chksum_amp_mp[HCBUFSIZ_TINY] = { 0 };

-    const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%u-%d-%u-%s-%s-%s-%u-%u",
+    const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%u-%u-%u-%s-%d-%u-%s-%s-%s-%u-%u",
      backend_ctx->comptime,
      backend_ctx->cuda_driver_version,
      backend_ctx->hip_runtimeVersion,
      backend_ctx->metal_runtimeVersion,
+      device_param->sm_major,
+      device_param->sm_minor,
+      (device_param->is_hip == true) ? device_param->gcnArchName : "",
      device_param->is_opencl,
      device_param->opencl_platform_vendor_id,
      device_param->device_name,
@ -11237,11 +11224,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

      const u32 extra_value = (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) ? ATTACK_MODE_ASSOCIATION : ATTACK_MODE_NONE;

-      const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%u-%d-%u-%s-%s-%s-%d-%u-%u-%u-%u-%s",
+      const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%u-%u-%u-%s-%d-%u-%s-%s-%s-%d-%u-%u-%u-%u-%s",
        backend_ctx->comptime,
        backend_ctx->cuda_driver_version,
        backend_ctx->hip_runtimeVersion,
        backend_ctx->metal_runtimeVersion,
+        device_param->sm_major,
+        device_param->sm_minor,
+        (device_param->is_hip == true) ? device_param->gcnArchName : "",
        device_param->is_opencl,
        device_param->opencl_platform_vendor_id,
        device_param->device_name,
--- a/src/ext_hip.c
+++ b/src/ext_hip.c
@ -154,6 +154,7 @@ int hip_init (void *hashcat_ctx)
  HC_LOAD_FUNC_HIP (hip, hipStreamCreate,           hipStreamCreate,            HIP_HIPSTREAMCREATE,            HIP, 1);
  HC_LOAD_FUNC_HIP (hip, hipStreamDestroy,          hipStreamDestroy,           HIP_HIPSTREAMDESTROY,           HIP, 1);
  HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize,      hipStreamSynchronize,       HIP_HIPSTREAMSYNCHRONIZE,       HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties,    hipGetDevicePropertiesR0600,     HIP_HIPGETDEVICEPROPERTIES,     HIP, 1);

  return 0;
 }
@ -1148,3 +1149,30 @@ int hc_hipStreamSynchronize (void *hashcat_ctx, hipStream_t hStream)

  return 0;
 }
+
+// Wrapper around the dynamically loaded hipGetDeviceProperties() entry point.
+//
+// Asks the HIP runtime to fill *prop for device dev.
+// On failure the HIP error is reported through event_log_error(): as text when
+// hipGetErrorString() can translate it, as the raw error code otherwise.
+//
+// Returns 0 on success, -1 on failure.
+int hc_hipGetDeviceProperties (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
+
+  const hipError_t HIP_err = hip->hipGetDeviceProperties (prop, dev);
+
+  if (HIP_err != hipSuccess)
+  {
+    const char *pStr = NULL;
+
+    // log under the name of the call that actually failed
+    if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
+    {
+      event_log_error (hashcat_ctx, "hipGetDeviceProperties(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "hipGetDeviceProperties(): %d", HIP_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
--- a/src/shared.c
+++ b/src/shared.c
@ -1553,3 +1553,32 @@ int extract_dynamicx_hash (const u8 *input_buf, const int input_len, u8 **output

  return hash_mode;
 }
+
+// Tells whether the string file ends with the string suffix.
+//
+// Returns false when either argument is NULL or when suffix is longer than
+// file; otherwise returns true only if the trailing characters of file are
+// equal to suffix.
+bool check_file_suffix (const char *file, const char *suffix)
+{
+  if ((file == NULL) || (suffix == NULL)) return false;
+
+  const size_t file_len   = strlen (file);
+  const size_t suffix_len = strlen (suffix);
+
+  if (file_len < suffix_len) return false;
+
+  // start of the last suffix_len characters of file
+  const char *tail = file + (file_len - suffix_len);
+
+  if (strcmp (tail, suffix) != 0) return false;
+
+  return true;
+}
+
+// Strips suffix from the end of file, in place.
+//
+// Returns false and leaves file untouched when either argument is NULL or
+// when check_file_suffix() reports that file does not end with suffix.
+// Otherwise file is terminated where the suffix began and true is returned.
+bool remove_file_suffix (char *file, const char *suffix)
+{
+  if ((file == NULL) || (suffix == NULL)) return false;
+
+  if (check_file_suffix (file, suffix) == false) return false;
+
+  // offset of the first suffix character inside file
+  const size_t cut_pos = strlen (file) - strlen (suffix);
+
+  file[cut_pos] = 0;
+
+  return true;
+}
+