From c033873e4bd419e39df52dc308c95e74f14f145d Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 18 Jun 2025 18:29:47 +0200 Subject: [PATCH] Update hipDeviceAttribute_t for ROCm 6.x Add hipDeviceProp_t and bindings for hipGetDeviceProperties(), hipGetDeviceProperties is required to retrieve gcnArchName[]. Add gcnArchName[] to select the correct --gpu-architecture value for a specific device when using hiprtc. Include sm_major and sm_minor for CUDA and gcnArchName for HIP in the kernel filename hash. Update nvrtc_options[] and hiprtc_options[] to avoid unused variables, eliminating the use of --restrict as a placeholder and preventing nvrtc from aborting. Add check_file_suffix() and remove_file_suffix() helper functions. --- include/ext_hip.h | 258 ++++++++++++++++++++++++++++++++++++++++------ include/shared.h | 3 + include/types.h | 1 + src/backend.c | 106 +++++++++---------- src/ext_hip.c | 28 +++++ src/shared.c | 29 ++++++ 6 files changed, 335 insertions(+), 90 deletions(-) diff --git a/include/ext_hip.h b/include/ext_hip.h index eb0d52a59..b9dfcb4c8 100644 --- a/include/ext_hip.h +++ b/include/ext_hip.h @@ -165,38 +165,43 @@ typedef enum __HIP_NODISCARD hipError_t { #undef __HIP_NODISCARD + +/** + * hipDeviceAttribute_t + * hipDeviceAttributeUnused number: 5 + */ typedef enum hipDeviceAttribute_t { hipDeviceAttributeCudaCompatibleBegin = 0, hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled. hipDeviceAttributeAccessPolicyMaxWindowSize, ///< Cuda only. The maximum size of the window policy in bytes. - hipDeviceAttributeAsyncEngineCount, ///< Cuda only. Asynchronous engines number. + hipDeviceAttributeAsyncEngineCount, ///< Asynchronous engines number. hipDeviceAttributeCanMapHostMemory, ///< Whether host memory can be mapped into device address space - hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Cuda only. 
Device can access host registered memory + hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Device can access host registered memory ///< at the same virtual address as the CPU hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. - hipDeviceAttributeComputePreemptionSupported, ///< Cuda only. Device supports Compute Preemption. + hipDeviceAttributeComputePreemptionSupported, ///< Device supports Compute Preemption. hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory concurrently with the CPU hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices - hipDeviceAttributeDeviceOverlap, ///< Cuda only. Device can concurrently copy memory and execute a kernel. + hipDeviceAttributeDeviceOverlap, ///< Device can concurrently copy memory and execute a kernel. ///< Deprecated. Use instead asyncEngineCount. hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on ///< the device without migration - hipDeviceAttributeGlobalL1CacheSupported, ///< Cuda only. Device supports caching globals in L1 - hipDeviceAttributeHostNativeAtomicSupported, ///< Cuda only. Link between the device and the host supports native atomic operations + hipDeviceAttributeGlobalL1CacheSupported, ///< Device supports caching globals in L1 + hipDeviceAttributeHostNativeAtomicSupported, ///< Link between the device and the host supports native atomic operations hipDeviceAttributeIntegrated, ///< Device is integrated GPU hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. 
hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. hipDeviceAttributeLocalL1CacheSupported, ///< caching locals in L1 is supported - hipDeviceAttributeLuid, ///< Cuda only. 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms - hipDeviceAttributeLuidDeviceNodeMask, ///< Cuda only. Luid device node mask. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuid, ///< 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuidDeviceNodeMask, ///< Luid device node mask. Undefined on TCC and non-Windows platforms hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system - hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Cuda only. Max block size per multiprocessor + hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Max block size per multiprocessor hipDeviceAttributeMaxBlockDimX, ///< Max block size in width. hipDeviceAttributeMaxBlockDimY, ///< Max block size in height. hipDeviceAttributeMaxBlockDimZ, ///< Max block size in depth. @@ -211,22 +216,22 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeMaxSurfaceCubemap, ///< Cuda only. Maximum dimensions of Cubemap surface. hipDeviceAttributeMaxSurfaceCubemapLayered, ///< Cuda only. Maximum dimension of Cubemap layered surface. hipDeviceAttributeMaxTexture1DWidth, ///< Maximum size of 1D texture. - hipDeviceAttributeMaxTexture1DLayered, ///< Cuda only. Maximum dimensions of 1D layered texture. + hipDeviceAttributeMaxTexture1DLayered, ///< Maximum dimensions of 1D layered texture. hipDeviceAttributeMaxTexture1DLinear, ///< Maximum number of elements allocatable in a 1D linear texture. ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. 
- hipDeviceAttributeMaxTexture1DMipmap, ///< Cuda only. Maximum size of 1D mipmapped texture. + hipDeviceAttributeMaxTexture1DMipmap, ///< Maximum size of 1D mipmapped texture. hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D texture. - hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D texture. - hipDeviceAttributeMaxTexture2DGather, ///< Cuda only. Maximum dimensions of 2D texture if gather operations performed. - hipDeviceAttributeMaxTexture2DLayered, ///< Cuda only. Maximum dimensions of 2D layered texture. - hipDeviceAttributeMaxTexture2DLinear, ///< Cuda only. Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. - hipDeviceAttributeMaxTexture2DMipmap, ///< Cuda only. Maximum dimensions of 2D mipmapped texture. + hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D texture. + hipDeviceAttributeMaxTexture2DGather, ///< Maximum dimensions of 2D texture if gather operations performed. + hipDeviceAttributeMaxTexture2DLayered, ///< Maximum dimensions of 2D layered texture. + hipDeviceAttributeMaxTexture2DLinear, ///< Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. + hipDeviceAttributeMaxTexture2DMipmap, ///< Maximum dimensions of 2D mipmapped texture. hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D texture. hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D texture. hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D texture. - hipDeviceAttributeMaxTexture3DAlt, ///< Cuda only. Maximum dimensions of alternate 3D texture. - hipDeviceAttributeMaxTextureCubemap, ///< Cuda only. Maximum dimensions of Cubemap texture - hipDeviceAttributeMaxTextureCubemapLayered, ///< Cuda only. Maximum dimensions of Cubemap layered texture. + hipDeviceAttributeMaxTexture3DAlt, ///< Maximum dimensions of alternate 3D texture.
+ hipDeviceAttributeMaxTextureCubemap, ///< Maximum dimensions of Cubemap texture + hipDeviceAttributeMaxTextureCubemapLayered, ///< Maximum dimensions of Cubemap layered texture. hipDeviceAttributeMaxThreadsDim, ///< Maximum dimension of a block hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. @@ -234,45 +239,47 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. - hipDeviceAttributeMultiGpuBoardGroupID, ///< Cuda only. Unique ID of device group on the same multi-GPU board + hipDeviceAttributeMultiGpuBoardGroupID, ///< Unique ID of device group on the same multi-GPU board hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. - hipDeviceAttributeName, ///< Device name. + hipDeviceAttributeUnused1, ///< Previously hipDeviceAttributeName hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory ///< without calling hipHostRegister on it hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables hipDeviceAttributePciBusId, ///< PCI Bus ID. hipDeviceAttributePciDeviceId, ///< PCI Device ID. hipDeviceAttributePciDomainID, ///< PCI Domain ID. - hipDeviceAttributePersistingL2CacheMaxSize, ///< Cuda11 only. Maximum l2 persisting lines capacity in bytes + hipDeviceAttributePersistingL2CacheMaxSize, ///< Maximum l2 persisting lines capacity in bytes hipDeviceAttributeMaxRegistersPerBlock, ///< 32-bit registers available to a thread block. This number is shared ///< by all thread blocks simultaneously resident on a multiprocessor. 
hipDeviceAttributeMaxRegistersPerMultiprocessor, ///< 32-bit registers available per block. - hipDeviceAttributeReservedSharedMemPerBlock, ///< Cuda11 only. Shared memory reserved by CUDA driver per block. + hipDeviceAttributeReservedSharedMemPerBlock, ///< Shared memory reserved by CUDA driver per block. hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. - hipDeviceAttributeSharedMemPerBlockOptin, ///< Cuda only. Maximum shared memory per block usable by special opt in. - hipDeviceAttributeSharedMemPerMultiprocessor, ///< Cuda only. Shared memory available per multiprocessor. + hipDeviceAttributeSharedMemPerBlockOptin, ///< Maximum shared memory per block usable by special opt in. + hipDeviceAttributeSharedMemPerMultiprocessor, ///< Shared memory available per multiprocessor. hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision. - hipDeviceAttributeStreamPrioritiesSupported, ///< Cuda only. Whether to support stream priorities. - hipDeviceAttributeSurfaceAlignment, ///< Cuda only. Alignment requirement for surfaces + hipDeviceAttributeStreamPrioritiesSupported, ///< Whether to support stream priorities. + hipDeviceAttributeSurfaceAlignment, ///< Alignment requirement for surfaces hipDeviceAttributeTccDriver, ///< Cuda only. Whether device is a Tesla device using TCC driver hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory; hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. - hipDeviceAttributeTotalGlobalMem, ///< Global memory available on device. + hipDeviceAttributeTotalGlobalMem, ///< Global memory available on device. hipDeviceAttributeUnifiedAddressing, ///< Cuda only. An unified address space shared with the host. - hipDeviceAttributeUuid, ///< Cuda only.
Unique ID in 16 byte. + hipDeviceAttributeUnused2, ///< Previously hipDeviceAttributeUuid hipDeviceAttributeWarpSize, ///< Warp size in threads. hipDeviceAttributeMemoryPoolsSupported, ///< Device supports HIP Stream Ordered Memory Allocator hipDeviceAttributeVirtualMemoryManagementSupported, ///< Device supports HIP virtual memory management + hipDeviceAttributeHostRegisterSupported, ///< Can device support host memory registration via hipHostRegister + hipDeviceAttributeMemoryPoolSupportedHandleTypes, ///< Supported handle mask for HIP Stream Ordered Memory Allocator hipDeviceAttributeCudaCompatibleEnd = 9999, hipDeviceAttributeAmdSpecificBegin = 10000, hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin, ///< Frequency in khz of the timer used by the device-side "clock*" - hipDeviceAttributeArch, ///< Device architecture + hipDeviceAttributeUnused3, ///< Previously hipDeviceAttributeArch hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory PerMultiprocessor. - hipDeviceAttributeGcnArch, ///< Device gcn architecture - hipDeviceAttributeGcnArchName, ///< Device gcnArch name in 256 bytes + hipDeviceAttributeUnused4, ///< Previously hipDeviceAttributeGcnArch + hipDeviceAttributeUnused5, ///< Previously hipDeviceAttributeGcnArchName hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple @@ -298,6 +305,190 @@ typedef enum hipDeviceAttribute_t { // Extended attributes for vendors } hipDeviceAttribute_t; +/** + * hipDeviceArch_t + * + */ +typedef struct { + // 32-bit Atomics + unsigned hasGlobalInt32Atomics : 1; ///< 32-bit integer atomics for global memory. + unsigned hasGlobalFloatAtomicExch : 1; ///< 32-bit float atomic exch for global memory. 
+ unsigned hasSharedInt32Atomics : 1; ///< 32-bit integer atomics for shared memory. + unsigned hasSharedFloatAtomicExch : 1; ///< 32-bit float atomic exch for shared memory. + unsigned hasFloatAtomicAdd : 1; ///< 32-bit float atomic add in global and shared memory. + + // 64-bit Atomics + unsigned hasGlobalInt64Atomics : 1; ///< 64-bit integer atomics for global memory. + unsigned hasSharedInt64Atomics : 1; ///< 64-bit integer atomics for shared memory. + + // Doubles + unsigned hasDoubles : 1; ///< Double-precision floating point. + + // Warp cross-lane operations + unsigned hasWarpVote : 1; ///< Warp vote instructions (__any, __all). + unsigned hasWarpBallot : 1; ///< Warp ballot instructions (__ballot). + unsigned hasWarpShuffle : 1; ///< Warp shuffle operations. (__shfl_*). + unsigned hasFunnelShift : 1; ///< Funnel two words into one with shift&mask caps. + + // Sync + unsigned hasThreadFenceSystem : 1; ///< __threadfence_system. + unsigned hasSyncThreadsExt : 1; ///< __syncthreads_count, syncthreads_and, syncthreads_or. + + // Misc + unsigned hasSurfaceFuncs : 1; ///< Surface functions. + unsigned has3dGrid : 1; ///< Grid and group dims are 3D (rather than 2D). + unsigned hasDynamicParallelism : 1; ///< Dynamic parallelism. +} hipDeviceArch_t; + +typedef struct hipUUID_t { + char bytes[16]; +} hipUUID; + + +/** + * hipDeviceProp + * + */ +typedef struct hipDeviceProp_t { + char name[256]; ///< Device name. + hipUUID uuid; ///< UUID of a device + char luid[8]; ///< 8-byte unique identifier. Only valid on windows + unsigned int luidDeviceNodeMask; ///< LUID node mask + size_t totalGlobalMem; ///< Size of global memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory per block (in bytes). + int regsPerBlock; ///< Registers per block. + int warpSize; ///< Warp size. 
+ size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies + ///< pitched memory + int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. + int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; ///< Max grid dimensions (XYZ). + int clockRate; ///< Max clock frequency of the multiProcessors in khz. + size_t totalConstMem; ///< Size of shared constant memory region on the device + ///< (in bytes). + int major; ///< Major compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + int minor; ///< Minor compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + size_t textureAlignment; ///< Alignment requirement for textures + size_t texturePitchAlignment; ///< Pitch alignment requirement for texture references bound to + int deviceOverlap; ///< Deprecated. Use asyncEngineCount instead + int multiProcessorCount; ///< Number of multi-processors (compute units). + int kernelExecTimeoutEnabled; ///< Run time limit for kernels executed on the device + int integrated; ///< APU vs dGPU + int canMapHostMemory; ///< Check whether HIP can map host memory + int computeMode; ///< Compute mode. 
+ int maxTexture1D; ///< Maximum number of elements in 1D images + int maxTexture1DMipmap; ///< Maximum 1D mipmap texture size + int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory + int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements + int maxTexture2DMipmap[2]; ///< Maximum number of elements in 2D array mipmap of images + int maxTexture2DLinear[3]; ///< Maximum 2D tex dimensions if tex are bound to pitched memory + int maxTexture2DGather[2]; ///< Maximum 2D tex dimensions if gather has to be performed + int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image + ///< elements + int maxTexture3DAlt[3]; ///< Maximum alternate 3D texture dims + int maxTextureCubemap; ///< Maximum cubemap texture dims + int maxTexture1DLayered[2]; ///< Maximum number of elements in 1D array images + int maxTexture2DLayered[3]; ///< Maximum number of elements in 2D array images + int maxTextureCubemapLayered[2]; ///< Maximum cubemaps layered texture dims + int maxSurface1D; ///< Maximum 1D surface size + int maxSurface2D[2]; ///< Maximum 2D surface size + int maxSurface3D[3]; ///< Maximum 3D surface size + int maxSurface1DLayered[2]; ///< Maximum 1D layered surface size + int maxSurface2DLayered[3]; ///< Maximum 2D layered surface size + int maxSurfaceCubemap; ///< Maximum cubemap surface size + int maxSurfaceCubemapLayered[2]; ///< Maximum cubemap layered surface size + size_t surfaceAlignment; ///< Alignment requirement for surface + int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. + int ECCEnabled; ///< Device has ECC support enabled + int pciBusID; ///< PCI Bus ID. + int pciDeviceID; ///< PCI Device ID.
+ int pciDomainID; ///< PCI Domain ID + int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0 + int asyncEngineCount; ///< Number of async engines + int unifiedAddressing; ///< Does device and host share unified address space + int memoryClockRate; ///< Max global memory clock frequency in khz. + int memoryBusWidth; ///< Global memory bus width in bits. + int l2CacheSize; ///< L2 cache size. + int persistingL2CacheMaxSize; ///< Device's max L2 persisting lines in bytes + int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. + int streamPrioritiesSupported; ///< Device supports stream priority + int globalL1CacheSupported; ///< Indicates globals are cached in L1 + int localL1CacheSupported; ///< Locals are cached in L1 + size_t sharedMemPerMultiprocessor; ///< Amount of shared memory available per multiprocessor. + int regsPerMultiprocessor; ///< registers available per multiprocessor + int managedMemory; ///< Device supports allocating managed memory on this system + int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. + int multiGpuBoardGroupID; ///< Unique identifier for a group of devices on same multiboard GPU + int hostNativeAtomicSupported; ///< Link between host and device supports native atomics + int singleToDoublePrecisionPerfRatio; ///< Deprecated. CUDA only.
+ int pageableMemoryAccess; ///< Device supports coherently accessing pageable memory + ///< without calling hipHostRegister on it + int concurrentManagedAccess; ///< Device can coherently access managed memory concurrently with + ///< the CPU + int computePreemptionSupported; ///< Is compute preemption supported on the device + int canUseHostPointerForRegisteredMem; ///< Device can access host registered memory with same + ///< address as the host + int cooperativeLaunch; ///< HIP device supports cooperative launch + int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple + ///< devices + size_t + sharedMemPerBlockOptin; ///< Per device max shared mem per block usable by special opt in + int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's + ///< page tables + int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device + ///< without migration + int maxBlocksPerMultiProcessor; ///< Max number of blocks on CU + int accessPolicyMaxWindowSize; ///< Max value of access policy window + size_t reservedSharedMemPerBlock; ///< Shared memory reserved by driver per block + int hostRegisterSupported; ///< Device supports hipHostRegister + int sparseHipArraySupported; ///< Indicates if device supports sparse hip arrays + int hostRegisterReadOnlySupported; ///< Device supports using the hipHostRegisterReadOnly flag + ///< with hipHostRegister + int timelineSemaphoreInteropSupported; ///< Indicates external timeline semaphore support + int memoryPoolsSupported; ///< Indicates if device supports hipMallocAsync and hipMemPool APIs + int gpuDirectRDMASupported; ///< Indicates device support of RDMA APIs + unsigned int gpuDirectRDMAFlushWritesOptions; ///< Bitmask to be interpreted according to + ///< hipFlushGPUDirectRDMAWritesOptions + int gpuDirectRDMAWritesOrdering; ///< value of hipGPUDirectRDMAWritesOrdering + unsigned int + memoryPoolSupportedHandleTypes; ///<
Bitmask of handle types support with mempool based IPC + int deferredMappingHipArraySupported; ///< Device supports deferred mapping HIP arrays and HIP + ///< mipmapped arrays + int ipcEventSupported; ///< Device supports IPC events + int clusterLaunch; ///< Device supports cluster launch + int unifiedFunctionPointers; ///< Indicates device supports unified function pointers + int reserved[63]; ///< CUDA Reserved. + + int hipReserved[32]; ///< Reserved for adding new entries for HIP/CUDA. + + /* HIP Only struct members */ + char gcnArchName[256]; ///< AMD GCN Arch Name. HIP Only. + size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per CU. HIP Only. + int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" + ///< instructions. New for HIP. + hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. + unsigned int* hdpMemFlushCntl; ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register + unsigned int* hdpRegFlushCntl; ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register + int cooperativeMultiDeviceUnmatchedFunc; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched functions + int cooperativeMultiDeviceUnmatchedGridDim; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched grid dimensions + int cooperativeMultiDeviceUnmatchedBlockDim; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched block dimensions + int cooperativeMultiDeviceUnmatchedSharedMem; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched shared memories + int isLargeBar; ///< 1: if it is a large PCI bar device, else 0 + int asicRevision; ///< Revision of the GPU in this device +} hipDeviceProp_t; + //Flags that can be used with hipStreamCreateWithFlags. /** Default stream creation flags.
These are used with hipStreamCreate().*/ #define hipStreamDefault 0x00 @@ -410,6 +601,7 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPRUNTIMEGETVERSION) (int *); typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE) (hipStream_t *, unsigned int); typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMDESTROY) (hipStream_t); typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMSYNCHRONIZE) (hipStream_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPGETDEVICEPROPERTIES) (hipDeviceProp_t *, hipDevice_t); typedef struct hc_hip_lib { @@ -453,6 +645,7 @@ typedef struct hc_hip_lib HIP_HIPSTREAMCREATE hipStreamCreate; HIP_HIPSTREAMDESTROY hipStreamDestroy; HIP_HIPSTREAMSYNCHRONIZE hipStreamSynchronize; + HIP_HIPGETDEVICEPROPERTIES hipGetDeviceProperties; } hc_hip_lib_t; @@ -498,5 +691,6 @@ int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion); int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags); int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream); int hc_hipStreamSynchronize (void *hashcat_ctx, hipStream_t hStream); +int hc_hipGetDeviceProperties (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev); #endif // HC_EXT_HIP_H diff --git a/include/shared.h b/include/shared.h index bcc98e5d9..bc27b49e0 100644 --- a/include/shared.h +++ b/include/shared.h @@ -114,4 +114,7 @@ bool is_apple_silicon (void); char *file_to_buffer (const char *filename); +bool check_file_suffix (const char *file, const char *suffix); +bool remove_file_suffix (char *file, const char *suffix); + #endif // HC_SHARED_H diff --git a/include/types.h b/include/types.h index 8f265ab14..da6f12c62 100644 --- a/include/types.h +++ b/include/types.h @@ -1235,6 +1235,7 @@ typedef struct hc_device_param int sm_major; int sm_minor; + char *gcnArchName; u32 kernel_exec_timeout; u32 kernel_preferred_wgs_multiple; diff --git a/src/backend.c b/src/backend.c index 057d27252..c69792622 100644 --- a/src/backend.c +++ b/src/backend.c @@ -408,7 +408,7 @@ static bool 
cuda_test_instruction (hashcat_ctx_t *hashcat_ctx, const int sm_majo nvrtc_options[0] = "--restrict"; nvrtc_options[1] = "--gpu-architecture"; - hc_asprintf (&nvrtc_options[2], "compute_%d%d", sm_major, sm_minor); + hc_asprintf (&nvrtc_options[2], "compute_%d", (sm_major * 10) + sm_minor); nvrtc_options[3] = NULL; @@ -5804,6 +5804,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->hip_warp_size = hip_warp_size; + // gcnArchName + + hipDeviceProp_t prop; + + if (hc_hipGetDeviceProperties (hashcat_ctx, &prop, hip_device) == -1) + { + device_param->skipped = true; + + continue; + } + + device_param->gcnArchName = strdup (prop.gcnArchName); + // sm_minor, sm_major int sm_major = 0; @@ -8962,61 +8975,47 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p if (hc_nvrtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], kernel_name, 0, NULL, NULL) == -1) return false; - char **nvrtc_options = (char **) hccalloc (12 + strlen (build_options_buf) + 1, sizeof (char *)); // ... + char **nvrtc_options = (char **) hccalloc (16 + strlen (build_options_buf) + 1, sizeof (char *)); // ...
+ + int nvrtc_options_idx = 0; if (backend_ctx->nvrtc_driver_version >= 12000) { - nvrtc_options[0] = "--std=c++14"; - } - else - { - // some placeholder - nvrtc_options[0] = "--restrict"; + nvrtc_options[nvrtc_options_idx++] = "--std=c++14"; } - nvrtc_options[1] = "--restrict"; - nvrtc_options[2] = "--device-as-default-execution-space"; - nvrtc_options[3] = "--gpu-architecture"; + nvrtc_options[nvrtc_options_idx++] = "--restrict"; + nvrtc_options[nvrtc_options_idx++] = "--device-as-default-execution-space"; + nvrtc_options[nvrtc_options_idx++] = "--gpu-architecture"; - hc_asprintf (&nvrtc_options[4], "compute_%d%d", device_param->sm_major, device_param->sm_minor); + hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "compute_%d", (device_param->sm_major * 10) + device_param->sm_minor); if (backend_ctx->nvrtc_driver_version >= 12010) { - nvrtc_options[5] = "--split-compile"; + nvrtc_options[nvrtc_options_idx++] = "--split-compile"; - hc_asprintf (&nvrtc_options[6], "%d", 0); - } - else - { - // some placeholder - nvrtc_options[5] = "--restrict"; - nvrtc_options[6] = "--restrict"; + hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "%d", 0); } if (backend_ctx->nvrtc_driver_version >= 12040) { - nvrtc_options[7] = "--minimal"; - } - else - { - // some placeholder - nvrtc_options[7] = "--restrict"; + nvrtc_options[nvrtc_options_idx++] = "--minimal"; } // untested on windows, but it should work #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__) - hc_asprintf (&nvrtc_options[8], "-D INCLUDE_PATH=%s", "OpenCL"); + hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D INCLUDE_PATH=%s", "OpenCL"); #else - hc_asprintf (&nvrtc_options[8], "-D INCLUDE_PATH=%s", folder_config->cpath_real); + hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D INCLUDE_PATH=%s", folder_config->cpath_real); #endif - hc_asprintf (&nvrtc_options[9], "-D XM2S(x)=#x"); - hc_asprintf (&nvrtc_options[10], "-D M2S(x)=XM2S(x)"); - hc_asprintf (&nvrtc_options[11], "-D 
MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); + hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D XM2S(x)=#x"); + hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D M2S(x)=XM2S(x)"); + hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); char *nvrtc_options_string = hcstrdup (build_options_buf); - const int num_options = 12 + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + 12); + const int num_options = nvrtc_options_idx + nvrtc_make_options_array_from_string (nvrtc_options_string, nvrtc_options + nvrtc_options_idx); const int rc_nvrtcCompileProgram = hc_nvrtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) nvrtc_options); @@ -9238,42 +9237,27 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p if (hc_hiprtcCreateProgram (hashcat_ctx, &program, kernel_sources[0], kernel_name, 0, NULL, NULL) == -1) return false; - char **hiprtc_options = (char **) hccalloc (8 + strlen (build_options_buf) + 1, sizeof (char *)); // ... + char **hiprtc_options = (char **) hccalloc (16 + strlen (build_options_buf) + 1, sizeof (char *)); // ... - //hiprtc_options[0] = "--restrict"; - //hiprtc_options[1] = "--device-as-default-execution-space"; - //hiprtc_options[2] = "--gpu-architecture"; + int hiprtc_options_idx = 0; - hc_asprintf (&hiprtc_options[0], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : device_param->kernel_threads_max); - - /* 4.3 linux - hiprtc_options[1] = "-I"; - hiprtc_options[2] = "/opt/rocm/hip/bin/include"; - hiprtc_options[3] = "-I"; - hiprtc_options[4] = "/opt/rocm/include"; - hiprtc_options[5] = "-I"; - */ - - hiprtc_options[1] = ""; - hiprtc_options[2] = ""; - hiprtc_options[3] = ""; - hiprtc_options[4] = ""; - hiprtc_options[5] = ""; + hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); + hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-architecture=%s", device_param->gcnArchName); // untested but it should work #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__) - hc_asprintf (&hiprtc_options[5], "-D INCLUDE_PATH=%s/OpenCL/", folder_config->cwd); + hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D INCLUDE_PATH=%s/OpenCL/", folder_config->cwd); // ugly, but required since HIPRTC is changing the current working folder to the temporary compile folder #else - hc_asprintf (&hiprtc_options[5], "-D INCLUDE_PATH=%s", folder_config->cpath_real); + hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D INCLUDE_PATH=%s", folder_config->cpath_real); #endif - hc_asprintf (&hiprtc_options[6], "-D XM2S(x)=#x"); - hc_asprintf (&hiprtc_options[7], "-D M2S(x)=XM2S(x)"); + hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D XM2S(x)=#x"); + hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D M2S(x)=XM2S(x)"); char *hiprtc_options_string = hcstrdup (build_options_buf); - const int num_options = 8 + hiprtc_make_options_array_from_string (hiprtc_options_string, hiprtc_options + 8); + const int num_options = hiprtc_options_idx + hiprtc_make_options_array_from_string (hiprtc_options_string, hiprtc_options + hiprtc_options_idx); const int rc_hiprtcCompileProgram = hc_hiprtcCompileProgram (hashcat_ctx, program, num_options, (const char * const *) 
hiprtc_options); @@ -10674,11 +10658,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) char device_name_chksum_amp_mp[HCBUFSIZ_TINY] = { 0 }; - const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%u-%d-%u-%s-%s-%s-%u-%u", + const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%u-%u-%u-%s-%d-%u-%s-%s-%s-%u-%u", backend_ctx->comptime, backend_ctx->cuda_driver_version, backend_ctx->hip_runtimeVersion, backend_ctx->metal_runtimeVersion, + device_param->sm_major, + device_param->sm_minor, + (device_param->is_hip == true) ? device_param->gcnArchName : "", device_param->is_opencl, device_param->opencl_platform_vendor_id, device_param->device_name, @@ -11237,11 +11224,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) const u32 extra_value = (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) ? ATTACK_MODE_ASSOCIATION : ATTACK_MODE_NONE; - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%u-%d-%u-%s-%s-%s-%d-%u-%u-%u-%u-%s", + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%u-%u-%u-%s-%d-%u-%s-%s-%s-%d-%u-%u-%u-%u-%s", backend_ctx->comptime, backend_ctx->cuda_driver_version, backend_ctx->hip_runtimeVersion, backend_ctx->metal_runtimeVersion, + device_param->sm_major, + device_param->sm_minor, + (device_param->is_hip == true) ? 
device_param->gcnArchName : "", device_param->is_opencl, device_param->opencl_platform_vendor_id, device_param->device_name, diff --git a/src/ext_hip.c b/src/ext_hip.c index 0611733aa..75b061037 100644 --- a/src/ext_hip.c +++ b/src/ext_hip.c @@ -154,6 +154,7 @@ int hip_init (void *hashcat_ctx) HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties, hipGetDevicePropertiesR0600, HIP_HIPGETDEVICEPROPERTIES, HIP, 1); return 0; } @@ -1148,3 +1149,30 @@ int hc_hipStreamSynchronize (void *hashcat_ctx, hipStream_t hStream) return 0; } + +int hc_hipGetDeviceProperties (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev) /* passes prop and dev to hip->hipGetDeviceProperties(); returns 0 on success, -1 after logging the HIP error */ +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipGetDeviceProperties (prop, dev); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipGetDeviceProperties(): %s", pStr); + } + else /* no error string could be resolved: log the raw error code instead */ + { + event_log_error (hashcat_ctx, "hipGetDeviceProperties(): %d", HIP_err); + } + + return -1; + } + + return 0; +} diff --git a/src/shared.c b/src/shared.c index 47645d655..4661c6934 100644 --- a/src/shared.c +++ b/src/shared.c @@ -1553,3 +1553,32 @@ int extract_dynamicx_hash (const u8 *input_buf, const int input_len, u8 **output return hash_mode; } + +bool check_file_suffix (const char *file, const char *suffix) /* true only when file ends with suffix; NULL arguments yield false */ +{ + if (file == NULL) return false; + if (suffix == NULL) return false; + + const size_t len_file = strlen (file); + const size_t len_suffix = strlen (suffix); + + if (len_suffix > len_file) return false; /* also keeps the tail offset below from wrapping */ + + return strcmp (file + len_file - len_suffix, suffix) == 0; +} 
+ +/* Cuts a trailing suffix off file in place. Returns true when the suffix was present and removed, false (with file left untouched) otherwise. */ +bool remove_file_suffix (char *file, const char *suffix) +{ + if ((file == NULL) || (suffix == NULL)) return false; + + if (check_file_suffix (file, suffix) == false) return false; + + /* a positive suffix check means suffix is no longer than file, so the offset cannot wrap */ + char *suffix_start = file + (strlen (file) - strlen (suffix)); + + *suffix_start = 0; + + return true; +} +