diff --git a/include/ext_hip.h b/include/ext_hip.h index c29309ef3..7d9095588 100644 --- a/include/ext_hip.h +++ b/include/ext_hip.h @@ -161,129 +161,80 @@ typedef enum __HIP_NODISCARD hipError_t { #undef __HIP_NODISCARD typedef enum hipDeviceAttribute_t { - hipDeviceAttributeCudaCompatibleBegin = 0, + hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. + hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block. + hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block. + hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block. + hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid. + hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid. + hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid. + hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in + ///< bytes. + hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. + hipDeviceAttributeWarpSize, ///< Warp size in threads. + hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a + ///< thread block. This number is shared by all thread + ///< blocks simultaneously resident on a + ///< multiprocessor. + hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. + hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. + hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. + hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. + hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. + hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 + ///< cache. + hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per + ///< multiprocessor. + hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. 
+ hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. + hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels + ///< concurrently. + hipDeviceAttributePciBusId, ///< PCI Bus ID. + hipDeviceAttributePciDeviceId, ///< PCI Device ID. + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per + ///< Multiprocessor. + hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. + hipDeviceAttributeIntegrated, ///< iGPU + hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices + hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images + hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements + hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements + hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D images in image elements + hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D images in image elements + hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D images in image elements - hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled. - hipDeviceAttributeAccessPolicyMaxWindowSize, ///< Cuda only. The maximum size of the window policy in bytes. - hipDeviceAttributeAsyncEngineCount, ///< Cuda only. Asynchronous engines number. - hipDeviceAttributeCanMapHostMemory, ///< Whether host memory can be mapped into device address space - hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Cuda only. Device can access host registered memory - ///< at the same virtual address as the CPU - hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. - hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. 
- hipDeviceAttributeComputePreemptionSupported, ///< Cuda only. Device supports Compute Preemption. - hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. - hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory concurrently with the CPU - hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch - hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices - hipDeviceAttributeDeviceOverlap, ///< Cuda only. Device can concurrently copy memory and execute a kernel. - ///< Deprecated. Use instead asyncEngineCount. - hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on - ///< the device without migration - hipDeviceAttributeGlobalL1CacheSupported, ///< Cuda only. Device supports caching globals in L1 - hipDeviceAttributeHostNativeAtomicSupported, ///< Cuda only. Link between the device and the host supports native atomic operations - hipDeviceAttributeIntegrated, ///< Device is integrated GPU - hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. - hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device - hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. - hipDeviceAttributeLocalL1CacheSupported, ///< caching locals in L1 is supported - hipDeviceAttributeLuid, ///< Cuda only. 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms - hipDeviceAttributeLuidDeviceNodeMask, ///< Cuda only. Luid device node mask. Undefined on TCC and non-Windows platforms - hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. - hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system - hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Cuda only. 
Max block size per multiprocessor - hipDeviceAttributeMaxBlockDimX, ///< Max block size in width. - hipDeviceAttributeMaxBlockDimY, ///< Max block size in height. - hipDeviceAttributeMaxBlockDimZ, ///< Max block size in depth. - hipDeviceAttributeMaxGridDimX, ///< Max grid size in width. - hipDeviceAttributeMaxGridDimY, ///< Max grid size in height. - hipDeviceAttributeMaxGridDimZ, ///< Max grid size in depth. - hipDeviceAttributeMaxSurface1D, ///< Maximum size of 1D surface. - hipDeviceAttributeMaxSurface1DLayered, ///< Cuda only. Maximum dimensions of 1D layered surface. - hipDeviceAttributeMaxSurface2D, ///< Maximum dimension (width, height) of 2D surface. - hipDeviceAttributeMaxSurface2DLayered, ///< Cuda only. Maximum dimensions of 2D layered surface. - hipDeviceAttributeMaxSurface3D, ///< Maximum dimension (width, height, depth) of 3D surface. - hipDeviceAttributeMaxSurfaceCubemap, ///< Cuda only. Maximum dimensions of Cubemap surface. - hipDeviceAttributeMaxSurfaceCubemapLayered, ///< Cuda only. Maximum dimension of Cubemap layered surface. - hipDeviceAttributeMaxTexture1DWidth, ///< Maximum size of 1D texture. - hipDeviceAttributeMaxTexture1DLayered, ///< Cuda only. Maximum dimensions of 1D layered texture. - hipDeviceAttributeMaxTexture1DLinear, ///< Maximum number of elements allocatable in a 1D linear texture. - ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. - hipDeviceAttributeMaxTexture1DMipmap, ///< Cuda only. Maximum size of 1D mipmapped texture. - hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D texture. - hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension hight of 2D texture. - hipDeviceAttributeMaxTexture2DGather, ///< Cuda only. Maximum dimensions of 2D texture if gather operations performed. - hipDeviceAttributeMaxTexture2DLayered, ///< Cuda only. Maximum dimensions of 2D layered texture. - hipDeviceAttributeMaxTexture2DLinear, ///< Cuda only. 
Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. - hipDeviceAttributeMaxTexture2DMipmap, ///< Cuda only. Maximum dimensions of 2D mipmapped texture. - hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D texture. - hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D texture. - hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D texture. - hipDeviceAttributeMaxTexture3DAlt, ///< Cuda only. Maximum dimensions of alternate 3D texture. - hipDeviceAttributeMaxTextureCubemap, ///< Cuda only. Maximum dimensions of Cubemap texture - hipDeviceAttributeMaxTextureCubemapLayered, ///< Cuda only. Maximum dimensions of Cubemap layered texture. - hipDeviceAttributeMaxThreadsDim, ///< Maximum dimension of a block - hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. - hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. - hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies - hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. - hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. - hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. - hipDeviceAttributeMultiGpuBoardGroupID, ///< Cuda only. Unique ID of device group on the same multi-GPU board - hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. - hipDeviceAttributeName, ///< Device name. - hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory - ///< without calling hipHostRegister on it - hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables - hipDeviceAttributePciBusId, ///< PCI Bus ID. - hipDeviceAttributePciDeviceId, ///< PCI Device ID. - hipDeviceAttributePciDomainID, ///< PCI Domain ID. 
- hipDeviceAttributePersistingL2CacheMaxSize, ///< Cuda11 only. Maximum l2 persisting lines capacity in bytes - hipDeviceAttributeMaxRegistersPerBlock, ///< 32-bit registers available to a thread block. This number is shared - ///< by all thread blocks simultaneously resident on a multiprocessor. - hipDeviceAttributeMaxRegistersPerMultiprocessor, ///< 32-bit registers available per block. - hipDeviceAttributeReservedSharedMemPerBlock, ///< Cuda11 only. Shared memory reserved by CUDA driver per block. - hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. - hipDeviceAttributeSharedMemPerBlockOptin, ///< Cuda only. Maximum shared memory per block usable by special opt in. - hipDeviceAttributeSharedMemPerMultiprocessor, ///< Cuda only. Shared memory available per multiprocessor. - hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision. - hipDeviceAttributeStreamPrioritiesSupported, ///< Cuda only. Whether to support stream priorities. - hipDeviceAttributeSurfaceAlignment, ///< Cuda only. Alignment requirement for surfaces - hipDeviceAttributeTccDriver, ///< Cuda only. Whether device is a Tesla device using TCC driver - hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures - hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory; - hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. - hipDeviceAttributeTotalGlobalMem, ///< Global memory available on devicice. - hipDeviceAttributeUnifiedAddressing, ///< Cuda only. An unified address space shared with the host. - hipDeviceAttributeUuid, ///< Cuda only. Unique ID in 16 byte. - hipDeviceAttributeWarpSize, ///< Warp size in threads. 
+ hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register - hipDeviceAttributeCudaCompatibleEnd = 9999, - hipDeviceAttributeAmdSpecificBegin = 10000, + hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies + hipDeviceAttributeTextureAlignment, ///is_hip == true) + { + const u32 native_threads = device_param->kernel_preferred_wgs_multiple; + + if ((native_threads >= device_param->kernel_threads_min) && (native_threads <= device_param->kernel_threads_max)) + { + device_param->kernel_threads_min = native_threads; + device_param->kernel_threads_max = native_threads; + } + else + { + // abort? + } + } + if (hashconfig->opts_type & OPTS_TYPE_NATIVE_THREADS) { u32 native_threads = 0;