![]() |
GPUart
1.01
Prototype of GPUart, an application-based limited preemptive GPU scheduler for embedded real-time systems
|
Implementation of the management component of the GPUart Implemenation layer. More...
#include "GPUart_Impl.cuh"#include "GPUart_Impl.h"#include "GPUart_Impl_Abstr_IF.h"#include "GPUart_Impl_Sched_IF.h"#include "GPUart_Sobel.cuh"#include "GPUart_MatrMul.cuh"
Classes | |
| struct | device_global_memory_s |
| Typedef for a struct which combines global memory pointers, their related host pointers, and the size of the memory buffer. More... | |
| struct | device_constant_memory_s |
| Typedef for a struct which combines constant memory pointers and the size of the related memory buffer. More... | |
Macros | |
| #define | S_USE_ZERO_COPY_FOR_GLOBAL_APPLICATION_MEMORY |
| Use zero copy memory (requires integrated GPU) More... | |
| #define | C_PERSISTENT_KERNEL_EVENT_QUEUE_LENGTH (10) |
| The length of the Event Queue, shared between GPU and CPU, used for kernel launch events. More... | |
| #define | C_PERSISTENT_KERNEL_TERMINATE (-1) |
| Event ID to indicate a termination request for the persistent kernel. More... | |
Typedefs | |
| typedef cudaStream_t | command_queue_s |
| Typedef for command queues (streams) to abstract GPGPU-API. More... | |
Implementation of the management component of the GPUart Implemenation layer.
This file concentrates all GPGPU related memory declarations and allocations, memory transfers operations, kernel launches, kernel initialisations, and GPU related implementation details.
| #define C_PERSISTENT_KERNEL_EVENT_QUEUE_LENGTH (10) |
The length of the Event Queue, shared between GPU and CPU, used for kernel launch events.
| #define C_PERSISTENT_KERNEL_TERMINATE (-1) |
Event ID to indicate a termination request for the persistent kernel.
| #define S_USE_ZERO_COPY_FOR_GLOBAL_APPLICATION_MEMORY |
Use zero copy memory (requires integrated GPU)
This MUST be defined so far, since memory transfers over PCIe are currently not implemented completely.
| typedef cudaStream_t command_queue_s |
Typedef for command queues (streams) to abstract GPGPU-API.
Command queues are required to improve the concurrency of memory and kernel operatation on the GPU.
| __global__ void GPUart_Persistent_Kernel | ( | uint32 *__restrict__ | perKer_isRunning_u32_g, |
| uint32 *__restrict__ | perKer_eventQueueCntDevice_u32_g, | ||
| volatile uint32 *__restrict__ | perKer_eventQueueCntHost_u32_g, | ||
| volatile sint32 *__restrict__ | perKer_eventQueue_s32_g, | ||
| volatile uint32 *__restrict__ | perKer_kernelTasksRunningStates_u32_g, | ||
| sint32 *__restrict__ | sob1_matrix_in_s32_g, | ||
| sint32 *__restrict__ | sob1_matrix_out_s32_g, | ||
| sint32 *__restrict__ | sob2_matrix_in_s32_g, | ||
| sint32 *__restrict__ | sob2_matrix_out_s32_g, | ||
| float32 *__restrict__ | mm_matrix_A_f32_g, | ||
| float32 *__restrict__ | mm_matrix_B_f32_g, | ||
| float32 *__restrict__ | mm_matrix_C_f32_g, | ||
| uint32 *__restrict__ | sync_SOB1_flags_in_u32_g, | ||
| uint32 *__restrict__ | sync_SOB1_flags_out_u32_g, | ||
| uint32 *__restrict__ | sync_SOB2_flags_in_u32_g, | ||
| uint32 *__restrict__ | sync_SOB2_flags_out_u32_g, | ||
| uint32 *__restrict__ | sync_MM_flags_in_u32_g, | ||
| uint32 *__restrict__ | sync_MM_flags_out_u32_g, | ||
| sint32 *__restrict__ | preempt_SOB1_flag_g, | ||
| sint32 *__restrict__ | preempt_SOB1_flag_internal_g, | ||
| sint32 *__restrict__ | preempt_SOB1_sm_g, | ||
| sint32 *__restrict__ | preempt_SOB2_flag_g, | ||
| sint32 *__restrict__ | preempt_SOB2_flag_internal_g, | ||
| sint32 *__restrict__ | preempt_SOB2_sm_g, | ||
| sint32 *__restrict__ | preempt_MM_flag_g, | ||
| sint32 *__restrict__ | preempt_MM_sm_g, | ||
| uint32 *__restrict__ | sob1_buffer_loop_counter_u32_g, | ||
| uint32 *__restrict__ | sob2_buffer_loop_counter_u32_g, | ||
| uint32 *__restrict__ | mm_buffer_blockY_g, | ||
| uint32 *__restrict__ | mm_buffer_blockX_g, | ||
| uint32 *__restrict__ | mm_buffer_M_g | ||
| ) |
The persistent kernel (GPU Daemon) which is used to reduce kernel launch latencies.
The kernel arguments must include all global memory buffers of all kernels in this system, since this kernel is used to launch GPGPU kernel on demand. The persistent kernel reduces kernel launch latencies by bypassing the GPGPU driver stack when launching kernels.
| GPUart_Retval gpuI_destroy | ( | void | ) |

| GPUart_Retval gpuI_get_NrOfMultiprocessors | ( | uint32 * | nrOfMultprocessors, |
| uint32 | resourceFactor | ||
| ) |

| uint32 gpuI_getJobCosts | ( | kernel_task_id_e | task_id_e | ) |

| GPUart_Retval gpuI_init | ( | void | ) |

| GPUart_Retval gpuI_memcpyConstantMemory | ( | void * | variable_p, |
| device_constant_memory_id_e | id_p | ||
| ) |
Copy data from host memory to constant device memory.
The copy is only possible if persistent GPUart kernel GPUart_Persistent_Kernel is not running, since a constant memory data is immutable during kernel execution and its value is inherited from parent to child kernel.
| [in] | void | * variable_p -> The host variable to be copied |
| [in] | device_constant_memory_id_e | id_p -> The ID of the constant memory buffer |
| GPUart_Retval gpuI_memcpyDevice2Host | ( | void * | variable_p, |
| device_global_memory_id_e | id_p | ||
| ) |
Copy data from device memory to host memory.
Device memory may be shared physical memory or discrete device memory. The device driver API call may depend on the type of device memory (global or texture memory).
| [out] | void | * variable_p -> The host variable to be written |
| [in] | device_global_memory_id_e | id_p -> The ID of the global memory variable |

| GPUart_Retval gpuI_memcpyHost2Device | ( | void * | variable_p, |
| device_global_memory_id_e | id_p | ||
| ) |
Copy data from host memory to device memory.
Device memory may be shared physical memory or discrete device memory. The device driver API call may depend on the type of device memory (global or texture memory).
| [in] | void | * variable_p -> The host variable to be copied |
| [in] | device_global_memory_id_e | id_p -> The ID of the global memory variable |

| GPUart_Retval gpuI_preemptJob | ( | kernel_task_id_e | task_id_p | ) |

| uint32 gpuI_queryKernelIsRunning | ( | kernel_task_id_e | task_id_e | ) |
| uint32 gpuI_queryKernelPreempted | ( | kernel_task_id_e | task_id_e | ) |

| uint32 gpuI_queryKernelTerminatedSuccessful | ( | kernel_task_id_e | task_id_e | ) |

| GPUart_Retval gpuI_runJob | ( | kernel_task_id_e | task_id_e | ) |
Request the launch of a GPGPU kernel.
| kernel_task_id_e | task_id_e -> The ID of the kernel to be launched. |

| GPUart_Retval gpuI_SetKernelStatusReady | ( | kernel_task_id_e | task_id_e | ) |

| GPUart_Retval gpuI_start | ( | void | ) |

| GPUart_Retval gpuI_stop | ( | void | ) |

|
static |
The constant memory table.
All constant memory buffers which must be written during runtime must be defined here. The i'th element represents the i'th constant memory buffer, define by device_constant_memory_id_e in GPUart_Config.h. Each element must defined in the following style: { (void **)& CONSTANT_BUFFER_NAME, SIZE_IN_BYTES }.
|
static |
The kernel state machine table.
The i'th element represents the i'th kernel, according to the enum kernel_task_id_e in GPUart_Config.h. Each element must defined in the following style: &NAME_OF_STATE_MACHINE_POINTER. Use NULL if the related kernel is non-preemptive.
|
static |
The preemption flag table.
All preemption flags must be included by this table. The i'th element represents the i'th kernel, according to the enum kernel_task_id_e in GPUart_Config.h. Each element must defined in the following style: (volatile sint32**)& NAME_OF_PREEMPTION_FLAG_POINTER. If a kernel does not implement a preemption flag, because it is non-preemptive, insert a NULL.
|
static |
The global memory table.
All global memory buffers which must be written or red during runtime must be defined here. The i'th element represents the i'th global memory buffer, define by device_global_memory_id_e in GPUart_Config.h. Each element must defined in the following style: { (void **)& GLOBAL_MEMORY_BUFFER_POINTER_DEVICE, GLOBAL_MEMORY_BUFFER_POINTER_HOST, SIZE_IN_BYTES }.
|
static |
The device ID of the used GPU.
|
static |
The job cost table.
The i'th element represents the i'th kernel, according to the enum kernel_task_id_e in GPUart_Config.h. Each element represents the job costs of the related kernel. If a thread block of a kernel requires more then 1/µ of the available registers, shared memory, thread residency slots, or thread block residency slots of an Streaming Multiprocessor, then set corresponding value to m * µ, whereby µ is the resource factor and m is the GPU's number of Streaming Multiprocessors. If a thread block of a kernel requires less then 1/µ of each resource type, then set the corresponding value to the kernels number of thread blocks.
| uint32 max_costs_per_kernel = 0 |
The allowed job cost per kernel.
This value is equal to m * µ, whereby m is the number of Streaming Multiprocessors of the GPU gpuS_nrOfMultiprocessor_u32 and µ is the resource factor C_GPUS_RESOURCE_FACTOR.
|
static |
The command queue (stream) for memory operations.
| uint32* mm_buffer_blockX_g |
| uint32* mm_buffer_blockY_g |
| uint32* mm_buffer_M_g |
| float32* mm_matrix_A_f32_g |
| float32 * mm_matrix_A_f32_host |
| float32* mm_matrix_B_f32_g |
| float32 * mm_matrix_B_f32_host |
| float32* mm_matrix_C_f32_g |
| float32 * mm_matrix_C_f32_host |
|
static |
The number of state machines table.
The i'th element represents the i'th kernel, according to the enum kernel_task_id_e in GPUart_Config.h. Each element must defined in the following style: NUMBER_OF_SM_IN_KERNEL. If a kernel preempts grid-synchronous then use the value 1u. If a kernel preempts thread-block synchronous then use the number of thread blocks of this kernel. If a kernel is non-preemptive then use 0u.
| perKer_eventQueue_s32_g |
The event queue for kernel launch requests, written by the CPU and red by the GPU (device pointer).
To request a kernel launch, write the kernel's ID (kernel_task_id_e) into the tail of the queue. Write C_PERSISTENT_KERNEL_TERMINATE to terminate the persistent kernel GPUart_Persistent_Kernel.
| perKer_eventQueue_s32_host |
The event queue for kernel launch requests, written by the CPU and red by the GPU (host pointer).
To request a kernel launch, write the kernel's ID (kernel_task_id_e) into the tail of the queue. Write C_PERSISTENT_KERNEL_TERMINATE to terminate the persistent kernel GPUart_Persistent_Kernel.
| perKer_eventQueueCntDevice_u32_g |
The index of the head of the event queue for kernel launches written by the device (device pointer).
| perKer_eventQueueCntDevice_u32_host |
The index of the head of the event queue for kernel launches written by the device (host pointer).
| perKer_eventQueueCntHost_u32_g |
The index of the tail of the event queue for kernel launches written by the host (device pointer).
| perKer_eventQueueCntHost_u32_host |
The index of the tail of the event queue for kernel launches written by the host (host pointer).
| perKer_isRunning_u32_g |
A status flag, which represents the running status of the persistent kernel (device pointer).
| perKer_isRunning_u32_host |
A status flag, which represents the running status of the persistent kernel (host pointer).
| perKer_kernelTasksRunningStates_u32_g |
A status flag, which represents the running status of each kernel (device pointer).
| perKer_kernelTasksRunningStates_u32_host |
A status flag, which represents the running status of each kernel (host pointer).
|
static |
The command queue (stream) for the persistent kernel.
| sint32* preempt_MM_flag_g |
| volatile sint32* preempt_MM_flag_host |
| sint32* preempt_MM_sm_g |
| volatile sint32* preempt_MM_sm_host |
| sint32* preempt_SOB1_flag_g |
| volatile sint32* preempt_SOB1_flag_host |
| sint32* preempt_SOB1_flag_internal_g |
| sint32* preempt_SOB1_sm_g |
| volatile sint32* preempt_SOB1_sm_host |
| sint32* preempt_SOB2_flag_g |
| volatile sint32* preempt_SOB2_flag_host |
| sint32* preempt_SOB2_flag_internal_g |
| sint32* preempt_SOB2_sm_g |
| volatile sint32* preempt_SOB2_sm_host |
|
static |
The preemption enabled table.
The i'th element represents the i'th kernel, according to the enum kernel_task_id_e in GPUart_Config.h. Each element must defined in the following style: C_TRUE if the related kernel is preemptive; C_FALSE if the related kernel is non-preemptive.
| uint32* sob1_buffer_loop_counter_u32_g |
| sint32* sob1_matrix_in_s32_g |
| sint32 * sob1_matrix_in_s32_host |
| sint32* sob1_matrix_out_s32_g |
| sint32 * sob1_matrix_out_s32_host |
| uint32* sob2_buffer_loop_counter_u32_g |
| sint32* sob2_matrix_in_s32_g |
| sint32 * sob2_matrix_in_s32_host |
| sint32* sob2_matrix_out_s32_g |
| sint32 * sob2_matrix_out_s32_host |
| uint32* sync_MM_flags_in_u32_g |
| uint32* sync_MM_flags_out_u32_g |
| uint32* sync_SOB1_flags_in_u32_g |
| uint32* sync_SOB1_flags_out_u32_g |
| uint32* sync_SOB2_flags_in_u32_g |
| uint32* sync_SOB2_flags_out_u32_g |
1.8.6