On this blog I am exploring how local memory operates with regard to a work group (of work items).
We create a simple kernel that will export the IDs — global id, local id, group id — of a work item.
Furthermore we'll instantiate a local integer using the prefix __local within the kernel function
and we'll increment it (++).
Local memory is shared within a work group, so we'll see what happens during execution as we capture the value of that
variable and pass it to the global slot of the work item that is running.
Oh yeah, we will also have an output array alongside the IDs and throw the value each item saw in there.
So while we'll have one local variable, we'll get its state across multiple work items.
This is the code:
// Program version shown in the terminal (translation had mangled the
// `version` keyword and used typographic quotes, which do not compile).
#property version "1.00"
// Expert/indicator initialization handler: arm a 33 ms timer so the
// OpenCL experiment runs once in OnTimer (off the init path).
int OnInit()
{
EventSetMillisecondTimer(33);
return(INIT_SUCCEEDED);
}
// Deinitialization handler. Nothing to clean up: the timer is killed in
// OnTimer and all OpenCL handles are freed there as well.
// NOTE: the parameter is the standard MQL5 deinit `reason` code; the
// translated source had garbled its name to "purpose".
void OnDeinit(const int reason)
{
}
// Timer event: fires once (the timer is killed immediately), then runs the
// local-memory experiment:
//   * builds a kernel where every work item increments a __local int `f`
//     and exports the value it observed plus its global/local/group ids,
//   * executes it for 8 work items,
//   * dumps the per-item results to a text file and prints device stats.
// Restored from the machine-translated listing: straight quotes, `while`
// keyword, "\r\n"/"\n" escape sequences and English output strings.
void OnTimer(){
EventKillTimer();
// Create an OpenCL context on a GPU supporting double precision
int ctx=CLContextCreate(CL_USE_GPU_DOUBLE_ONLY);
if(ctx!=INVALID_HANDLE){
string kernel="__kernel void memtests(__global int* global_id,"
"__global int* local_id,"
"__global int* group_id,"
"__global int* output){\r\n"
"//initialized once in local memory for each compute unit\r\n"
"__local int f;"
"f++;"
"output[get_global_id(0)]=f;"
"global_id[get_global_id(0)]=get_global_id(0);"
"local_id[get_global_id(0)]=get_local_id(0);"
"group_id[get_global_id(0)]=get_group_id(0);}";
string errors="";
int prg=CLProgramCreate(ctx,kernel,errors);
if(prg!=INVALID_HANDLE){
ResetLastError();
int ker=CLKernelCreate(prg,"memtests");
if(ker!=INVALID_HANDLE){
int items=8;
// Host-side receiving arrays, one slot per work item
int global_ids[];ArrayResize(global_ids,items,0);
ArrayFill(global_ids,0,items,0);
int local_ids[];ArrayResize(local_ids,items,0);
ArrayFill(local_ids,0,items,0);
int group_ids[];ArrayResize(group_ids,items,0);
int output[];ArrayResize(output,items,0);
// Device buffers: 4 bytes (one int) per work item
int global_id_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
int local_id_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
int group_id_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
int output_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
CLSetKernelArgMem(ker,0,global_id_handle);
CLSetKernelArgMem(ker,1,local_id_handle);
CLSetKernelArgMem(ker,2,group_id_handle);
CLSetKernelArgMem(ker,3,output_handle);
uint offsets[]={0};
uint works[]={items};
CLExecute(ker,1,offsets,works);
// Poll until the kernel completes
while(CLExecutionStatus(ker)!=CL_COMPLETE){Sleep(10);}
Print("Kernel finished");
CLBufferRead(global_id_handle,global_ids,0,0,items);
CLBufferRead(local_id_handle,local_ids,0,0,items);
CLBufferRead(group_id_handle,group_ids,0,0,items);
CLBufferRead(output_handle,output,0,0,items);
// Write one line per work item: ids + the value it saw in `f`
int f=FileOpen("OCLlocalmemtestlog.txt",FILE_WRITE|FILE_TXT);
for(int i=0;i<items;i++){
FileWriteString(f,"GLOBAL.ID["+IntegerToString(i)+"]="+IntegerToString(global_ids[i])+" : LOCAL.ID["+IntegerToString(i)+"]="+IntegerToString(local_ids[i])+" : GROUP.ID["+IntegerToString(i)+"]="+IntegerToString(group_ids[i])+" : OUTPUT["+IntegerToString(i)+"]="+IntegerToString(output[i])+"\n");
}
FileClose(f);
// Highest group id observed == number of groups - 1
int groups_created=group_ids[0];
for(int i=0;i<ArraySize(group_ids);i++){
if(group_ids[i]>groups_created){groups_created=group_ids[i];}
}
int compute_units=CLGetInfoInteger(ker,CL_DEVICE_MAX_COMPUTE_UNITS);
int kernel_local_mem_size=CLGetInfoInteger(ker,CL_KERNEL_LOCAL_MEM_SIZE);
int kernel_private_mem_size=CLGetInfoInteger(ker,CL_KERNEL_PRIVATE_MEM_SIZE);
int kernel_work_group_size=CLGetInfoInteger(ker,CL_KERNEL_WORK_GROUP_SIZE);
int device_max_work_group_size=CLGetInfoInteger(ctx,CL_DEVICE_MAX_WORK_GROUP_SIZE);
Print("Kernel local mem ("+kernel_local_mem_size+")");
Print("Kernel private mem ("+kernel_private_mem_size+")");
Print("Kernel work group size ("+kernel_work_group_size+")");
Print("Device max work group size("+device_max_work_group_size+")");
Print("Device max compute units("+compute_units+")");
Print("Device Local Mem Size ("+CLGetInfoInteger(ctx,CL_DEVICE_LOCAL_MEM_SIZE)+")");
Print("------------------");
Print("Groups created : "+IntegerToString(groups_created+1));
// Release all GPU resources
CLKernelFree(ker);
CLBufferFree(global_id_handle);
CLBufferFree(local_id_handle);
CLBufferFree(group_id_handle);
CLBufferFree(output_handle);
}else{Print("Cannot create kernel");}
CLProgramFree(prg);
}else{Alert(errors);}
CLContextFree(ctx);
}
else{
Print("Cannot create ctx");
}
}
We output the contents of the id arrays and the output array to a file and examine them. Let's run it for 8 items; on this system we must surpass 256 items for it to start splitting into groups automatically, as reported by the kernel work group size. We have 8 items, so there will only be 1 group.
Here is the file output:
GLOBAL.ID[0]=0 : LOCAL.ID[0]=0 : GROUP.ID[0]=0 : OUTPUT[0]=1
GLOBAL.ID[1]=1 : LOCAL.ID[1]=1 : GROUP.ID[1]=0 : OUTPUT[1]=1
GLOBAL.ID[2]=2 : LOCAL.ID[2]=2 : GROUP.ID[2]=0 : OUTPUT[2]=1
GLOBAL.ID[3]=3 : LOCAL.ID[3]=3 : GROUP.ID[3]=0 : OUTPUT[3]=1
GLOBAL.ID[4]=4 : LOCAL.ID[4]=4 : GROUP.ID[4]=0 : OUTPUT[4]=1
GLOBAL.ID[5]=5 : LOCAL.ID[5]=5 : GROUP.ID[5]=0 : OUTPUT[5]=1
GLOBAL.ID[6]=6 : LOCAL.ID[6]=6 : GROUP.ID[6]=0 : OUTPUT[6]=1
GLOBAL.ID[7]=7 : LOCAL.ID[7]=7 : GROUP.ID[7]=0 : OUTPUT[7]=1
We can see all the ids printed, and the output: all the values are 1.
You probably expected that, as all these items executed at the same time, so the initial value they saw before they ++'ed it was 0.
The specs state that the local integer f is instantiated per compute unit, but a work group runs in a single compute unit, so the next question is: is it instantiated per work group as well?
Let's find out. Let's add a local[] uint array to pass to the execute function, as seen in the previous blogs, to split the work into 2 work groups with 4 work items each.
We'll probably see the same output, and the only change will be in the local ids and group ids.
This is the line we add above the execute call; to use it we just add it as the last argument of the execution function.
(we are specifying 4 items per group in this dimension)
uint local[]={4};
CLExecute(ker,1,offsets,works,local);
As expected, our code creates 2 groups:
2023.05.04 00:59:05.922 blog_simple_local_mem_operation (USDJPY,H1) Groups created : 2
And this is the output file:
GLOBAL.ID[0]=0 : LOCAL.ID[0]=0 : GROUP.ID[0]=0 : OUTPUT[0]=1
GLOBAL.ID[1]=1 : LOCAL.ID[1]=1 : GROUP.ID[1]=0 : OUTPUT[1]=1
GLOBAL.ID[2]=2 : LOCAL.ID[2]=2 : GROUP.ID[2]=0 : OUTPUT[2]=1
GLOBAL.ID[3]=3 : LOCAL.ID[3]=3 : GROUP.ID[3]=0 : OUTPUT[3]=1
GLOBAL.ID[4]=4 : LOCAL.ID[4]=0 : GROUP.ID[4]=1 : OUTPUT[4]=1
GLOBAL.ID[5]=5 : LOCAL.ID[5]=1 : GROUP.ID[5]=1 : OUTPUT[5]=1
GLOBAL.ID[6]=6 : LOCAL.ID[6]=2 : GROUP.ID[6]=1 : OUTPUT[6]=1
GLOBAL.ID[7]=7 : LOCAL.ID[7]=3 : GROUP.ID[7]=1 : OUTPUT[7]=1
As expected, the local memory integer f is instantiated (or allocated? whatever you call it) per work group.
Awesome.
But what if you want the value (of f) to increase across the work items so that you can use it?
There are commands to do that with the atomic_ prefix; in this case we are interested in atomic_inc.
What they do is essentially "guard" the area around the variable f while a work item changes it, so I guess there is a small hit on speed.
(hope I'm not butchering the explanation here)
So let's write a version of the above which exports both the atomic and the non-atomic value; we'll name them accordingly.
The code now looks like this:
// Timer event (second experiment): same setup as before, but the kernel now
// keeps TWO __local ints — one incremented with atomic_inc(), one with a
// plain ++ — and exports both, so we can compare atomic vs non-atomic
// updates to local memory across the work items of each group.
// Restored from the machine-translated listing: straight quotes, `while`
// keyword, `local[]` work-group size array, "\r\n"/"\n" escapes and
// English output strings.
void OnTimer(){
EventKillTimer();
int ctx=CLContextCreate(CL_USE_GPU_DOUBLE_ONLY);
if(ctx!=INVALID_HANDLE){
// NOTE: every work item writes 0 into the shared __local ints before
// incrementing — that race is benign here and is part of the experiment.
string kernel="__kernel void memtests(__global int* global_id,"
"__global int* local_id,"
"__global int* group_id,"
"__global int* atomic_output,"
"__global int* non_atomic_output){\r\n"
"//initialized once in local memory for each compute unit\r\n"
"__local int with_atomic,without_atomic;"
"with_atomic=0;"
"without_atomic=0;"
"atomic_inc(&with_atomic);"
"without_atomic++;"
"atomic_output[get_global_id(0)]=with_atomic;"
"non_atomic_output[get_global_id(0)]=without_atomic;"
"global_id[get_global_id(0)]=get_global_id(0);"
"local_id[get_global_id(0)]=get_local_id(0);"
"group_id[get_global_id(0)]=get_group_id(0);}";
string errors="";
int prg=CLProgramCreate(ctx,kernel,errors);
if(prg!=INVALID_HANDLE){
ResetLastError();
int ker=CLKernelCreate(prg,"memtests");
if(ker!=INVALID_HANDLE){
int items=8;
// Host-side receiving arrays, one slot per work item
int global_ids[];ArrayResize(global_ids,items,0);
ArrayFill(global_ids,0,items,0);
int local_ids[];ArrayResize(local_ids,items,0);
ArrayFill(local_ids,0,items,0);
int group_ids[];ArrayResize(group_ids,items,0);
int atomic_output[];ArrayResize(atomic_output,items,0);
int non_atomic_output[];ArrayResize(non_atomic_output,items,0);
// Device buffers: 4 bytes (one int) per work item
int global_id_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
int local_id_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
int group_id_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
int atomic_output_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
int non_atomic_output_handle=CLBufferCreate(ctx,items*4,CL_MEM_WRITE_ONLY);
CLSetKernelArgMem(ker,0,global_id_handle);
CLSetKernelArgMem(ker,1,local_id_handle);
CLSetKernelArgMem(ker,2,group_id_handle);
CLSetKernelArgMem(ker,3,atomic_output_handle);
CLSetKernelArgMem(ker,4,non_atomic_output_handle);
uint offsets[]={0};
uint works[]={items};
// 4 items per work group in this dimension -> 2 groups for 8 items
uint local[]={4};
CLExecute(ker,1,offsets,works,local);
// Poll until the kernel completes
while(CLExecutionStatus(ker)!=CL_COMPLETE){Sleep(10);}
Print("Kernel finished");
CLBufferRead(global_id_handle,global_ids,0,0,items);
CLBufferRead(local_id_handle,local_ids,0,0,items);
CLBufferRead(group_id_handle,group_ids,0,0,items);
CLBufferRead(atomic_output_handle,atomic_output,0,0,items);
CLBufferRead(non_atomic_output_handle,non_atomic_output,0,0,items);
// Write one line per work item: ids + atomic and non-atomic values
int f=FileOpen("OCLlocalmemtestlog.txt",FILE_WRITE|FILE_TXT);
for(int i=0;i<items;i++){
FileWriteString(f,"GLOBAL.ID["+IntegerToString(i)+"]="+IntegerToString(global_ids[i])+" : LOCAL.ID["+IntegerToString(i)+"]="+IntegerToString(local_ids[i])+" : GROUP.ID["+IntegerToString(i)+"]="+IntegerToString(group_ids[i])+" : ATOMIC.OUTPUT["+IntegerToString(i)+"]="+IntegerToString(atomic_output[i])+" : NON-ATOMIC.OUTPUT["+IntegerToString(i)+"]="+IntegerToString(non_atomic_output[i])+"\n");
}
FileClose(f);
// Highest group id observed == number of groups - 1
int groups_created=group_ids[0];
for(int i=0;i<ArraySize(group_ids);i++){
if(group_ids[i]>groups_created){groups_created=group_ids[i];}
}
int compute_units=CLGetInfoInteger(ker,CL_DEVICE_MAX_COMPUTE_UNITS);
int kernel_local_mem_size=CLGetInfoInteger(ker,CL_KERNEL_LOCAL_MEM_SIZE);
int kernel_private_mem_size=CLGetInfoInteger(ker,CL_KERNEL_PRIVATE_MEM_SIZE);
int kernel_work_group_size=CLGetInfoInteger(ker,CL_KERNEL_WORK_GROUP_SIZE);
int device_max_work_group_size=CLGetInfoInteger(ctx,CL_DEVICE_MAX_WORK_GROUP_SIZE);
Print("Kernel local mem ("+kernel_local_mem_size+")");
Print("Kernel private mem ("+kernel_private_mem_size+")");
Print("Kernel work group size ("+kernel_work_group_size+")");
Print("Device max work group size("+device_max_work_group_size+")");
Print("Device max compute units("+compute_units+")");
Print("Device Local Mem Size ("+CLGetInfoInteger(ctx,CL_DEVICE_LOCAL_MEM_SIZE)+")");
Print("------------------");
Print("Groups created : "+IntegerToString(groups_created+1));
// Release all GPU resources
CLKernelFree(ker);
CLBufferFree(global_id_handle);
CLBufferFree(local_id_handle);
CLBufferFree(group_id_handle);
CLBufferFree(atomic_output_handle);
CLBufferFree(non_atomic_output_handle);
}else{Print("Cannot create kernel");}
CLProgramFree(prg);
}else{Alert(errors);}
CLContextFree(ctx);
}
else{
Print("Cannot create ctx");
}
}
So we initialize the variables with_atomic and without_atomic per group,
and we will be exporting their values too. Let's run it with the same number of items and local items:
GLOBAL.ID[0]=0 : LOCAL.ID[0]=0 : GROUP.ID[0]=0 : ATOMIC.OUTPUT[0]=4 : NON-ATOMIC.OUTPUT[0]=1
GLOBAL.ID[1]=1 : LOCAL.ID[1]=1 : GROUP.ID[1]=0 : ATOMIC.OUTPUT[1]=4 : NON-ATOMIC.OUTPUT[1]=1
GLOBAL.ID[2]=2 : LOCAL.ID[2]=2 : GROUP.ID[2]=0 : ATOMIC.OUTPUT[2]=4 : NON-ATOMIC.OUTPUT[2]=1
GLOBAL.ID[3]=3 : LOCAL.ID[3]=3 : GROUP.ID[3]=0 : ATOMIC.OUTPUT[3]=4 : NON-ATOMIC.OUTPUT[3]=1
GLOBAL.ID[4]=4 : LOCAL.ID[4]=0 : GROUP.ID[4]=1 : ATOMIC.OUTPUT[4]=4 : NON-ATOMIC.OUTPUT[4]=1
GLOBAL.ID[5]=5 : LOCAL.ID[5]=1 : GROUP.ID[5]=1 : ATOMIC.OUTPUT[5]=4 : NON-ATOMIC.OUTPUT[5]=1
GLOBAL.ID[6]=6 : LOCAL.ID[6]=2 : GROUP.ID[6]=1 : ATOMIC.OUTPUT[6]=4 : NON-ATOMIC.OUTPUT[6]=1
GLOBAL.ID[7]=7 : LOCAL.ID[7]=3 : GROUP.ID[7]=1 : ATOMIC.OUTPUT[7]=4 : NON-ATOMIC.OUTPUT[7]=1
Aaaand we get thiiis .... hmmm
The atomic one gives us the last value it had — why?
Well, if we look at the code, we are passing — to the global atomic_output array — the value of the local variable almost at the end of the work group's execution.
So imagine this:
4 work items (of the first group) enter the compute unit for execution. Each one gets assigned to a Processing Element. The CU initializes the 2 integers with_atomic and without_atomic. Each work item starts executing in parallel. Usually a calculation is way faster than a transfer to global memory, and we can say that apart from the atomic_inc() call nothing else holds back the work items until each reaches the point where it is supposed to send data back to the with_atomic array. So by the time each item reaches the output stage, the value of with_atomic is already 4.
We might be able to see this if we attach a private int to the atomic call and add 1 to it.
The Khronos documentation states this for atomic_inc():
Read the 32-bit value (referred to as old) stored at the location pointed to by p. Compute (old + 1) and store the result at the location pointed to by p. The function returns old.
So it is telling us that if we place an integer on the left of the atomic call we'll receive the old value of the local variable.
This means the local variable will be locked, then the private integer will receive the value at that point, then the operation (++ because of inc()) will happen on the local variable, and then it will unlock. So we are getting the value this work item "used".
Then, since the variable we have is private, we'll slap +1 on it and get the value at that instant!
Let's export that too, in fact.
We add one more buffer, one more argument, we link the buffer to the kernel, we retrieve the value at the end, we print it in the file and we don't forget to free the buffer.
Now, the kernel looks like this up to the part that gets the "old" value from the atomic call:
int this_item_only;
this_item_only=atomic_inc(&with_atomic)+1;
This creates a private int for each item called this_item_only, then it takes the old value of the local variable with_atomic and adds one to it.
(excuse my variable names — they're for the test, and left over from several test runs)
Here is the file output:
GLOBAL.ID[0]=0 : LOCAL.ID[0]=0 : GROUP.ID[0]=0 : ATOMIC.OUTPUT[0]=4 : NON-ATOMIC.OUTPUT[0]=1 : INSTANCE.OUTPUT[0]=1
GLOBAL.ID[1]=1 : LOCAL.ID[1]=1 : GROUP.ID[1]=0 : ATOMIC.OUTPUT[1]=4 : NON-ATOMIC.OUTPUT[1]=1 : INSTANCE.OUTPUT[1]=2
GLOBAL.ID[2]=2 : LOCAL.ID[2]=2 : GROUP.ID[2]=0 : ATOMIC.OUTPUT[2]=4 : NON-ATOMIC.OUTPUT[2]=1 : INSTANCE.OUTPUT[2]=3
GLOBAL.ID[3]=3 : LOCAL.ID[3]=3 : GROUP.ID[3]=0 : ATOMIC.OUTPUT[3]=4 : NON-ATOMIC.OUTPUT[3]=1 : INSTANCE.OUTPUT[3]=4
GLOBAL.ID[4]=4 : LOCAL.ID[4]=0 : GROUP.ID[4]=1 : ATOMIC.OUTPUT[4]=4 : NON-ATOMIC.OUTPUT[4]=1 : INSTANCE.OUTPUT[4]=1
GLOBAL.ID[5]=5 : LOCAL.ID[5]=1 : GROUP.ID[5]=1 : ATOMIC.OUTPUT[5]=4 : NON-ATOMIC.OUTPUT[5]=1 : INSTANCE.OUTPUT[5]=2
GLOBAL.ID[6]=6 : LOCAL.ID[6]=2 : GROUP.ID[6]=1 : ATOMIC.OUTPUT[6]=4 : NON-ATOMIC.OUTPUT[6]=1 : INSTANCE.OUTPUT[6]=3
GLOBAL.ID[7]=7 : LOCAL.ID[7]=3 : GROUP.ID[7]=1 : ATOMIC.OUTPUT[7]=4 : NON-ATOMIC.OUTPUT[7]=1 : INSTANCE.OUTPUT[7]=4
Hell yeah.
Now, the first time I did this I used 512 items (instead of 8), which allowed me to find another possible issue:
I was getting "ATOMIC.OUTPUT" values of 224 on the second group instead of 256 (group size was 256 per group, 2 groups).
That was caused by the execution offset of some of the work items: 32 items were starting later, leading to them not having reached the
atomic_inc part yet, while at that same time the other 224 work items had hit their export-to-global-memory stage, thus reporting 224 as the atomic output.
Here is a schematic:
The 224 items reach the global memory part while the remaining 32 items have not yet hit the atomic_inc part.
You may have guessed the solution from the articles: it's a barrier.
What the barrier is going to do is stop each GROUP ITEM at the line you place it at, until ALL OTHER GROUP ITEMS reach that line too.
That solved the issue; this is the line of code:
barrier(CLK_GLOBAL_MEM_FENCE);
If we specified CLK_LOCAL_MEM_FENCE instead, then the items IN THE GROUP would not be able to do anything to local memory until ALL ITEMS of that group had hit that line.
Where do you think this line went?
Correct — right before the export to global memory.
I am attaching the final source code.
Cheers