You are not limited to integer math; here is a code listing for +, -, * and / operations on floating-point (SINGLE) values:
'
' Operation on two vectors calculated on GPU
' [!] GeForce 8 and newer, or Radeon HD 4xxx and newer with OpenCL enabled
' [!] drivers REQUIRED
'
' Code could be further optimized by separating the JIT compiling part
'
' Petr Schreiber, José Roca, 2010
' USE AT OWN RISK
'
' Code heavily based on fantastic C code by Yuan Wan:
' https://www.wiki.ed.ac.uk/display/ecdfwiki/OpenCL+quick+start
'
#INCLUDE "cl.inc"
' -- Boolean equates used by the compile-time switch below
%TRUE = -1
%FALSE= 0
' -- Compile-time switch: when non-zero, the #IF %USE_ERROR_CHECKING blocks
'    in OpenCL_CombineArraysSINGLE (argument validation and OpenCL error
'    reporting via MSGBOX) are compiled in
%USE_ERROR_CHECKING = %TRUE
' -- Operation equates: the codes accepted as the "operation" parameter of
'    OpenCL_CombineArraysSINGLE and FORMAT_CombineArraysRESULT
%OPERATION_SUM = 1
%OPERATION_SUBTRACT = 2
%OPERATION_MULTIPLY = 3
%OPERATION_DIVIDE = 4
' -- Maps an operation code (1..4) to its operator symbol; the symbol is
'    spliced into the OpenCL kernel source and into the formatted result text
MACRO OPERATION_DECODE(operation) = CHOOSE$(operation, "+", "-", "*", "/")
' -- How many elements the demo arrays have; sizes the arrays in PBMAIN
'    (whose ARRAY ASSIGN lists supply 9 values each) and bounds the loops
'    in FORMAT_CombineArraysRESULT
%ELEMENTS = 9
FUNCTION PBMAIN()
    ' Demo entry point: fills two %ELEMENTS-sized SINGLE vectors, then for
    ' every operation code from %OPERATION_SUM to %OPERATION_DIVIDE combines
    ' them through OpenCL_CombineArraysSINGLE and shows the formatted result
    ' in a message box. Always returns 0.
    LOCAL opCode AS LONG
    LOCAL report AS STRING

    DIM leftOperand(%ELEMENTS-1)  AS SINGLE
    DIM rightOperand(%ELEMENTS-1) AS SINGLE
    DIM resultVector(%ELEMENTS-1) AS SINGLE

    ARRAY ASSIGN leftOperand()  = 1, 2, 3, 4, 5, 6, 7, 8, 9
    ARRAY ASSIGN rightOperand() = 9, 8, 7, 6, 5, 4, 3, 2, 1

    ' Walk through the four operation codes in ascending order
    opCode = %OPERATION_SUM
    DO WHILE opCode <= %OPERATION_DIVIDE
        OpenCL_CombineArraysSINGLE(opCode, resultVector(), leftOperand(), rightOperand())

        report = FORMAT_CombineArraysRESULT(opCode, resultVector(), leftOperand(), rightOperand())
        MSGBOX report, %MB_ICONINFORMATION OR %MB_OK, "OpenCL calculation"

        INCR opCode
    LOOP

    FUNCTION = 0
END FUNCTION
FUNCTION OpenCL_CombineArraysSINGLE(operation AS LONG, destinationArrayC() AS SINGLE, arrayA() AS SINGLE, arrayB() AS SINGLE) AS LONG
    ' Computes destinationArrayC(n) = arrayA(n) <op> arrayB(n) for every element
    ' on the first GPU device of the first OpenCL platform.
    '
    ' Parameters:
    '   operation           - one of %OPERATION_SUM .. %OPERATION_DIVIDE; the matching
    '                         operator symbol is spliced into the kernel source
    '   destinationArrayC() - receives the results
    '   arrayA(), arrayB()  - input vectors
    '   All three arrays must be zero-based and share the same upper bound.
    '
    ' Returns:
    '   0 on success. When %USE_ERROR_CHECKING is enabled, a failure is reported
    '   with MSGBOX and a non-zero value is returned: the OpenCL error code when
    '   one is available, -30 (CL_INVALID_VALUE) for rejected arguments or an
    '   empty device list, -6 (CL_OUT_OF_HOST_MEMORY) when Memory_Alloc fails.
    '   When %USE_ERROR_CHECKING is disabled nothing is checked and 0 is returned.
    '
    ' Every OpenCL object created here is released before returning, also on
    ' the failure paths (all of them leave through the CleanUp label).

    LOCAL status          AS LONG        ' value handed back to the caller
    LOCAL errorCL         AS LONG        ' result of the most recent OpenCL call
    LOCAL OpenCLSource    AS STRING
    LOCAL kernelName      AS STRING
    LOCAL lSizeOfArrays   AS LONG        ' element count of each vector
    LOCAL platforms       AS DWORD       ' cl_platform_id
    LOCAL platformsFound  AS DWORD
    LOCAL numDevices      AS DWORD
    LOCAL device_ID       AS DWORD       ' cl_device_id
    LOCAL GPUContext      AS DWORD       ' cl_context
    LOCAL ParmDataBytes   AS DWORD
    LOCAL GPUDevices      AS DWORD PTR   ' cl_device_id PTR
    LOCAL GPUCommandQueue AS DWORD       ' cl_command_queue
    LOCAL GPUVector1      AS DWORD       ' cl_mem
    LOCAL GPUVector2      AS DWORD       ' cl_mem
    LOCAL GPUOutputVector AS DWORD       ' cl_mem
    LOCAL OpenCLProgram   AS DWORD       ' cl_program
    LOCAL OpenCLVectorAdd AS DWORD       ' cl_kernel
    LOCAL WorkSize        AS DWORD       ' one dimensional range

    ' -- Argument validation; nothing has been acquired yet, so a plain exit is safe
#IF %USE_ERROR_CHECKING
    IF operation < %OPERATION_SUM OR operation > %OPERATION_DIVIDE THEN
        MSGBOX "Invalid operation"
        FUNCTION = -30
        EXIT FUNCTION
    END IF
    IF UBOUND(arrayA) <> UBOUND(arrayB) OR UBOUND(arrayA) <> UBOUND(destinationArrayC) THEN
        MSGBOX "The upper bounds of the arrays must match"
        FUNCTION = -30
        EXIT FUNCTION
    END IF
    IF LBOUND(arrayA) <> 0 OR LBOUND(arrayB) <> 0 OR LBOUND(destinationArrayC) <> 0 THEN
        MSGBOX "The lBound must be zero"
        FUNCTION = -30
        EXIT FUNCTION
    END IF
#ENDIF

    lSizeOfArrays = UBOUND(destinationArrayC) + 1

    ' -- Kernel source; the operator symbol is the only part that varies
    OpenCLSource = "__kernel void VectorAdd(__global float* c, __global float* a,__global float* b)"+$CRLF+ _
    "{ "+$CRLF+ _
    " // Index of the elements to add "+$CRLF+ _
    " unsigned int n = get_global_id(0); "+$CRLF+ _
    " "+$CRLF+ _
    " // Sum the n'th element of vectors a and b and store in c "+$CRLF+ _
    " c[n] = a[n] "+OPERATION_DECODE(operation)+" b[n]; "+$CRLF+ _
    "} "

    ' -- Pick the first platform and its first GPU device
    errorCL = clGetPlatformIDs(1, platforms, platformsFound)
#IF %USE_ERROR_CHECKING
    IF errorCL <> %CL_SUCCESS THEN MSGBOX "Failed to enumerate platforms! " + openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    errorCL = clGetDeviceIDs(platforms, %CL_DEVICE_TYPE_GPU, 1, device_id, numDevices)
#IF %USE_ERROR_CHECKING
    IF errorCL <> %CL_SUCCESS THEN MSGBOX "Failed to create a device group! " + openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Create a context to run OpenCL on the selected GPU device
    GPUContext = clCreateContext(0, 1, device_id, BYVAL 0, BYVAL 0, errorCL)
#IF %USE_ERROR_CHECKING
    IF errorCL <> %CL_SUCCESS THEN MSGBOX "Failed to create a context! " + openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Get the list of GPU devices associated with this context:
    '    first ask for the size of the list, then fetch the list itself
    errorCL = clGetContextInfo(GPUContext, %CL_CONTEXT_DEVICES, 0, BYVAL 0, ParmDataBytes)
#IF %USE_ERROR_CHECKING
    IF errorCL <> %CL_SUCCESS OR ParmDataBytes = 0 THEN
        MSGBOX "clGetContextInfo: Incorrect size of devices"
        status = errorCL
        IF status = 0 THEN status = -30
        GOTO CleanUp
    END IF
#ENDIF

    GPUDevices = Memory_Alloc(ParmDataBytes)
#IF %USE_ERROR_CHECKING
    ' The allocation must be verified BEFORE the pointer is handed to OpenCL
    IF GPUDevices = 0 THEN MSGBOX "clGetContextInfo: Zero GPUDevices" : status = -6 : GOTO CleanUp
#ENDIF

    errorCL = clGetContextInfo(GPUContext, %CL_CONTEXT_DEVICES, ParmDataBytes, BYVAL GPUDevices, BYVAL 0)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clGetContextInfo: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Create a command-queue on the first GPU device of the list
    GPUCommandQueue = clCreateCommandQueue(GPUContext, @GPUDevices, 0, errorCL)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clCreateCommandQueue: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Allocate GPU memory for the source vectors and initialize it from CPU memory
    '    (4 = size of one SINGLE in bytes)
    GPUVector1 = clCreateBuffer(GPUContext, %CL_MEM_READ_ONLY OR %CL_MEM_COPY_HOST_PTR, 4 * lSizeOfArrays, arrayA(0), errorCL)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "GPUVector1: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    GPUVector2 = clCreateBuffer(GPUContext, %CL_MEM_READ_ONLY OR %CL_MEM_COPY_HOST_PTR, 4 * lSizeOfArrays, arrayB(0), errorCL)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "GPUVector2: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Allocate output memory on the GPU
    GPUOutputVector = clCreateBuffer(GPUContext, %CL_MEM_WRITE_ONLY, 4 * lSizeOfArrays, BYVAL 0, errorCL)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "GPUOutputVector: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Create the OpenCL program from the source code
    OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, OpenCLSource, 0, errorCL)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clCreateProgramWithSource: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Build the program (OpenCL JIT compilation)
    errorCL = clBuildProgram(OpenCLProgram, 0, BYVAL 0, BYVAL 0, 0, BYVAL 0)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clBuildProgram: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Create a handle to the compiled OpenCL function (kernel)
    '    (original author's note on the BYCOPY argument, translated: "only 1x char")
    kernelName = "VectorAdd"
    OpenCLVectorAdd = clCreateKernel(OpenCLProgram, BYCOPY kernelName, errorCL)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clCreateKernel: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Associate the GPU buffers with the kernel arguments (c, a, b);
    '    4 = size of one buffer handle in bytes, as declared AS DWORD above
    errorCL = clSetKernelArg(OpenCLVectorAdd, 0, 4, GPUOutputVector)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clSetKernelArg(0): "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF
    errorCL = clSetKernelArg(OpenCLVectorAdd, 1, 4, GPUVector1)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clSetKernelArg(1): "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF
    errorCL = clSetKernelArg(OpenCLVectorAdd, 2, 4, GPUVector2)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clSetKernelArg(2): "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Launch the kernel on the GPU, one work-item per array element
    WorkSize = lSizeOfArrays
    errorCL = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, BYVAL 0, WorkSize, BYVAL 0, 0, BYVAL 0, BYVAL 0)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clEnqueueNDRangeKernel: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

    ' -- Copy the output in GPU memory back to CPU memory (blocking read: %CL_TRUE)
    errorCL = clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, %CL_TRUE, 0, 4 * lSizeOfArrays, destinationArrayC(0), 0, BYVAL 0, BYVAL 0)
#IF %USE_ERROR_CHECKING
    IF errorCL THEN MSGBOX "clEnqueueReadBuffer: "+openCL_DecodeError(errorCL) : status = errorCL : GOTO CleanUp
#ENDIF

CleanUp:
    ' -- Release only what was actually created; locals start out as zero, so a
    '    handle that is still zero was never acquired. The context goes last.
    IF OpenCLVectorAdd <> 0 THEN
        clReleaseKernel(OpenCLVectorAdd)
    END IF
    IF OpenCLProgram <> 0 THEN
        clReleaseProgram(OpenCLProgram)
    END IF
    IF GPUVector1 <> 0 THEN
        clReleaseMemObject(GPUVector1)
    END IF
    IF GPUVector2 <> 0 THEN
        clReleaseMemObject(GPUVector2)
    END IF
    IF GPUOutputVector <> 0 THEN
        clReleaseMemObject(GPUOutputVector)
    END IF
    IF GPUCommandQueue <> 0 THEN
        clReleaseCommandQueue(GPUCommandQueue)
    END IF
    IF GPUContext <> 0 THEN
        clReleaseContext(GPUContext)
    END IF
    IF GPUDevices <> 0 THEN
        Memory_Free(GPUDevices)
    END IF

    FUNCTION = status
END FUNCTION
FUNCTION OpenCL_DecodeError(eCode AS LONG) AS STRING
    ' Translates an OpenCL status code into the name of the matching constant.
    '
    ' Parameters:
    '   eCode - status code returned by an OpenCL API call
    '
    ' Returns:
    '   "No problem..." for 0, the constant's name for a known error code,
    '   or "[!] Unknown error:" followed by STR$(eCode) for anything else.
    '
    ' Besides the CL_INVALID_* family (-30..-70) this also covers the runtime
    ' errors (-1..-19), e.g. CL_DEVICE_NOT_FOUND from clGetDeviceIDs and
    ' CL_BUILD_PROGRAM_FAILURE from clBuildProgram, plus the ICD loader's
    ' CL_PLATFORM_NOT_FOUND_KHR (-1001).
    SELECT CASE eCode
        CASE 0
            FUNCTION = "No problem..."

        ' -- Runtime / compile-time errors
        CASE -1
            FUNCTION = "CL_DEVICE_NOT_FOUND"
        CASE -2
            FUNCTION = "CL_DEVICE_NOT_AVAILABLE"
        CASE -3
            FUNCTION = "CL_COMPILER_NOT_AVAILABLE"
        CASE -4
            FUNCTION = "CL_MEM_OBJECT_ALLOCATION_FAILURE"
        CASE -5
            FUNCTION = "CL_OUT_OF_RESOURCES"
        CASE -6
            FUNCTION = "CL_OUT_OF_HOST_MEMORY"
        CASE -7
            FUNCTION = "CL_PROFILING_INFO_NOT_AVAILABLE"
        CASE -8
            FUNCTION = "CL_MEM_COPY_OVERLAP"
        CASE -9
            FUNCTION = "CL_IMAGE_FORMAT_MISMATCH"
        CASE -10
            FUNCTION = "CL_IMAGE_FORMAT_NOT_SUPPORTED"
        CASE -11
            FUNCTION = "CL_BUILD_PROGRAM_FAILURE"
        CASE -12
            FUNCTION = "CL_MAP_FAILURE"
        CASE -13
            FUNCTION = "CL_MISALIGNED_SUB_BUFFER_OFFSET"
        CASE -14
            FUNCTION = "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"
        CASE -15
            FUNCTION = "CL_COMPILE_PROGRAM_FAILURE"
        CASE -16
            FUNCTION = "CL_LINKER_NOT_AVAILABLE"
        CASE -17
            FUNCTION = "CL_LINK_PROGRAM_FAILURE"
        CASE -18
            FUNCTION = "CL_DEVICE_PARTITION_FAILED"
        CASE -19
            FUNCTION = "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"

        ' -- Invalid-argument errors
        CASE -30
            FUNCTION = "CL_INVALID_VALUE"
        CASE -31
            FUNCTION = "CL_INVALID_DEVICE_TYPE"
        CASE -32
            FUNCTION = "CL_INVALID_PLATFORM"
        CASE -33
            FUNCTION = "CL_INVALID_DEVICE"
        CASE -34
            FUNCTION = "CL_INVALID_CONTEXT"
        CASE -35
            FUNCTION = "CL_INVALID_QUEUE_PROPERTIES"
        CASE -36
            FUNCTION = "CL_INVALID_COMMAND_QUEUE"
        CASE -37
            FUNCTION = "CL_INVALID_HOST_PTR"
        CASE -38
            FUNCTION = "CL_INVALID_MEM_OBJECT"
        CASE -39
            FUNCTION = "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"
        CASE -40
            FUNCTION = "CL_INVALID_IMAGE_SIZE"
        CASE -41
            FUNCTION = "CL_INVALID_SAMPLER"
        CASE -42
            FUNCTION = "CL_INVALID_BINARY"
        CASE -43
            FUNCTION = "CL_INVALID_BUILD_OPTIONS"
        CASE -44
            FUNCTION = "CL_INVALID_PROGRAM"
        CASE -45
            FUNCTION = "CL_INVALID_PROGRAM_EXECUTABLE"
        CASE -46
            FUNCTION = "CL_INVALID_KERNEL_NAME"
        CASE -47
            FUNCTION = "CL_INVALID_KERNEL_DEFINITION"
        CASE -48
            FUNCTION = "CL_INVALID_KERNEL"
        CASE -49
            FUNCTION = "CL_INVALID_ARG_INDEX"
        CASE -50
            FUNCTION = "CL_INVALID_ARG_VALUE"
        CASE -51
            FUNCTION = "CL_INVALID_ARG_SIZE"
        CASE -52
            FUNCTION = "CL_INVALID_KERNEL_ARGS"
        CASE -53
            FUNCTION = "CL_INVALID_WORK_DIMENSION"
        CASE -54
            FUNCTION = "CL_INVALID_WORK_GROUP_SIZE"
        CASE -55
            FUNCTION = "CL_INVALID_WORK_ITEM_SIZE"
        CASE -56
            FUNCTION = "CL_INVALID_GLOBAL_OFFSET"
        CASE -57
            FUNCTION = "CL_INVALID_EVENT_WAIT_LIST"
        CASE -58
            FUNCTION = "CL_INVALID_EVENT"
        CASE -59
            FUNCTION = "CL_INVALID_OPERATION"
        CASE -60
            FUNCTION = "CL_INVALID_GL_OBJECT"
        CASE -61
            FUNCTION = "CL_INVALID_BUFFER_SIZE"
        CASE -62
            FUNCTION = "CL_INVALID_MIP_LEVEL"
        CASE -63
            FUNCTION = "CL_INVALID_GLOBAL_WORK_SIZE"
        CASE -64
            FUNCTION = "CL_INVALID_PROPERTY"
        CASE -65
            FUNCTION = "CL_INVALID_IMAGE_DESCRIPTOR"
        CASE -66
            FUNCTION = "CL_INVALID_COMPILER_OPTIONS"
        CASE -67
            FUNCTION = "CL_INVALID_LINKER_OPTIONS"
        CASE -68
            FUNCTION = "CL_INVALID_DEVICE_PARTITION_COUNT"
        CASE -69
            FUNCTION = "CL_INVALID_PIPE_SIZE"
        CASE -70
            FUNCTION = "CL_INVALID_DEVICE_QUEUE"

        ' -- Extension: no platform reported by the ICD loader
        CASE -1001
            FUNCTION = "CL_PLATFORM_NOT_FOUND_KHR"

        CASE ELSE
            FUNCTION = "[!] Unknown error:"+STR$(eCode)
    END SELECT
END FUNCTION
FUNCTION FORMAT_JoinVectorSINGLE(vector() AS SINGLE) AS STRING
    ' Private helper: formats every element of vector() with FORMAT$ and joins
    ' the pieces with $TAB (no trailing separator).
    LOCAL index  AS LONG
    LOCAL joined AS STRING

    FOR index = LBOUND(vector) TO UBOUND(vector)
        IF index > LBOUND(vector) THEN joined += $TAB
        joined += FORMAT$(vector(index))
    NEXT

    FUNCTION = joined
END FUNCTION

FUNCTION FORMAT_CombineArraysRESULT(operation AS LONG, OutputVector() AS SINGLE, InputVectorA() AS SINGLE, InputVectorB() AS SINGLE) AS STRING
    ' Builds the human-readable report of one vector operation:
    '
    '   <elements of InputVectorA, tab separated>
    '   <operator symbol for "operation">
    '   <elements of InputVectorB, tab separated>
    '   =
    '   <elements of OutputVector, tab separated>
    '
    ' Parameters:
    '   operation      - operation code, translated by OPERATION_DECODE
    '   OutputVector() - result vector
    '   InputVectorA(), InputVectorB() - operand vectors
    '
    ' Returns the report as a $CRLF-separated string.
    '
    ' Each array is walked over its own LBOUND..UBOUND range instead of the
    ' global %ELEMENTS equate, so arrays of any size can be formatted.
    FUNCTION = FORMAT_JoinVectorSINGLE(InputVectorA()) + $CRLF + OPERATION_DECODE(operation) + $CRLF + _
               FORMAT_JoinVectorSINGLE(InputVectorB()) + $CRLF + "=" + $CRLF + _
               FORMAT_JoinVectorSINGLE(OutputVector())
END FUNCTION
FUNCTION Memory_Alloc(numberOfBytes AS LONG) AS DWORD
    ' Allocates numberOfBytes via GLOBALMEM ALLOC and returns the value that
    ' statement produced.
    '
    ' NOTE(review): callers in this file use the returned value directly as a
    ' pointer. GLOBALMEM ALLOC may yield a block handle that is meant to go
    ' through GLOBALMEM LOCK before use - confirm against the PowerBASIC docs.
    LOCAL allocated AS DWORD

    GLOBALMEM ALLOC numberOfBytes TO allocated

    FUNCTION = allocated
END FUNCTION
FUNCTION Memory_Free(dwordPointer AS DWORD) AS DWORD
    ' Releases a block obtained from Memory_Alloc via GLOBALMEM FREE.
    '
    ' The result of GLOBALMEM FREE is written back into the caller's variable
    ' (dwordPointer is passed by reference) and is also returned.
    LOCAL freeResult AS DWORD

    GLOBALMEM FREE dwordPointer TO freeResult

    dwordPointer = freeResult
    FUNCTION = freeResult
END FUNCTION
It is also possible to do calculations with DOUBLEs, but that requires really cutting-edge hardware, whereas this example runs on any OpenCL-enabled device.