From 9ecb30f9594f222d8318fb1e803a4c363b0c39e5 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Sat, 20 May 2023 17:57:39 +0300 Subject: [PATCH] OpenCL: Fixes for older devices. (#1435) * Remove `constant` * Rewrite platform and device selection * Fix Q8_0 --- ggml-opencl.c | 375 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 244 insertions(+), 131 deletions(-) diff --git a/ggml-opencl.c b/ggml-opencl.c index 31ab13b..e26631f 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -10,87 +10,77 @@ #include "ggml.h" #define MULTILINE_QUOTE(...) #__VA_ARGS__ -const char * clblast_dequant = MULTILINE_QUOTE( +static const char * program_source = MULTILINE_QUOTE( +typedef char int8_t; typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; -constant uint QK4_0 = 32; -struct block_q4_0 +struct __attribute__ ((packed)) block_q4_0 { - float d; - uint8_t qs[QK4_0 / 2]; + half d; + uint8_t qs[16]; /* QK4_0 / 2 */ }; -constant uint QK4_1 = 32; -struct block_q4_1 +struct __attribute__ ((packed)) block_q4_1 { - float d; - float m; - uint8_t qs[QK4_1 / 2]; + half d; + half m; + uint8_t qs[16]; /* QK4_1 / 2 */ }; -constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; - uint8_t qs[QK5_0 / 2]; + uint8_t qs[16]; /* QK5_0 / 2 */ }; -constant uint QK5_1 = 32; -struct block_q5_1 +struct __attribute__ ((packed)) block_q5_1 { half d; half m; uint32_t qh; - uint8_t qs[QK5_1 / 2]; + uint8_t qs[16]; /* QK5_1 / 2 */ }; -constant uint QK8_0 = 32; -struct block_q8_0 +struct __attribute__ ((packed)) block_q8_0 { - float d; - uint8_t qs[QK8_0]; + half d; + int8_t qs[32]; /* QK8_0 */ }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { - constant uint qk = QK4_0; - - const uint i = get_global_id(0) / qk; + const uint i = get_global_id(0) / 32; /* QK4_0 */ const uint j = get_local_id(0); - const float d = x[i].d; + const float d = vload_half(0, (__global half*) &x[i].d); const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*32 + j + 0 ] = x0*d; + y[i*32 + j + 16] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { - constant uint qk = QK4_1; - - const uint i = get_global_id(0) / qk; + const uint i = get_global_id(0) / 32; /* QK4_1 */ const uint j = get_local_id(0); - const float d = x[i].d; - const float m = x[i].m; + const float d = vload_half(0, (__global half*) &x[i].d); + const float m = vload_half(0, (__global half*) &x[i].m); const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*32 + j + 0 ] = x0*d + m; + y[i*32 + j + 16] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { - constant uint qk = QK5_0; - - const uint i = get_global_id(0) / qk; + const uint i = get_global_id(0) / 32; /* QK5_0 */ const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); @@ -103,14 +93,12 @@ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*32 + j + 0 ] = x0*d; + y[i*32 + j + 16] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { - constant uint qk = QK5_1; - - const uint i = get_global_id(0) / qk; + const uint i = get_global_id(0) / 32; /* QK5_1 */ const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); @@ -124,28 +112,38 @@ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*32 + j + 0 ] = x0*d + m; + y[i*32 + j + 16] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { - constant uint qk = QK8_0; - const uint i = get_global_id(0) / qk; + const uint i = get_global_id(0) / 32; /* QK8_0 */ const uint j = get_local_id(0); - const float d = x[i].d; - y[i*qk + j] = x[i].qs[j]*d; + const float d = vload_half(0, (__global half*) &x[i].d); + y[i*32 + j] = x[i].qs[j]*d; } ); -#define CL_CHECK(err, name) \ - do { \ - cl_int err_ = (err); \ - if (err_ != CL_SUCCESS) { \ - fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ - exit(1); \ - } \ +#define CL_CHECK(err) \ + do { \ + cl_int err_ = (err); \ + if (err_ != CL_SUCCESS) { \ + fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +#define CLBLAST_CHECK(err) \ + do { \ + CLBlastStatusCode err_ = (err); \ + if (err_ != CLBlastSuccess) { \ + fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ } while (0) static cl_platform_id platform; @@ -188,48 +186,174 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co void ggml_cl_init(void) { cl_int err = 0; - char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); - char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); - int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); - int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); - printf("\nInitializing CLBlast (First Run)..."); - printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); - cl_uint num_platforms; - clGetPlatformIDs(0, NULL, &num_platforms); - cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); - clGetPlatformIDs(num_platforms, platforms, NULL); - platform = platforms[plat_num]; - char platform_buffer[1024]; - clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); - cl_uint num_devices; - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); - cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); - device = devices[dev_num]; - char device_buffer[1024]; - clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); - printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer); - context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); - CL_CHECK(err, "clCreateContext"); - queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); - CL_CHECK(err, "clCreateCommandQueue"); - free(platforms); - free(devices); + struct cl_device; + struct cl_platform { + cl_platform_id id; + unsigned number; + char name[128]; + char vendor[128]; + struct cl_device * devices; + unsigned n_devices; + struct cl_device * default_device; + }; - program = build_program_from_source(context, device, clblast_dequant); + struct cl_device { + struct cl_platform * platform; + cl_device_id id; + unsigned number; + cl_device_type type; + char name[128]; + }; + + enum { NPLAT = 16, NDEV = 16 }; + + struct cl_platform platforms[NPLAT]; + unsigned n_platforms = 0; + struct cl_device devices[NDEV]; + unsigned n_devices = 0; + struct cl_device * default_device = NULL; + + platform = NULL; + device = NULL; + + cl_platform_id platform_ids[NPLAT]; + CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms)); + + for (unsigned i = 0; i < n_platforms; i++) { + struct cl_platform * p = &platforms[i]; + p->number = i; + p->id = platform_ids[i]; + CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); + CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); + + cl_device_id device_ids[NDEV]; + cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); + if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { + p->n_devices = 0; + } else { + CL_CHECK(clGetDeviceIDsError); + } + p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL; + p->default_device = NULL; + + for (unsigned j = 0; j < p->n_devices; j++) { + struct cl_device * d = &devices[n_devices]; + d->number = n_devices++; + d->id = device_ids[j]; + d->platform = p; + CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); + CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); + + if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { + p->default_device = d; + } + } + + if (default_device == NULL && p->default_device != NULL) { + default_device = p->default_device; + } + } + + if (n_devices == 0) { + fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n"); + exit(1); + } + + char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); + char * user_device_string = getenv("GGML_OPENCL_DEVICE"); + int user_platform_number = -1; + int user_device_number = -1; + + unsigned n; + if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { + user_platform_number = (int)n; + } + if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { + user_device_number = (int)n; + } + + struct cl_device * selected_devices = devices; + unsigned n_selected_devices = n_devices; + + if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { + for (unsigned i = 0; i < n_platforms; i++) { + struct cl_platform * p = &platforms[i]; + if (strstr(p->name, user_platform_string) != NULL || + strstr(p->vendor, user_platform_string) != NULL) { + user_platform_number = (int)i; + break; + } + } + if (user_platform_number == -1) { + fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string); + exit(1); + } + } + if (user_platform_number != -1) { + struct cl_platform * p = &platforms[user_platform_number]; + selected_devices = p->devices; + n_selected_devices = p->n_devices; + default_device = p->default_device; + if (n_selected_devices == 0) { + fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); + exit(1); + } + } + + if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { + for (unsigned i = 0; i < n_selected_devices; i++) { + struct cl_device * d = &selected_devices[i]; + if (strstr(d->name, user_device_string) != NULL) { + user_device_number = d->number; + break; + } + } + if (user_device_number == -1) { + fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string); + exit(1); + } + } + if (user_device_number != -1) { + selected_devices = &devices[user_device_number]; + n_selected_devices = 1; + default_device = &selected_devices[0]; + } + + GGML_ASSERT(n_selected_devices > 0); + + if (default_device == NULL) { + default_device = &selected_devices[0]; + } + + fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); + fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name); + if (default_device->type != CL_DEVICE_TYPE_GPU) { + fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); + } + + platform = default_device->platform->id; + device = default_device->id; + + cl_context_properties properties[] = { + (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 + }; + + CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); + + CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), + (err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err : + (queue = clCreateCommandQueue(context, device, 0, &err), err) + ))); + + program = build_program_from_source(context, device, program_source); // Prepare dequantize kernels - kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); - CL_CHECK(err, "clCreateKernel"); - kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err); - CL_CHECK(err, "clCreateKernel"); + CL_CHECK((kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err), err)); + CL_CHECK((kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err), err)); + CL_CHECK((kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); + CL_CHECK((kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); + CL_CHECK((kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); } static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { @@ -242,9 +366,8 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags clReleaseMemObject(*buf); } cl_int err; - *buf = clCreateBuffer(context, flags, req_size, NULL, &err); + CL_CHECK((*buf = clCreateBuffer(context, flags, req_size, NULL, &err), err)); *cur_size = req_size; - CL_CHECK(err, "clCreateBuffer"); } void ggml_cl_sgemm_wrapper( @@ -253,7 +376,6 @@ void ggml_cl_sgemm_wrapper( const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype) { - cl_int err = 0; cl_kernel kernel; size_t global = n * k, local, size_qb; @@ -267,13 +389,13 @@ void ggml_cl_sgemm_wrapper( dequant = true; kernel = kernel_q4_0; local = 16; - size_qb = global * (sizeof(float) + local) / 32; + size_qb = global * (sizeof(ggml_fp16_t) + local) / 32; break; case GGML_TYPE_Q4_1: dequant = true; kernel = kernel_q4_1; local = 16; - size_qb = global * (sizeof(float) * 2 + local) / 32; + size_qb = global * (sizeof(ggml_fp16_t) * 2 + local) / 32; break; case GGML_TYPE_Q5_0: dequant = true; @@ -291,7 +413,7 @@ void ggml_cl_sgemm_wrapper( dequant = true; kernel = kernel_q8_0; local = 32; - size_qb = global * (sizeof(float) + local) / 32; + size_qb = global * (sizeof(ggml_fp16_t) + local) / 32; break; default: fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); @@ -313,49 +435,40 @@ void ggml_cl_sgemm_wrapper( cl_event ev_a, ev_qb, ev_b; if (dequant) { - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); - err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); - CL_CHECK(err, "clSetKernelArg"); - err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); - CL_CHECK(err, "clEnqueueWriteBuffer qb"); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b)); + CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb)); } else { - err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); - CL_CHECK(err, "clEnqueueWriteBuffer b"); + CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b)); } - err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); - CL_CHECK(err, "clEnqueueWriteBuffer a"); + CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a)); if (dequant) { - err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); - CL_CHECK(err, "clEnqueueNDRangeKernel"); - clReleaseEvent(ev_qb); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b)); + CL_CHECK(clReleaseEvent(ev_qb)); } - clWaitForEvents(1, &ev_a); - clWaitForEvents(1, &ev_b); - clReleaseEvent(ev_a); - clReleaseEvent(ev_b); + CL_CHECK(clWaitForEvents(1, &ev_a)); + CL_CHECK(clWaitForEvents(1, &ev_b)); + CL_CHECK(clReleaseEvent(ev_a)); + CL_CHECK(clReleaseEvent(ev_b)); cl_event ev_sgemm; - CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order, - (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, - m, n, k, - alpha, - cl_buffer_a, 0, lda, - cl_buffer_b, 0, ldb, - beta, - cl_buffer_c, 0, ldc, - &queue, &ev_sgemm); - - if (status != CLBlastSuccess) { - fprintf(stderr, "Error: CLBlast SGEMM %d\n", status); - abort(); - } + CLBLAST_CHECK(CLBlastSgemm( + (CLBlastLayout)order, + (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, + m, n, k, + alpha, + cl_buffer_a, 0, lda, + cl_buffer_b, 0, ldb, + beta, + cl_buffer_c, 0, ldc, + &queue, &ev_sgemm)); cl_event ev_c; - clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); + CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c)); // Wait for completion - clWaitForEvents(1, &ev_c); - clReleaseEvent(ev_sgemm); - clReleaseEvent(ev_c); + CL_CHECK(clWaitForEvents(1, &ev_c)); + CL_CHECK(clReleaseEvent(ev_sgemm)); + CL_CHECK(clReleaseEvent(ev_c)); }