diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0973a3f..74d0350 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -36,4 +36,5 @@ else() add_subdirectory(embedding) add_subdirectory(save-load-state) add_subdirectory(benchmark) + add_subdirectory(baby-llama) endif() diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt new file mode 100644 index 0000000..d2ce363 --- /dev/null +++ b/examples/baby-llama/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET baby-llama) +add_executable(${TARGET} baby-llama.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp new file mode 100644 index 0000000..5573c15 --- /dev/null +++ b/examples/baby-llama/baby-llama.cpp @@ -0,0 +1,1687 @@ +#include "ggml.h" +#include +#include +#include +#include + +float frand() { + return (float)rand()/(float)RAND_MAX; +} + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution nd; + float min; + float max; +}; + +void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->nd = std::normal_distribution{mean, std}; + rnd->min = min; + rnd->max = max; +} + +float frand_normal(struct random_normal_distribution * rnd) { + const float r = rnd->nd(rnd->gen); + return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); +} + +struct ggml_tensor * randomize_tensor( + struct ggml_tensor * tensor, + int ndims, + const int64_t ne[], + float fmin, + float fmax) { + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return tensor; +} + +struct ggml_tensor * randomize_tensor_normal( + struct ggml_tensor * tensor, + int ndims, + const int64_t ne[], + struct random_normal_distribution * rnd) { + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i0] = frand_normal(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + } + } + } + } + break; + default: + assert(false); + }; + + return tensor; +} + +struct llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const llama_hparams & other) const { + return memcmp(this, &other, sizeof(llama_hparams)); + } +}; + +uint32_t get_n_ff(const struct llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +struct llama_hparams_lora { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + uint32_t n_lora = 64; + + bool operator!=(const llama_hparams & other) const { + return memcmp(this, &other, sizeof(llama_hparams)); + } +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct llama_layer_lora { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wqa; + struct ggml_tensor * wqb; + struct ggml_tensor * wka; + struct ggml_tensor * wkb; + struct ggml_tensor * wva; + struct ggml_tensor * wvb; + struct ggml_tensor * woa; + struct ggml_tensor * wob; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + + +struct llama_kv_cache { + struct ggml_context * ctx = NULL; + + struct ggml_tensor * k; + struct ggml_tensor * v; + + // llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache +}; + +struct llama_model { + struct ggml_context * ctx = NULL; + + llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; +}; + +struct llama_model_lora { + struct ggml_context * ctx = NULL; + + llama_hparams_lora hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * outputa; + struct ggml_tensor * outputb; + + std::vector layers; +}; + +void init_model(struct llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab}); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + // std::string layers_i = "layers." + std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + } +} + + +void init_model_lora(struct llama_model_lora * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_mult = hparams.n_mult; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + const uint32_t n_lora = hparams.n_lora; + + const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + + struct ggml_context * ctx = model->ctx; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); + model->outputa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab}); + model->outputb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // ("output.weight", {n_embd, n_vocab}); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + // std::string layers_i = "layers." + std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + + layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + } +} + +void set_param_model(struct llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +void set_param_model_lora(struct llama_model_lora * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->outputa); + ggml_set_param(ctx, model->outputb); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wqa); + ggml_set_param(ctx, layer.wqb); + ggml_set_param(ctx, layer.wka); + ggml_set_param(ctx, layer.wkb); + ggml_set_param(ctx, layer.wva); + ggml_set_param(ctx, layer.wvb); + ggml_set_param(ctx, layer.woa); + ggml_set_param(ctx, layer.wob); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution rnd; + init_random_normal_distribution(&rnd, seed, mean, std, min, max); + randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); + randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); + randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); + + randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd); + randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd); + randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd); + randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd); + + randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd); + + randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd); + randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd); + randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd); + } +} + + +void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution rnd; + init_random_normal_distribution(&rnd, seed, mean, std, min, max); + randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); + randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); + randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd); + randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); + + randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd); + randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd); + randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd); + randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd); + randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd); + randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd); + randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd); + randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd); + + randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd); + + randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd); + randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd); + randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd); + } +} + +bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + +bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + +struct ggml_tensor * forward( + struct llama_model * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past) { + + const int N = n_tokens; + + struct llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // inpL shape [n_embd,N,1,1] + inpL = cur; + } + + // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +struct ggml_tensor * forward_batch( + struct llama_model * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past, + const int n_batch) { + + const int N = n_tokens; + + struct llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [N, n_embd, n_batch, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wv, + cur), + n_embd, N, n_batch), + 1, 0, 2, 3)); + + assert_shape_3d(Vcur, N, n_embd, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] + // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_2d(ctx0, kc, + ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), + ggml_element_size(kc)*n_embd*n_ctx, + (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, + ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), + ggml_element_size(vc)*n_ctx*n_embd, + ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); + + assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); + assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); + } + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, n_past + N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_4d(ctx0, + ggml_view_3d(ctx0, + kc, + n_embd, + (n_past + N), + n_batch, + n_embd*ggml_element_size(kc), + n_ctx*n_embd*ggml_element_size(kc), + il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), + n_embd/n_head, n_head, n_past + N, n_batch), + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); + + // K * Q + // KQ shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); + + // split cached V into n_head heads + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] + struct ggml_tensor * V = + ggml_view_4d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, n_batch, + ggml_element_size(vc)*n_ctx, + ggml_element_size(vc)*n_ctx*n_embd/n_head, + ggml_element_size(vc)*n_ctx*n_embd, + il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); + assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + + +struct ggml_tensor * forward_lora( + struct llama_model_lora * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past) { + + const int N = n_tokens; + + struct llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wqa, + ggml_mul_mat(ctx0, + model->layers[il].wqb, + cur)), + n_embd/n_head, n_head, N), + n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wka, + ggml_mul_mat(ctx0, + model->layers[il].wkb, + cur)), + n_embd/n_head, n_head, N), + n_past, n_rot, 0); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_transpose(ctx0, + ggml_reshape_2d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wva, + ggml_mul_mat(ctx0, + model->layers[il].wvb, + cur)), + n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].woa, + ggml_mul_mat(ctx0, + model->layers[il].wob, + cur)); + } + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // inpL shape [n_embd,N,1,1] + inpL = cur; + } + + // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, + model->outputa, + ggml_mul_mat(ctx0, + model->outputb, + inpL)); + + // ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { + assert(logits->n_dims == 2); + assert(probs->n_dims == 2); + assert(best_samples->n_dims == 1); + assert(logits->ne[1] == best_samples->ne[0]); + assert(logits->ne[0] == probs->ne[0]); + assert(logits->ne[1] == probs->ne[1]); + for (int i = 0; i < logits->ne[1]; ++i) { + float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]); + ggml_set_i32_1d(best_samples, i, 0); + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + if (logit > max_logit) { + max_logit = logit; + ggml_set_i32_1d(best_samples, i, k); + } + } + float psum = 0; + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit); + psum += p; + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p); + } + for (int k = 0; k < logits->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum); + } + } +} + +void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { + GGML_ASSERT(best_samples->n_dims == 2); + GGML_ASSERT(logits->n_dims == 3); + GGML_ASSERT(probs->n_dims == 3); + int n_tokens = best_samples->ne[0]; + int n_batch = best_samples->ne[1]; + int n_vocab = logits->ne[0]; + GGML_ASSERT(n_tokens == logits->ne[1]); + GGML_ASSERT(n_batch == logits->ne[2]); + GGML_ASSERT(n_vocab == probs->ne[0]); + GGML_ASSERT(n_tokens == probs->ne[1]); + GGML_ASSERT(n_batch == probs->ne[2]); + + for (int k = 0; k < n_batch; ++k) { + struct ggml_tensor * best_samples_k = ggml_view_1d(ctx, + best_samples, + best_samples->ne[0], + k*best_samples->nb[1]); + struct ggml_tensor * logits_k = ggml_view_2d(ctx, + logits, + logits->ne[0], + logits->ne[1], + logits->nb[1], + k*logits->nb[2]); + struct ggml_tensor * probs_k = ggml_view_2d(ctx, + probs, + probs->ne[0], + probs->ne[1], + probs->nb[1], + k*probs->nb[2]); + sample_softmax(logits_k, probs_k, best_samples_k); + } +} + +void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); +} + +void print_matrix(struct ggml_tensor * probs) { + assert(probs->n_dims == 2); + for (int i = 0; i < probs->ne[1]; ++i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); + } +} + +void print_token(int token, int n_vocab) { + for (int k = 0; k < token; ++k) { + printf(" "); + } + printf("X"); + for (int k = token+1; k < n_vocab; ++k) { + printf(" "); + } + printf("\n"); +} + +void print_tokens(struct ggml_tensor * tokens, int n_vocab) { + for (int i=0; ine[0]; ++i) { + int token = ggml_get_i32_1d(tokens, i); + print_token(token, n_vocab); + } +} + +void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + float randomness = 0.0f; + // ggml_set_zero(targets); + ggml_set_f32(targets, -1.0f); + ggml_set_i32_1d(tokens_input, 0, 0); + for (int i=1; i 1.0f) ? 1.0f : z; // clamp to [0..1] + int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1)); + ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f); + if (in_dims == 2); + GGML_ASSERT( targets->n_dims == 3); + int n_tokens = tokens_input->ne[0]; + int n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_tokens == targets->ne[1]); + GGML_ASSERT(n_batch == targets->ne[2]); + + for (int k=0; kne[0], + k*tokens_input->nb[1]); + struct ggml_tensor * targets_k = ggml_view_2d(ctx, + targets, + targets->ne[0], + targets->ne[1], + targets->nb[1], + k*targets->nb[2]); + get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k); + } +} + +void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + for (int i=0; i 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } @@ -3105,12 +3107,12 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } -inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { - const uint16_t * i16 = (const uint16_t *) x; - for (int i = 0; i < n; ++i) { - y[i] = table_silu_f16[i16[i]]; - } -} +//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = table_silu_f16[i16[i]]; +// } +//} #ifdef GGML_SILU_FP16 inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { @@ -3129,6 +3131,29 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { } #endif +inline static float ggml_silu_backward_f32(float x, float dy) { + const float s = 1.0f/(1.0f + expf(-x)); + return dy*s*(1.0f + x*(1.0f - s)); +} + +#ifdef GGML_SILU_FP16 +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + // we did not use x[i] to compute forward silu but its f16 equivalent + // take derivative at f16 of x[i]: + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + float usedx = GGML_FP16_TO_FP32(fp16); + dx[i] = ggml_silu_backward_f32(usedx, dy[i]); + } +} +#else +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + dx[i] = ggml_silu_backward_f32(x[i], dy[i]); + } +} +#endif + inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE ggml_float sum = 0.0; @@ -3260,12 +3285,16 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "DUP", "ADD", + "ADD1", + "ACC", "SUB", "MUL", "DIV", "SQR", "SQRT", + "LOG", "SUM", + "SUM_ROWS", "MEAN", "REPEAT", "ABS", @@ -3275,12 +3304,15 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "RELU", "GELU", "SILU", + "SILU_BACK", "NORM", "RMS_NORM", + "RMS_NORM_BACK", "MUL_MAT", "SCALE", + "SET", "CPY", "CONT", "RESHAPE", @@ -3288,9 +3320,13 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "PERMUTE", "TRANSPOSE", "GET_ROWS", + "GET_ROWS_BACK", + "DIAG", "DIAG_MASK_INF", + "DIAG_MASK_ZERO", "SOFT_MAX", "ROPE", + "ROPE_BACK", "ALIBI", "CONV_1D_1S", "CONV_1D_2S", @@ -3302,19 +3338,23 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39"); +static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", "x", "x+y", + "x+y", + "view(x,nb,offset)+=y->x", "x-y", "x*y", "x/y", "x^2", "√x", + "log(x)", "Σx", + "Σx_k", "Σx/n", "repeat(x)", "abs(x)", @@ -3324,12 +3364,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "relu(x)", "gelu(x)", "silu(x)", + "silu_back(x)", "norm(x)", "rms_norm(x)", + "rms_norm_back(x)", "X*Y", "x*v", + "y-\\>view(x)", "x-\\>y", "cont(x)", "reshape(x)", @@ -3337,9 +3380,13 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "permute(x)", "transpose(x)", "get_rows(x)", + "get_rows_back(x)", + "diag(x)", "diag_mask_inf(x)", + "diag_mask_zero(x)", "soft_max(x)", "rope(x)", + "rope_back(x)", "alibi(x)", "conv_1d_1s(x)", "conv_1d_2s(x)", @@ -3351,7 +3398,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39"); +static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -3589,9 +3636,9 @@ static inline int ggml_up32(int n) { return (n + 31) & ~31; } -static inline int ggml_up64(int n) { - return (n + 63) & ~63; -} +//static inline int ggml_up64(int n) { +// return (n + 63) & ~63; +//} static inline int ggml_up(int n, int m) { // assert m is a power of 2 @@ -4301,6 +4348,107 @@ struct ggml_tensor * ggml_add_inplace( return ggml_add_impl(ctx, a, b, true); } +// ggml_add1 + +struct ggml_tensor * ggml_add1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, true); +} + +// ggml_acc + +struct ggml_tensor * ggml_acc_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); + ((int32_t *) c->data)[0] = nb1; + ((int32_t *) c->data)[1] = nb2; + ((int32_t *) c->data)[2] = nb3; + ((int32_t *) c->data)[3] = offset; + ((int32_t *) c->data)[4] = inplace ? 1 : 0; + + result->op = GGML_OP_ACC; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = c; + + return result; +} + +struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + // ggml_sub struct ggml_tensor * ggml_sub_impl( @@ -4494,6 +4642,41 @@ struct ggml_tensor * ggml_sqrt_inplace( return ggml_sqrt_impl(ctx, a, true); } + +// ggml_log + +struct ggml_tensor * ggml_log_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_LOG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, true); +} + // ggml_sum struct ggml_tensor * ggml_sum( @@ -4515,6 +4698,33 @@ struct ggml_tensor * ggml_sum( return result; } + +// ggml_sum_rows + +struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + int64_t ne[4] = {1,1,1,1}; + for (int i=1; in_dims; ++i) { + ne[i] = a->ne[i]; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); + + result->op = GGML_OP_SUM_ROWS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + // ggml_mean struct ggml_tensor * ggml_mean( @@ -4805,6 +5015,29 @@ struct ggml_tensor * ggml_silu_inplace( return ggml_silu_impl(ctx, a, true); } +// ggml_silu_back + +struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SILU_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_norm struct ggml_tensor * ggml_norm_impl( @@ -4847,7 +5080,6 @@ struct ggml_tensor * ggml_rms_norm_impl( bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -4873,6 +5105,28 @@ struct ggml_tensor * ggml_rms_norm_inplace( return ggml_rms_norm_impl(ctx, a, true); } +struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_RMS_NORM_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + + // ggml_mul_mat struct ggml_tensor * ggml_mul_mat( @@ -4912,13 +5166,10 @@ struct ggml_tensor * ggml_scale_impl( bool is_node = false; if (!inplace && (a->grad || b->grad)) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); result->op = GGML_OP_SCALE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4942,6 +5193,100 @@ struct ggml_tensor * ggml_scale_inplace( return ggml_scale_impl(ctx, a, b, true); } +// ggml_set + +struct ggml_tensor * ggml_set_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + // make a view of the destination + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); + (( int32_t * ) c->data)[0] = nb1; + (( int32_t * ) c->data)[1] = nb2; + (( int32_t * ) c->data)[2] = nb3; + (( int32_t * ) c->data)[3] = offset; + (( int32_t * ) c->data)[4] = inplace ? 1 : 0; + + result->op = GGML_OP_SET; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = c; + + return result; +} + +struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + +struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset) { + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); +} + +struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset) { + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); +} + +struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); +} + +struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); +} + + // ggml_cpy struct ggml_tensor * ggml_cpy_impl( @@ -4954,7 +5299,6 @@ struct ggml_tensor * ggml_cpy_impl( bool is_node = false; if (!inplace && (a->grad || b->grad)) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -4992,7 +5336,6 @@ struct ggml_tensor * ggml_cont_impl( bool is_node = false; if (!inplace && a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -5030,11 +5373,15 @@ struct ggml_tensor * ggml_reshape( bool is_node = false; - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + if (a->grad) { is_node = true; } + if (b->grad) { + // gradient propagation is not supported + //GGML_ASSERT(false); + } + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); result->op = GGML_OP_RESHAPE; @@ -5045,6 +5392,30 @@ struct ggml_tensor * ggml_reshape( return result; } +struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[1] = { ne0 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + struct ggml_tensor * ggml_reshape_2d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -5056,7 +5427,6 @@ struct ggml_tensor * ggml_reshape_2d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -5083,7 +5453,6 @@ struct ggml_tensor * ggml_reshape_3d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -5098,6 +5467,34 @@ struct ggml_tensor * ggml_reshape_3d( return result; } + +struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + // ggml_view_1d struct ggml_tensor * ggml_view_1d( @@ -5105,16 +5502,23 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * a, int64_t ne0, size_t offset) { + + bool is_node = false; + if (a->grad) { - GGML_ASSERT(false); // gradient propagation is not supported + is_node = true; } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); result->op = GGML_OP_VIEW; - result->grad = NULL; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; - result->src1 = NULL; // TODO: maybe store the offset here? + result->src1 = NULL; + + if (is_node) { + memcpy(result->padding, &offset, sizeof(offset)); + } return result; } @@ -5128,8 +5532,11 @@ struct ggml_tensor * ggml_view_2d( int64_t ne1, size_t nb1, size_t offset) { + + bool is_node = false; + if (a->grad) { - GGML_ASSERT(false); // gradient propagation is not supported + is_node = true; } const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; @@ -5141,9 +5548,13 @@ struct ggml_tensor * ggml_view_2d( result->nb[3] = result->nb[2]; result->op = GGML_OP_VIEW; - result->grad = NULL; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; - result->src1 = NULL; // TODO: maybe store the offset here? + result->src1 = NULL; + + if (is_node) { + memcpy(result->padding, &offset, sizeof(offset)); + } return result; } @@ -5159,8 +5570,11 @@ struct ggml_tensor * ggml_view_3d( size_t nb1, size_t nb2, size_t offset) { + + bool is_node = false; + if (a->grad) { - GGML_ASSERT(false); // gradient propagation is not supported + is_node = true; } const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; @@ -5172,9 +5586,53 @@ struct ggml_tensor * ggml_view_3d( result->nb[3] = result->nb[2]*ne2; result->op = GGML_OP_VIEW; - result->grad = NULL; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; - result->src1 = NULL; // TODO: maybe store the offset here? + result->src1 = NULL; + + if (is_node) { + memcpy(result->padding, &offset, sizeof(offset)); + } + + return result; +} + +// ggml_view_4d + +struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = nb3; + + result->op = GGML_OP_VIEW; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + if (is_node) { + memcpy(result->padding, &offset, sizeof(offset)); + } return result; } @@ -5203,7 +5661,6 @@ struct ggml_tensor * ggml_permute( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -5235,7 +5692,14 @@ struct ggml_tensor * ggml_permute( result->op = GGML_OP_PERMUTE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; - result->src1 = NULL; // TODO: maybe store the permutation here? + result->src1 = NULL; + + if (is_node) { + result->padding[0] = axis0; + result->padding[1] = axis1; + result->padding[2] = axis2; + result->padding[3] = axis3; + } return result; } @@ -5248,7 +5712,6 @@ struct ggml_tensor * ggml_transpose( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -5279,7 +5742,6 @@ struct ggml_tensor * ggml_get_rows( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -5295,24 +5757,76 @@ struct ggml_tensor * ggml_get_rows( return result; } -// ggml_diag_mask_inf +// ggml_get_rows_back -struct ggml_tensor * ggml_diag_mask_inf( +struct ggml_tensor * ggml_get_rows_back( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past) { + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); + bool is_node = false; - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + if (a->grad || b->grad) { is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - struct ggml_tensor * b = ggml_new_i32(ctx, n_past); - ggml_set_name(b, "n_past"); + // TODO: implement non F32 return + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + + result->op = GGML_OP_GET_ROWS_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = c; + + return result; +} + +// ggml_diag + +struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(a->ne[1] == 1); + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + + result->op = GGML_OP_DIAG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + + +// ggml_diag_mask_inf + +struct ggml_tensor * ggml_diag_mask_inf_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ((int32_t *) b->data)[0] = n_past; + ((int32_t *) b->data)[1] = inplace ? 1 : 0; result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5322,21 +5836,75 @@ struct ggml_tensor * ggml_diag_mask_inf( return result; } -// ggml_soft_max - -struct ggml_tensor * ggml_soft_max( +struct ggml_tensor * ggml_diag_mask_inf( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, false); +} + + +struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, true); +} + +// ggml_diag_mask_zero + +struct ggml_tensor * ggml_diag_mask_zero_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(b, "n_past, inplace"); + ((int32_t *) b->data)[0] = n_past; + ((int32_t *) b->data)[1] = inplace ? 1 : 0; + + result->op = GGML_OP_DIAG_MASK_ZERO; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, false); +} + +struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, true); +} + +// ggml_soft_max + +struct ggml_tensor * ggml_soft_max_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5346,14 +5914,75 @@ struct ggml_tensor * ggml_soft_max( return result; } +struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, true); +} + // ggml_rope +struct ggml_tensor * ggml_rope_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + bool inplace) { + GGML_ASSERT(n_past >= 0); + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + ((int32_t *) b->data)[0] = n_past; + ((int32_t *) b->data)[1] = n_dims; + ((int32_t *) b->data)[2] = mode; + + result->op = GGML_OP_ROPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, int n_past, int n_dims, int mode) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false); +} + +struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true); +} + +// ggml_rope_back + +struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -5362,9 +5991,7 @@ struct ggml_tensor * ggml_rope( is_node = true; } - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); ((int32_t *) b->data)[0] = n_past; @@ -5372,7 +5999,7 @@ struct ggml_tensor * ggml_rope( ((int32_t *) b->data)[2] = mode; ggml_set_name(b, "n_past, n_dims, mode"); - result->op = GGML_OP_ROPE; + result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; @@ -5626,6 +6253,38 @@ void ggml_set_param( // ggml_compute_forward_dup +static void ggml_compute_forward_dup_same_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + GGML_ASSERT(src0->type == dst->type); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const size_t nb00 = src0->nb[0]; + const size_t nb0 = dst->nb[0]; + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + if (ie0 < ie1) { + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + } + +} static void ggml_compute_forward_dup_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -5660,17 +6319,7 @@ static void ggml_compute_forward_dup_f16( const int nth = params->nth; // number of threads if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - // parallelize by elements - const int ne = ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int ie1 = MIN(ie0 + dr, ne); - - memcpy( - ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); - + ggml_compute_forward_dup_same_cont(params, src0, dst); return; } @@ -5959,17 +6608,7 @@ static void ggml_compute_forward_dup_f32( const int nth = params->nth; // number of threads if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - // parallelize by elements - const int ne = ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int ie1 = MIN(ie0 + dr, ne); - - memcpy( - ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); - + ggml_compute_forward_dup_same_cont(params, src0, dst); return; } @@ -6224,6 +6863,10 @@ static void ggml_compute_forward_dup( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); + return; + } switch (src0->type) { case GGML_TYPE_F16: { @@ -6256,44 +6899,73 @@ static void ggml_compute_forward_add_f32( const int ith = params->ith; const int nth = params->nth; - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; const size_t nb10 = src1->nb[0]; const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; const size_t nb0 = dst->nb[0]; const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + if (nb10 == sizeof(float)) { - for (int j = ith; j < n; j += nth) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + #ifdef GGML_USE_ACCELERATE vDSP_vadd( - (float *) ((char *) src0->data + j*nb01), 1, - (float *) ((char *) src1->data + j*nb11), 1, - (float *) ((char *) dst->data + j*nb1), 1, nc); + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); #else - ggml_vec_add_f32(nc, - (float *) ((char *) dst->data + j*nb1), - (float *) ((char *) src0->data + j*nb01), - (float *) ((char *) src1->data + j*nb11)); + ggml_vec_add_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); #endif + // } + // } } } else { // src1 is not contiguous - for (int j = ith; j < n; j += nth) { - float * dst_ptr = (float *) ((char *) dst->data + j*nb1); - float * src0_ptr = (float *) ((char *) src0->data + j*nb01); - for (int i = 0; i < nc; i++) { - float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - dst_ptr[i] = src0_ptr[i] + *src1_ptr; + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } } } @@ -6313,17 +6985,25 @@ static void ggml_compute_forward_add_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; const size_t nb10 = src1->nb[0]; const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; const size_t nb0 = dst->nb[0]; const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -6332,13 +7012,26 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + if (nb10 == sizeof(float)) { - for (int j = ith; j < n; j += nth) { - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); - for (int i = 0; i < nc; i++) { - float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr); + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); } } } @@ -6362,32 +7055,53 @@ static void ggml_compute_forward_add_f16_f16( const int ith = params->ith; const int nth = params->nth; - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; const size_t nb10 = src1->nb[0]; const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; const size_t nb0 = dst->nb[0]; const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + if (nb10 == sizeof(ggml_fp16_t)) { - for (int j = ith; j < n; j += nth) { - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); - for (int i = 0; i < nc; i++) { - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10); - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr)); + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i])); } } } @@ -6408,50 +7122,36 @@ static void ggml_compute_forward_add_q_f32( return; } + const int nr = ggml_nrows(src0); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; + //const int64_t ne03 = src0->ne[3]; - //const int64_t ne10 = src1->ne[0]; - //const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; const int ith = params->ith; const int nth = params->nth; - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - const enum ggml_type type = src0->type; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted @@ -6463,9 +7163,6 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src1->type == GGML_TYPE_F32); - // total rows in src0 - const int nr = ne01*ne02*ne03; - // rows per thread const int dr = (nr + nth - 1)/nth; @@ -6542,6 +7239,428 @@ static void ggml_compute_forward_add( } } +// ggml_compute_forward_add1 + +static void ggml_compute_forward_add1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_add1_f32); + + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) src1->data), 0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_add1_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + *(float *) src1->data); +#endif + } +} + +static void ggml_compute_forward_add1_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const enum ggml_type type = src0->type; + dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + + // we don't support permuted src0 + GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); + void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 )); + + assert(ne0 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne0); + // add src1 + ggml_vec_acc1_f32(ne0, wdata, v); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne0); + } +} + +static void ggml_compute_forward_add1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add1_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + { + ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + +// ggml_compute_forward_acc + +static void ggml_compute_forward_acc_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + GGML_ASSERT(opt0->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(opt0) == 5); + + // view src0 and dst with these strides and data offset inbytes during acc + // nb0 is implicitely element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) opt0->data)[0]; + size_t nb2 = ((int32_t *) opt0->data)[1]; + size_t nb3 = ((int32_t *) opt0->data)[2]; + size_t offset = ((int32_t *) opt0->data)[3]; + bool inplace = (bool) ((int32_t *) opt0->data)[4]; + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; + + // src0 and dst as viewed during acc + const size_t nb0 = ggml_element_size(src0); + + const size_t nb00 = nb0; + const size_t nb01 = nb1; + const size_t nb02 = nb2; + const size_t nb03 = nb3; + + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); + + GGML_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); +#else + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + } +} + +static void ggml_compute_forward_acc( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_sub static void ggml_compute_forward_sub_f32( @@ -6556,18 +7675,68 @@ static void ggml_compute_forward_sub_f32( return; } - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; - for (int i = 0; i < n; i++) { - ggml_vec_sub_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + +#ifdef GGML_USE_ACCELERATE + vDSP_vsub( + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_sub_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; + } + } } } @@ -6602,18 +7771,70 @@ static void ggml_compute_forward_mul_f32( return; } - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; - for (int i = 0; i < n; i++) { - ggml_vec_mul_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_mul_f32); + + vDSP_vmul( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_mul_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); + } + } } } @@ -6648,18 +7869,68 @@ static void ggml_compute_forward_div_f32( return; } - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; - for (int i = 0; i < n; i++) { - ggml_vec_div_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + +#ifdef GGML_USE_ACCELERATE + vDSP_vdiv( + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_div_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); + } + } } } @@ -6764,6 +8035,49 @@ static void ggml_compute_forward_sqrt( } } + +// ggml_compute_forward_log + +static void ggml_compute_forward_log_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_log_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_log( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_log_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_sum static void ggml_compute_forward_sum_f32( @@ -6821,6 +8135,73 @@ static void ggml_compute_forward_sum( } } +// ggml_compute_forward_sum_rows + +static void ggml_compute_forward_sum_rows_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + GGML_ASSERT(ne0 == 1); + GGML_ASSERT(ne1 == ne01); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + for (int64_t i3 = 0; i3 < ne03; i3++) { + for (int64_t i2 = 0; i2 < ne02; i2++) { + for (int64_t i1 = 0; i1 < ne01; i1++) { + float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float row_sum = 0; + ggml_vec_sum_f32(ne00, &row_sum, src_row); + dst_row[0] = row_sum; + } + } + } +} + +static void ggml_compute_forward_sum_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_rows_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_mean static void ggml_compute_forward_mean_f32( @@ -6898,37 +8279,58 @@ static void ggml_compute_forward_repeat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_can_repeat(src0, dst)); + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - // TODO: implement support for rank > 2 tensors - assert(src0->ne[2] == 1); - assert(src0->ne[3] == 1); - assert( dst->ne[2] == 1); - assert( dst->ne[3] == 1); + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; - const int nc = dst->ne[0]; - const int nr = dst->ne[1]; - const int nc0 = src0->ne[0]; - const int nr0 = src0->ne[1]; - const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat - const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); // TODO: support for transposed / permuted tensors - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); // TODO: maybe this is not optimal? - for (int i = 0; i < nrr; i++) { - for (int j = 0; j < ncr; j++) { - for (int k = 0; k < nr0; k++) { - ggml_vec_cpy_f32(nc0, - (float *) ((char *) dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])), - (float *) ((char *) src0->data + ( k)*(src0->nb[1]))); + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_cpy_f32(ne00, + (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); + } + } + } + } } } } @@ -7281,6 +8683,70 @@ static void ggml_compute_forward_silu( } +// ggml_compute_forward_silu_back + +static void ggml_compute_forward_silu_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(grad)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src0, grad)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_backward_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1])), + (float *) ((char *) grad->data + i1*(grad->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( @@ -7435,6 +8901,195 @@ static void ggml_compute_forward_rms_norm( } +static void ggml_compute_forward_rms_norm_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; + + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const float eps = 1e-6f; // TODO: make this a parameter + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + // src1 is same shape as src0 => same indices + const int64_t i11 = i01; + const int64_t i12 = i02; + const int64_t i13 = i03; + + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); + + ggml_float sum_xx = 0.0; + ggml_float sum_xdz = 0.0; + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum_xx += (ggml_float)(x[i00] * x[i00]); + sum_xdz += (ggml_float)(x[i00] * dz[i00]); + } + + //const float mean = (float)(sum_xx)/ne00; + const float mean_eps = (float)(sum_xx)/ne00 + eps; + const float sum_eps = (float)(sum_xx) + eps*ne00; + //const float mean_xdz = (float)(sum_xdz)/ne00; + // we could cache rms from forward pass to improve performance. + // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. + //const float rms = sqrtf(mean_eps); + const float rrms = 1.0f / sqrtf(mean_eps); + //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) + + { + // z = rms_norm(x) + // + // rms_norm(src0) = + // scale( + // src0, + // div( + // 1, + // sqrt( + // add( + // scale( + // sum( + // sqr( + // src0)), + // (1.0/N)), + // eps)))); + + // postorder: + // ## op args grad + // 00 param src0 grad[#00] + // 01 const 1 + // 02 sqr (#00) grad[#02] + // 03 sum (#02) grad[#03] + // 04 const 1/N + // 05 scale (#03, #04) grad[#05] + // 06 const eps + // 07 add (#05, #06) grad[#07] + // 08 sqrt (#07) grad[#08] + // 09 div (#01,#08) grad[#09] + // 10 scale (#00,#09) grad[#10] + // + // backward pass, given grad[#10] + // #10: scale + // grad[#00] += scale(grad[#10],#09) + // grad[#09] += sum(mul(grad[#10],#00)) + // #09: div + // grad[#08] += neg(mul(grad[#09], div(#09,#08))) + // #08: sqrt + // grad[#07] += mul(grad[#08], div(0.5, #08)) + // #07: add + // grad[#05] += grad[#07] + // #05: scale + // grad[#03] += scale(grad[#05],#04) + // #03: sum + // grad[#02] += repeat(grad[#03], #02) + // #02: + // grad[#00] += scale(mul(#00, grad[#02]), 2.0) + // + // substitute and simplify: + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#02] = repeat(grad[#03], #02) + // grad[#02] = repeat(scale(grad[#05],#04), #02) + // grad[#02] = repeat(scale(grad[#07],#04), #02) + // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps)) + // a = b*c + d*e + // a = b*c*f/f + d*e*f/f + // a = (b*c*f + d*e*f)*(1/f) + // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c)) + // a = (b + d*e/c)*c + // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps) + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms + // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms + // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms + // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms + // a = (dz + x*div(-mean_xdz,mean_eps))*rrms + // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms) + // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + } + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // post-order: + // dx := x + // dx := scale(dx,-mean_xdz/mean_eps) + // dx := add(dx, dz) + // dx := scale(dx, rrms) + float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_vec_cpy_f32 (ne00, dx, x); + // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); + ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); + ggml_vec_acc_f32 (ne00, dx, dz); + ggml_vec_scale_f32(ne00, dx, rrms); + } + } + } +} + +static void ggml_compute_forward_rms_norm_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + // ggml_compute_forward_mul_mat #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) @@ -8137,8 +9792,17 @@ static void ggml_compute_forward_scale_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + const size_t nb01 = src0->nb[1]; + + const size_t nb1 = dst->nb[1]; + + for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v); + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + } + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); } } @@ -8159,6 +9823,115 @@ static void ggml_compute_forward_scale( } } +// ggml_compute_forward_set + +static void ggml_compute_forward_set_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + GGML_ASSERT(opt0->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(opt0) == 5); + + // view src0 and dst with these strides and data offset inbytes during set + // nb0 is implicitely element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) opt0->data)[0]; + size_t nb2 = ((int32_t *) opt0->data)[1]; + size_t nb3 = ((int32_t *) opt0->data)[2]; + size_t offset = ((int32_t *) opt0->data)[3]; + bool inplace = (bool) ((int32_t *) opt0->data)[4]; + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + const size_t nb12 = src1->nb[2]; + const size_t nb13 = src1->nb[3]; + + // src0 and dst as viewed during set + const size_t nb0 = ggml_element_size(src0); + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst)); + + GGML_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + } +} + +static void ggml_compute_forward_set( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_set_f32(params, src0, src1, opt0, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_cpy static void ggml_compute_forward_cpy( @@ -8353,22 +10126,210 @@ static void ggml_compute_forward_get_rows( //} } -// ggml_compute_forward_diag_mask_inf +// ggml_compute_forward_get_rows_back -static void ggml_compute_forward_diag_mask_inf_f32( +static void ggml_compute_forward_get_rows_back_f32_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 1); + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(opt0, dst)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + ggml_compute_forward_dup_same_cont(params, opt0, dst); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + } + } +} + +static void ggml_compute_forward_get_rows_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(opt0, dst)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) src0->data + i*src0->nb[1])); + } +} + + +static void ggml_compute_forward_get_rows_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_compute_forward_diag + +static void ggml_compute_forward_diag_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; + GGML_ASSERT(ne00 == ne0); + GGML_ASSERT(ne00 == ne1); + GGML_ASSERT(ne01 == 1); + GGML_ASSERT(ne02 == ne2); + GGML_ASSERT(ne03 == ne3); + + const int nb00 = src0->nb[0]; + //const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = 0; i1 < ne1; i1++) { + float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02); + for (int i0 = 0; i0 < i1; i0++) { + d[i0] = 0; + } + d[i1] = s[i1]; + for (int i0 = i1+1; i0 < ne0; i0++) { + d[i0] = 0; + } + } + } + } +} + +static void ggml_compute_forward_diag( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_diag_mask_inf + +static void ggml_compute_forward_diag_mask_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const float value) { + assert(params->ith == 0); + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 2); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) src1->data)[0]; + const bool inplace = (bool)((int32_t *) src1->data)[1]; + + if (!inplace) { + ggml_compute_forward_dup_same_cont(params, src0, dst); + } // TODO: handle transposed/permuted matrices @@ -8384,7 +10345,7 @@ static void ggml_compute_forward_diag_mask_inf_f32( for (int j = 0; j < nr; j++) { for (int i = n_past; i < nc; i++) { if (i > n_past + j) { - *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY; + *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; } } } @@ -8399,7 +10360,24 @@ static void ggml_compute_forward_diag_mask_inf( switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst); + ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_diag_mask_zero( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0); } break; default: { @@ -8438,44 +10416,44 @@ static void ggml_compute_forward_soft_max_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - float *p = (float *)((char *) dst->data + i1*dst->nb[1]); + float *sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float *dp = (float *)((char *) dst->data + i1*dst->nb[1]); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(p[i])); + assert(!isnan(sp[i])); } #endif float max = -INFINITY; - ggml_vec_max_f32(nc, &max, p); + ggml_vec_max_f32(nc, &max, sp); ggml_float sum = 0.0; uint16_t scvt; for (int i = 0; i < nc; i++) { - //printf("p[%3d] = %8.4f\n", i, p[i]); - if (p[i] == -INFINITY) { - p[i] = 0.0f; + if (sp[i] == -INFINITY) { + dp[i] = 0.0f; } else { - //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max); - ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max); + // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); sum += (ggml_float)val; - p[i] = val; + dp[i] = val; } } assert(sum > 0.0); sum = 1.0/sum; - ggml_vec_scale_f32(nc, p, sum); + ggml_vec_scale_f32(nc, dp, sum); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { - assert(!isnan(p[i])); - assert(!isinf(p[i])); + assert(!isnan(dp[i])); + assert(!isinf(dp[i])); } #endif } @@ -8658,8 +10636,8 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -8682,12 +10660,16 @@ static void ggml_compute_forward_rope_f32( //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - assert(nb0 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); const int ith = params->ith; const int nth = params->nth; const int nr = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT(n_dims <= nc); + GGML_ASSERT(n_dims % 2 == 0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8748,8 +10730,8 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -8772,12 +10754,16 @@ static void ggml_compute_forward_rope_f16( //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - assert(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); const int ith = params->ith; const int nth = params->nth; const int nr = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT(n_dims <= nc); + GGML_ASSERT(n_dims % 2 == 0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8854,6 +10840,217 @@ static void ggml_compute_forward_rope( } } +// ggml_compute_forward_rope_back + +static void ggml_compute_forward_rope_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // y = rope(x, src1) + // dx = rope_back(dy, src1) + // src0 is dy, src1 contains options + + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + //const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + const int nb3 = src0->nb[3]; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(10000.0, -2.0f/n_dims); + + const bool is_neox = mode & 2; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = (float)p; + + for (int i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + if (!is_neox) { + const float * const dy = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = dy[0]; + const float dy1 = dy[1]; + + dx[0] = dy0*cos_theta + dy1*sin_theta; + dx[1] = - dy0*sin_theta + dy1*cos_theta; + } else { + const float * const dy = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + + const float dy0 = dy[0]; + const float dy1 = dy[n_dims/2]; + + dx[0] = dy0*cos_theta + dy1*sin_theta; + dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta; + } + } + } + } + } +} + +static void ggml_compute_forward_rope_back_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // y = rope(x, src1) + // dx = rope_back(dy, src1) + // src0 is dy, src1 contains options + + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + //const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + const int nb3 = src0->nb[3]; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(ggml_fp16_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(10000.0, -2.0f/n_dims); + + const bool is_neox = mode & 2; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = (float)p; + + for (int i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + if (!is_neox) { + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[1]); + + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + } else { + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]); + + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + } + } + } + } + } +} + +static void ggml_compute_forward_rope_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_conv_1d_1s static void ggml_compute_forward_conv_1d_1s_f16_f32( @@ -10173,6 +12370,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_ADD1: + { + ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_ACC: + { + ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + } break; case GGML_OP_SUB: { ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); @@ -10193,10 +12398,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_sqrt(params, tensor->src0, tensor); } break; + case GGML_OP_LOG: + { + ggml_compute_forward_log(params, tensor->src0, tensor); + } break; case GGML_OP_SUM: { ggml_compute_forward_sum(params, tensor->src0, tensor); } break; + case GGML_OP_SUM_ROWS: + { + ggml_compute_forward_sum_rows(params, tensor->src0, tensor); + } break; case GGML_OP_MEAN: { ggml_compute_forward_mean(params, tensor->src0, tensor); @@ -10233,6 +12446,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_silu(params, tensor->src0, tensor); } break; + case GGML_OP_SILU_BACK: + { + ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_NORM: { ggml_compute_forward_norm(params, tensor->src0, tensor); @@ -10241,6 +12458,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rms_norm(params, tensor->src0, tensor); } break; + case GGML_OP_RMS_NORM_BACK: + { + ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_MUL_MAT: { ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); @@ -10249,6 +12470,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_SET: + { + ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + } break; case GGML_OP_CPY: { ggml_compute_forward_cpy(params, tensor->src0, tensor); @@ -10277,10 +12502,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_GET_ROWS_BACK: + { + ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + } break; + case GGML_OP_DIAG: + { + ggml_compute_forward_diag(params, tensor->src0, tensor); + } break; case GGML_OP_DIAG_MASK_INF: { ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_DIAG_MASK_ZERO: + { + ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_SOFT_MAX: { ggml_compute_forward_soft_max(params, tensor->src0, tensor); @@ -10289,6 +12526,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_ROPE_BACK: + { + ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_ALIBI: { ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); @@ -10357,6 +12598,48 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); } } break; + case GGML_OP_ADD1: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + src1->grad = ggml_add_impl(ctx, + src1->grad, + ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean + inplace); + } + } break; + case GGML_OP_ACC: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); + GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; + const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + + struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + + src1->grad = + ggml_add_impl(ctx, + src1->grad, + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), + src1->grad), + inplace); + } + } break; case GGML_OP_SUB: { if (src0->grad) { @@ -10408,31 +12691,57 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_mul(ctx, + ggml_scale(ctx, ggml_mul(ctx, src0, tensor->grad), - ggml_repeat(ctx, ggml_new_f32(ctx, 2.0f), src0)), + ggml_new_f32(ctx, 2.0f)), inplace); } } break; case GGML_OP_SQRT: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + tensor->grad, // this was not catched by test_grad because in test_grad tensor->grad is 1 + ggml_div(ctx, + ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), + tensor)), + inplace); + } + } break; + case GGML_OP_LOG: { if (src0->grad) { src0->grad = ggml_add_impl(ctx, src0->grad, ggml_div(ctx, - ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), - tensor), + tensor->grad, + src0), inplace); } } break; case GGML_OP_SUM: + { + if (src0->grad) { + src0->grad = + ggml_add1_impl(ctx, + src0->grad, + tensor->grad, + inplace); + } + } break; + case GGML_OP_SUM_ROWS: { if (src0->grad) { src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_repeat(ctx, tensor->grad, src0->grad), + ggml_repeat(ctx, + tensor->grad, + src0->grad), inplace); } } break; @@ -10442,11 +12751,44 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_REPEAT: { + // necessary for llama if (src0->grad) { + GGML_ASSERT(src0->n_dims == 1 || src0->n_dims == 2); + const int nc = tensor->ne[0]; + const int nr = tensor->ne[1]; + const int nc0 = src0->ne[0]; + const int nr0 = src0->ne[1]; + const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat + const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat + // tensor->grad [nc,nr,1,1] + // reshape [nc0,nc/nc0,nr0,nr/nr0] + // permute [nc0,nr0,nc/nc0,nr/nr0] + // substitute [nc0,nr0,ncr,nrr] + // reshape [nc0*nr0,ncr*nrr,1,1] + // transpose [ncr*nrr,nc0*nr0,1,1] + // sum rows [1,nc0*nr0,1,1] + // transpose [nc0*nr0,1,1] + // reshape [nc0,nr0,1,1] reshape_1d or reshape_2d + // add to src0->grad + + int64_t ne[4] = {nc0,ncr,nr0,nrr}; + + struct ggml_tensor* F00 = tensor->grad; + struct ggml_tensor* F01 = ggml_reshape (ctx, F00, ggml_new_tensor(ctx,tensor->grad->type,4,ne)); + struct ggml_tensor* F02 = ggml_permute (ctx, F01, 0,2,1,3); + struct ggml_tensor* F03 = ggml_cont (ctx, F02); + struct ggml_tensor* F04 = ggml_reshape_2d(ctx, F03, nc0*nr0, ncr*nrr); + struct ggml_tensor* F05 = ggml_transpose (ctx, F04); + struct ggml_tensor* F06 = ggml_cont (ctx, F05); + struct ggml_tensor* F07 = ggml_sum_rows (ctx, F06); + struct ggml_tensor* F08 = ggml_transpose (ctx, F07); + struct ggml_tensor* F09 = ggml_cont (ctx, F08); + struct ggml_tensor* F10 = ggml_reshape (ctx, F09, src0->grad); + src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_sum(ctx, tensor->grad), + F10, inplace); } } break; @@ -10500,6 +12842,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_SILU: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_silu_back(ctx, src0, tensor->grad), + inplace); + } + } break; + case GGML_OP_SILU_BACK: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -10508,68 +12860,372 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_RMS_NORM: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_rms_norm_back(ctx, src0, tensor->grad), + inplace); + } + } break; + case GGML_OP_RMS_NORM_BACK: { GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_MUL_MAT: { + // https://cs231n.github.io/optimization-2/#staged + // # forward pass + // s0 = np.random.randn(5, 10) + // s1 = np.random.randn(10, 3) + // t = s0.dot(s1) + + // # now suppose we had the gradient on t from above in the circuit + // dt = np.random.randn(*t.shape) # same shape as t + // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix + // ds1 = t.T.dot(dt) + + // tensor.shape [m,p] + // src0.shape [n,m] + // src1.shape [n,p] + + // necessary for llama if (src0->grad) { // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad); - GGML_ASSERT(false); + src0->grad = + ggml_add_impl(ctx, + src0->grad, + // ds0 = dt.dot(s1.T) + // ggml_out_prod(ctx, // [n,m] + // src1, // [n,p] + // tensor->grad), // [m,p] + // for now just using A*B==(B.T*A.T).T + ggml_cont(ctx, // [n,m] + ggml_transpose(ctx, // [n,m] + ggml_mul_mat(ctx, // [m,n] + ggml_cont(ctx, // [p,m] + ggml_transpose(ctx, // [p,m] + tensor->grad)), // [m,p] + ggml_cont(ctx, // [p,n] + ggml_transpose(ctx, // [p,n] + src1))))), // [n,p] + inplace); } if (src1->grad) { src1->grad = ggml_add_impl(ctx, src1->grad, - ggml_mul_mat(ctx, - ggml_cont(ctx, ggml_transpose(ctx, src0)), - tensor->grad), + // ds1 = s0.T.dot(dt): + ggml_mul_mat(ctx, // [n,p] + ggml_cont(ctx, // [m,n] + ggml_transpose(ctx, src0)), // [m,n] + tensor->grad), // [m,p] inplace); } } break; case GGML_OP_SCALE: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_scale_impl(ctx, tensor->grad, src1, false), + inplace); + } + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), + inplace); + } + } break; + case GGML_OP_SET: + { + GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); + GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; + const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + + struct ggml_tensor * tensor_grad_view = NULL; + + if (src0->grad || src1->grad) { + GGML_ASSERT(src0->type == tensor->type); + GGML_ASSERT(tensor->grad->type == tensor->type); + GGML_ASSERT(tensor->grad->type == src1->grad->type); + + tensor_grad_view = ggml_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + } + + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_acc_impl(ctx, + tensor->grad, + ggml_neg(ctx, tensor_grad_view), + nb1, nb2, nb3, offset, false), + inplace); + } + + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), + src1->grad), + inplace); + } } break; case GGML_OP_CPY: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + // cpy overwrites value of src1 by src0 and returns view(src1) + // the overwriting is mathematically equivalent to: + // tensor = src0 * 1 + src1 * 0 + if (src0->grad) { + // dsrc0 = dtensor * 1 + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + // dsrc1 = dtensor * 0 -> noop + } } break; case GGML_OP_CONT: { - GGML_ASSERT(false); // TODO: not implemented + // same as cpy + if (src0->grad) { + GGML_ASSERT(ggml_is_contiguous(src0->grad)); + GGML_ASSERT(ggml_is_contiguous(tensor->grad)); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } } break; case GGML_OP_RESHAPE: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_reshape(ctx, tensor->grad, src0->grad), + inplace); + } } break; case GGML_OP_VIEW: { - GGML_ASSERT(false); // not supported + // necessary for llama + if (src0->grad) { + size_t offset; + memcpy(&offset, tensor->padding, sizeof(offset)); + + size_t nb1 = tensor->nb[1]; + size_t nb2 = tensor->nb[2]; + size_t nb3 = tensor->nb[3]; + + if (src0->type != src0->grad->type) { + // gradient is typically F32, but src0 could be other type + size_t ng = ggml_element_size(src0->grad); + size_t n0 = ggml_element_size(src0); + GGML_ASSERT(offset % n0 == 0); + GGML_ASSERT(nb1 % n0 == 0); + GGML_ASSERT(nb2 % n0 == 0); + GGML_ASSERT(nb3 % n0 == 0); + offset = (offset / n0) * ng; + nb1 = (nb1 / n0) * ng; + nb2 = (nb2 / n0) * ng; + nb3 = (nb3 / n0) * ng; + } + + src0->grad = ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); + } } break; case GGML_OP_PERMUTE: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + if (src0->grad) { + int axis0 = tensor->padding[0] & 0x3; + int axis1 = tensor->padding[1] & 0x3; + int axis2 = tensor->padding[2] & 0x3; + int axis3 = tensor->padding[3] & 0x3; + int axes_backward[4] = {0,0,0,0}; + axes_backward[axis0] = 0; + axes_backward[axis1] = 1; + axes_backward[axis2] = 2; + axes_backward[axis3] = 3; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_permute(ctx, + tensor->grad, + axes_backward[0], + axes_backward[1], + axes_backward[2], + axes_backward[3]), + inplace); + } } break; case GGML_OP_TRANSPOSE: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_transpose(ctx, tensor->grad), + inplace); + } } break; case GGML_OP_GET_ROWS: + { + // necessary for llama (only for tokenizer) + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), + inplace); + } + if (src1->grad) { + // noop + } + } break; + case GGML_OP_GET_ROWS_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_DIAG: { GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_DIAG_MASK_INF: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + if (src0->grad) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 2); + const int n_past = ((int32_t *) src1->data)[0]; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + inplace); + } + if (src1->grad) { + // noop + } + } break; + case GGML_OP_DIAG_MASK_ZERO: + { + // necessary for llama + if (src0->grad) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 2); + const int n_past = ((int32_t *) src1->data)[0]; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + inplace); + } + if (src1->grad) { + // noop + } } break; case GGML_OP_SOFT_MAX: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + if (src0->grad) { + // y = softmax(x) + // + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.*y + // dx = J * dy + // dxk = sum(Jkj * dyk) + + int64_t ne2[4] = { + tensor->ne[0], + 1, + tensor->ne[1]*tensor->ne[2], + tensor->ne[3] + }; + struct ggml_tensor * tensor2 = ggml_cont(ctx, + ggml_reshape_4d(ctx, + ggml_cont(ctx, tensor), + ne2[0], ne2[1], ne2[2], ne2[3])); + + struct ggml_tensor * grad2 = ggml_cont(ctx, + ggml_reshape_4d(ctx, + ggml_cont(ctx, tensor->grad), + ne2[0], ne2[1], ne2[2], ne2[3])); + + struct ggml_tensor * tensor2_t = ggml_cont(ctx, // [1,ne0,ne1*ne2,ne3] + ggml_permute(ctx, // [1,ne0,ne1*ne2,ne3] + tensor2, // [ne0,1,ne1*ne2,ne3] + 1, 0, 2, 3)); + + src0->grad = + ggml_add_impl(ctx, + src0->grad, // [ne0,ne1,ne2,ne3] + ggml_reshape(ctx, // [ne0,ne1,ne2,ne3] + ggml_mul_mat(ctx, // [ne0,1,ne1*ne2,ne3] + ggml_sub(ctx, // [ne0,ne0,ne1*ne2,ne3] + ggml_diag(ctx, // [ne0,ne0,ne1*ne2,ne3] + tensor2), // [ne0,1,ne1*ne2,ne3] + ggml_mul_mat(ctx, // [ne0,ne0,ne1*ne2,ne3] + tensor2_t, // [1,ne0,ne1*ne2,ne3] + tensor2_t)), // [1,ne0,ne1*ne2,ne3] + grad2), // [ne0,1,ne1*ne2,ne3] + src0->grad), + inplace); + } } break; case GGML_OP_ROPE: { - GGML_ASSERT(false); // TODO: not implemented + // necessary for llama + if (src0->grad) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_rope_back(ctx, + tensor->grad, + n_past, + n_dims, + mode), + inplace); + } + if (src1->grad) { + // noop + } + } break; + case GGML_OP_ROPE_BACK: + { + if (src0->grad) { + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_rope(ctx, + tensor->grad, + n_past, + n_dims, + mode), + inplace); + } + if (src1->grad) { + // noop + } } break; case GGML_OP_CONV_1D_1S: { @@ -10927,6 +13583,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) work_size = MAX(work_size, cur); } break; case GGML_OP_ADD: + case GGML_OP_ADD1: { node->n_tasks = n_threads; @@ -10936,6 +13593,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + node->n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; + } + work_size = MAX(work_size, cur); } break; case GGML_OP_SUB: @@ -10943,7 +13612,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_DIV: case GGML_OP_SQR: case GGML_OP_SQRT: + case GGML_OP_LOG: case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: case GGML_OP_REPEAT: case GGML_OP_ABS: @@ -10962,8 +13633,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = n_threads; } break; + case GGML_OP_SILU_BACK: + { + node->n_tasks = n_threads; + } break; case GGML_OP_NORM: case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: { node->n_tasks = n_threads; } break; @@ -11029,21 +13705,23 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = n_threads; } break; + case GGML_OP_SET: case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_INF: + case GGML_OP_DIAG_MASK_ZERO: { node->n_tasks = 1; } break; case GGML_OP_SOFT_MAX: - { - node->n_tasks = n_threads; - } break; case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: { node->n_tasks = n_threads; } break; @@ -12180,7 +14858,7 @@ enum ggml_opt_result ggml_opt( // build forward + backward compute graphs struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false); + struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, true); switch (params.type) { case GGML_OPT_ADAM: diff --git a/ggml.h b/ggml.h index bb9a025..2745fb3 100644 --- a/ggml.h +++ b/ggml.h @@ -192,7 +192,7 @@ #define GGML_MAX_DIMS 4 #define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 16 +#define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_OPT 4 #define GGML_DEFAULT_N_THREADS 4 @@ -262,12 +262,16 @@ extern "C" { GGML_OP_DUP, GGML_OP_ADD, + GGML_OP_ADD1, + GGML_OP_ACC, GGML_OP_SUB, GGML_OP_MUL, GGML_OP_DIV, GGML_OP_SQR, GGML_OP_SQRT, + GGML_OP_LOG, GGML_OP_SUM, + GGML_OP_SUM_ROWS, GGML_OP_MEAN, GGML_OP_REPEAT, GGML_OP_ABS, @@ -277,12 +281,15 @@ extern "C" { GGML_OP_RELU, GGML_OP_GELU, GGML_OP_SILU, + GGML_OP_SILU_BACK, GGML_OP_NORM, // normalize GGML_OP_RMS_NORM, + GGML_OP_RMS_NORM_BACK, GGML_OP_MUL_MAT, GGML_OP_SCALE, + GGML_OP_SET, GGML_OP_CPY, GGML_OP_CONT, GGML_OP_RESHAPE, @@ -290,9 +297,13 @@ extern "C" { GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, + GGML_OP_GET_ROWS_BACK, + GGML_OP_DIAG, GGML_OP_DIAG_MASK_INF, + GGML_OP_DIAG_MASK_ZERO, GGML_OP_SOFT_MAX, GGML_OP_ROPE, + GGML_OP_ROPE_BACK, GGML_OP_ALIBI, GGML_OP_CONV_1D_1S, GGML_OP_CONV_1D_2S, @@ -496,6 +507,29 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + GGML_API struct ggml_tensor * ggml_sub( struct ggml_context * ctx, struct ggml_tensor * a, @@ -519,12 +553,24 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + // return scalar - // TODO: compute sum along rows GGML_API struct ggml_tensor * ggml_sum( struct ggml_context * ctx, struct ggml_tensor * a); + // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); + // mean along rows GGML_API struct ggml_tensor * ggml_mean( struct ggml_context * ctx, @@ -566,6 +612,13 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // normalize along rows // TODO: eps is hardcoded to 1e-5 for now GGML_API struct ggml_tensor * ggml_norm( @@ -576,6 +629,13 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // A: m rows, n columns // B: p rows, n columns (i.e. we transpose it internally) // result is m columns, p rows @@ -588,12 +648,66 @@ extern "C" { // operations on tensors without backpropagation // - // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_scale( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + // a -> b, return view(b) GGML_API struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, @@ -614,6 +728,11 @@ extern "C" { // return view(a) // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + GGML_API struct ggml_tensor * ggml_reshape_2d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -629,6 +748,14 @@ extern "C" { int64_t ne1, int64_t ne2); + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + // offset in bytes GGML_API struct ggml_tensor * ggml_view_1d( struct ggml_context * ctx, @@ -654,6 +781,18 @@ extern "C" { size_t nb2, // slice stride in bytes size_t offset); + GGML_API struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + GGML_API struct ggml_tensor * ggml_permute( struct ggml_context * ctx, struct ggml_tensor * a, @@ -672,20 +811,50 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); + // set elements above the diagonal to -INF - // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_diag_mask_inf( struct ggml_context * ctx, struct ggml_tensor * a, int n_past); // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // set elements above the diagonal to 0 + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + GGML_API struct ggml_tensor * ggml_soft_max( struct ggml_context * ctx, struct ggml_tensor * a); - // rotary position embedding // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // rotary position embedding // if mode & 1 == 1, skip n_past elements // if mode & 2 == 1, GPT-NeoX style // TODO: avoid creating a new tensor every time @@ -696,6 +865,23 @@ extern "C" { int n_dims, int mode); + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode); + + // rotary position embedding backward, i.e compute dx from dy + // a - dy + GGML_API struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode); + // alibi position embedding // in-place, returns view(a) struct ggml_tensor * ggml_alibi( @@ -740,13 +926,13 @@ extern "C" { GGML_API struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, - const ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun); GGML_API struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - const ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun); // // automatic differentiation diff --git a/llama.cpp b/llama.cpp index e564de7..08c7352 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1128,8 +1128,8 @@ static bool llama_eval_internal( // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); ggml_set_name(Qcur, "Qcur"); ggml_set_name(Kcur, "Kcur"); @@ -1170,17 +1170,19 @@ static bool llama_eval_internal( struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); ggml_set_name(KQ_soft_max, "KQ_soft_max"); + // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -1281,7 +1283,7 @@ static bool llama_eval_internal( lctx.use_buf(ctx0, -1); // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); + //inpL = ggml_soft_max_inplace(ctx0, inpL); // run the computation ggml_build_forward_expand(&gf, inpL); @@ -2375,7 +2377,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - BA = ggml_scale(lora_ctx, BA, scale_tensor); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); } ggml_tensor * r; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6456485..4171c12 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,3 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) +# llama_add_test(test-grad0.c) # SLOW +# llama_add_test(test-opt.c) # SLOW diff --git a/tests/test-grad0.c b/tests/test-grad0.c new file mode 100644 index 0000000..ec50592 --- /dev/null +++ b/tests/test-grad0.c @@ -0,0 +1,1131 @@ +#include "ggml.h" + +#include +#include +#include +#include + +#define MAX_NARGS 2 + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define GGML_SILU_FP16 + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + +float frand(void) { + return (float)rand()/(float)RAND_MAX; +} + +int irand(int n) { + if (n == 0) return 0; + else return rand()%n; +} + +void get_random_dims(int64_t * dims, int ndims) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = 1 + irand(4); + } +} + +struct ggml_tensor * get_random_tensor( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +struct ggml_tensor * get_random_tensor_int( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + int32_t imin, + int32_t imax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i0] = irand(imax - imin) + imin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +float get_element(const struct ggml_tensor * t, int idx) { + if (t->type == GGML_TYPE_F32) { + return ((float *)t->data)[idx]; + } else if (t->type == GGML_TYPE_I32) { + return ((int32_t *)t->data)[idx]; + } else { + assert(false); + return INFINITY; + } +} + +void set_element(struct ggml_tensor * t, int idx, float value) { + ((float *)t->data)[idx] = value; +} + +void print_elements(const char* label, const struct ggml_tensor * t) { + if (!t) { + printf("%s: %s = null\n", __func__, label); + return; + } + const int nelements = ggml_nelements(t); + printf("%s: %s = [", __func__, label); + for (int k = 0; k < nelements; ++k) { + if (k > 0) { printf(", "); } + printf("%.5f", get_element(t, k)); + } + printf("] shape: ["); + for (int k = 0; k < t->n_dims; ++k) { + if (k > 0) { printf(", "); } + printf("%d", (int)t->ne[k]); + } + printf("]\n"); + +} + +bool check_gradient( + const char * op_name, + struct ggml_context * ctx0, + struct ggml_tensor * x[], + struct ggml_tensor * f, + int ndims, + int nargs, + float eps, + float max_error_abs, + float max_error_rel) { + + struct ggml_cgraph gf = ggml_build_forward (f); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_graph_compute(ctx0, &gf); + ggml_graph_reset (&gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(ctx0, &gb); + + // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); + // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); + + for (int i = 0; i < nargs; ++i) { + const int nelements = ggml_nelements(x[i]); + for (int k = 0; k < nelements; ++k) { + // compute gradient using finite differences + const float x0 = get_element(x[i], k); + const float xm = x0 - eps; + const float xp = x0 + eps; + set_element(x[i], k, xp); + ggml_graph_compute(ctx0, &gf); + + const float f0 = ggml_get_f32_1d(f, 0); + + set_element(x[i], k, xm); + ggml_graph_compute(ctx0, &gf); + + const float f1 = ggml_get_f32_1d(f, 0); + + const float g0 = (f0 - f1)/(2.0f*eps); + + set_element(x[i], k, x0); + + // compute gradient using backward graph + ggml_graph_reset (&gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(ctx0, &gb); + + const float g1 = get_element(x[i]->grad, k); + + const float error_abs = fabsf(g0 - g1); + const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0; + + if (error_abs > max_error_abs || error_rel > max_error_rel) { + printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", + op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel); + //assert(false); + return false; + } + } + } + + return true; +} + +// TODO: clean-up this .. +bool check_mat_mul( + const struct ggml_tensor * y, + const struct ggml_tensor * x0, + const struct ggml_tensor * x1) { + float * dst = (float *) y->data; + float * src0 = (float *) x0->data; + float * src1 = (float *) x1->data; + + const int nc = x0->ne[1]; + const int nr = x1->ne[1]; + const int nk = x0->ne[0]; + + GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk); + + GGML_PRINT_DEBUG("x0:\n"); + for (int j = 0; j < x0->ne[1]; ++j) { + for (int i = 0; i < x0->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]); + } + GGML_PRINT_DEBUG("\n"); + } + GGML_PRINT_DEBUG("\n"); + + GGML_PRINT_DEBUG("x1:\n"); + for (int j = 0; j < x1->ne[1]; ++j) { + for (int i = 0; i < x1->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]); + } + GGML_PRINT_DEBUG("\n"); + } + GGML_PRINT_DEBUG("\n"); + + GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]); + for (int j = 0; j < y->ne[1]; ++j) { + for (int i = 0; i < y->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]); + } + GGML_PRINT_DEBUG("\n"); + } + + for (int i = 0; i < nr; ++i) { + for (int j = 0; j < nc; ++j) { + float sum = 0.0f; + + for (int k = 0; k < nk; ++k) { + sum += src0[j*nk + k]*src1[i*nk + k]; + } + + if (fabsf(dst[i*nc + j] - sum) > 1e-5f) { + fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum); + assert(false); + return false; + } + } + } + + return true; +} + +#define NUM_PERMUTATIONS (4*3*2*1) + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 128*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + int64_t ne[4]; + + int all_permutations[4 * NUM_PERMUTATIONS]; + { + int count = 0; + for (int ax0=0; ax0<4; ++ax0) { + for (int ax1=0; ax1<4; ++ax1) { + if (ax1 == ax0) continue; + for (int ax2=0; ax2<4; ++ax2) { + if (ax2 == ax0) continue; + if (ax2 == ax1) continue; + for (int ax3=0; ax3<4; ++ax3) { + if (ax3 == ax0) continue; + if (ax3 == ax1) continue; + if (ax3 == ax2) continue; + assert(count < NUM_PERMUTATIONS); + all_permutations[count*4+0] = ax0; + all_permutations[count*4+1] = ax1; + all_permutations[count*4+2] = ax2; + all_permutations[count*4+3] = ax3; + ++count; + } + } + } + } + } + + + // original loop: 1000 + int niter = 4; + const char *env = getenv("GGML_NLOOP"); + if (env != NULL) { + niter = atoi(env); + } + if (argc > 1) { + niter = atoi(argv[1]); + } + for (int iter = 0; iter < niter; ++iter) { + printf("test-grad0: iter:%d/%d\n", iter, niter); + struct ggml_context * ctx0 = ggml_init(params); + + get_random_dims(ne, 4); + + struct ggml_tensor * x[MAX_NARGS]; + + // add + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); + + check_gradient("add", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f); + } + } + + // sub + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1])); + + check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // mul + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1])); + + check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // div + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, 0.5f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1])); + + check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f); + } + } + + // sqr + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0])); + + check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // sqrt + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); + + check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); + } + } + + // log + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0])); + + check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); + } + } + + // sum + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, x[0]); + + check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + + // sum_rows + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0]))); + + check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + } + + // repeat + { + int64_t ne2[4]; + get_random_dims(ne2, 4); + + ne2[0] = ne[0] * ne2[0]; + ne2[1] = ne[1] * ne2[1]; + ne2[2] = 1; + ne2[3] = 1; + + const int nargs = 1; + for (int ndims = 1; ndims <= 2; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1])))); + + check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + + } + + // abs (finite differences do not work) + //{ + // const int nargs = 1; + + // for (int ndims = 1; ndims <= 2; ++ndims) { + // for (int i = 0; i < nargs; ++i) { + // x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + // ggml_set_param(ctx0, x[i]); + // } + + // struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0])); + + // check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f); + // } + //} + + // mul_mat + { + const int nargs = 2; + + for (int ndims = 2; ndims <= 2; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + { + int64_t ne2[4]; + get_random_dims(ne2, 4); + ne2[0] = ne[0]; + x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); + } + + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + + struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); + struct ggml_tensor * f = ggml_sum(ctx0, m); + + GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims); + + check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + check_mat_mul(m, x[1], x[0]); + } + } + + // silu + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0])); + +#ifdef GGML_SILU_FP16 + // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds. + check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY); +#else + check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); +#endif + } + } + + // rms_norm + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0])); + + check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY); + } + } + + // scale + { + const int nargs = 2; + + int64_t ne2[4]; + ne2[0] = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1])); + + check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // cpy + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + // x[1] is overwritten by x[0], so the gradients don't propagate to x[1] + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); + + check_gradient("cpy", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // reshape (1d->nd) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + int64_t ne2[4]; + ne2[0] = 1; + ne2[1] = 1; + ne2[2] = 1; + ne2[3] = 1; + for (int i = 0; i < ndims; ++i) { + ne2[0] *= ne[i]; + } + x[0] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); + x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); + check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // reshape (nd->1d) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + int64_t ne2[4]; + ne2[0] = 1; + ne2[1] = 1; + ne2[2] = 1; + ne2[3] = 1; + for (int i = 0; i < ndims; ++i) { + ne2[0] *= ne[i]; + } + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); + check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 1d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + + const int nargs = 2; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 1); + while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 1); + } + + x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); + const int offset = irand(max_offset) * ggml_element_size(x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 2d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 2; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 2); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 2); + } + + x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + const int offset = offsets[0] + offsets[1]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 3d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 3; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 3); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 3); + } + + x[1] = get_random_tensor(ctx0, 3, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + offsets[2] = irand(max_offsets[2]) * x[0]->nb[2]; + const int offset = offsets[0] + offsets[1] + offsets[2]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 4d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 4; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 4); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 4); + } + + x[1] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); + max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + offsets[2] = irand(max_offsets[2]) * x[0]->nb[2]; + offsets[3] = irand(max_offsets[3]) * x[0]->nb[3]; + const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // set_1d + { + int64_t ne2[4]; + + const int nargs = 2; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 1); + while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 1); + } + + x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); + const int offset = irand(max_offset) * ggml_element_size(x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset)); + + check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // set_2d + { + int64_t ne2[4]; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 1; + for (int ndims = 2; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 2); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 2); + } + + x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + const int offset = offsets[0] + offsets[1]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset)); + + check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_1d + { + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + + const int k0 = irand(ggml_nelements(x[0])); + const int k1 = irand(ggml_nelements(x[0])); + const int i0 = MIN(k0, k1); + const int i1 = MAX(k0, k1); + + const int offset = i0 * sizeof(float); + const int nelem = i1 - i0; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset)); + + check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_2d + { + int64_t ne2[4]; + int64_t nb2[4]; + + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + + get_random_dims(ne2, 2); + while (ne2[0]*ne2[1] > ggml_nelements(x[0])) { + get_random_dims(ne2, 2); + } + const int count = ne2[0]*ne2[1]; + + nb2[0] = sizeof(float); + nb2[1] = nb2[0]*ne2[0]; + + ggml_set_param(ctx0, x[0]); + + const int max_offset = ggml_nelements(x[0]) - count; + const int offset = irand(max_offset+1) * sizeof(float); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset)); + + check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_3d + { + int64_t ne2[4] = {1,1,1,1}; + int64_t nb2[4] = {0,0,0,0}; + + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + + get_random_dims(ne2, 3); + while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) { + get_random_dims(ne2, 3); + } + const int count = ne2[0]*ne2[1]*ne2[2]; + + nb2[0] = sizeof(float); + nb2[1] = nb2[0]*ne2[0]; + nb2[2] = nb2[1]*ne2[1]; + + ggml_set_param(ctx0, x[0]); + + const int max_offset = ggml_nelements(x[0]) - count; + const int offset = irand(max_offset+1) * sizeof(float); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset)); + + check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // permute + { + int64_t ne2[4]; + + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) + { + // ggml_permute will set axes of dimensions below n_dims to 1. + // to make ggml_permute work correctly on all axes, + // the input tensor needs maximal n_dim of 4. + for (int i=0; i +#include +#include +#include + +#define MAX_NARGS 2 + + +// +// logging +// +#define GGML_DEBUG 0 +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + + +float frand() { + return (float)rand()/(float)RAND_MAX; +} + +int irand(int n) { + return rand()%n; +} + +void get_random_dims(int64_t * dims, int ndims) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = 1 + irand(4); + } +} + +void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = min + irand(max-min); + } +} + + +struct ggml_tensor * get_random_tensor( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +float get_element(const struct ggml_tensor * t, int idx) { + return ((float *)t->data)[idx]; +} + +void set_element(struct ggml_tensor * t, int idx, float value) { + ((float *)t->data)[idx] = value; +} + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 1024*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + struct ggml_context * ctx = ggml_init(params); + + int64_t ne1[4] = {4, 1024, 1, 1}; + int64_t ne2[4] = {4, 2048, 1, 1};; + int64_t ne3[4] = {1024, 2048, 1, 1}; + + struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1); + struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1); + ggml_set_param(ctx, a); + ggml_set_param(ctx, b); + + struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1); + + struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b); + struct ggml_tensor * d = ggml_sub(ctx, c, ab); + struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d)); + + + struct ggml_cgraph ge = ggml_build_forward(e); + ggml_graph_reset (&ge); + ggml_graph_compute(ctx, &ge); + const float fe = ggml_get_f32_1d(e, 0); + printf("%s: e = %.4f\n", __func__, fe); + + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + + ggml_opt(ctx, opt_params, e); + + ggml_graph_reset (&ge); + ggml_graph_compute(ctx, &ge); + const float fe_opt = ggml_get_f32_1d(e, 0); + printf("%s: original e = %.4f\n", __func__, fe); + printf("%s: optimized e = %.4f\n", __func__, fe_opt); + + const bool success = (fe_opt <= fe); + assert(success); + + ggml_free(ctx); + return success ? 0 : -1; +} +// int64_t ne1[4] = {4, 128, 1, 1}; +// int64_t ne2[4] = {4, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 25890.9375 +// main: optimized e = 10094.7031 + +// int64_t ne1[4] = {8, 128, 1, 1}; +// int64_t ne2[4] = {8, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 39429.5078 +// main: optimized e = 9275.8936 + +// int64_t ne1[4] = {16, 128, 1, 1}; +// int64_t ne2[4] = {16, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 68371.1328 +// main: optimized e = 7854.4502 + + +// int64_t ne1[4] = {32, 128, 1, 1}; +// int64_t ne2[4] = {32, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 126061.1953 +// main: optimized e = 5451.0166 + +// int64_t ne1[4] = {4, 1024, 1, 1}; +// int64_t ne2[4] = {4, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 1620817.8750 +// main: optimized e = 698387.6875 + +// another run on M1 +// int64_t ne1[4] = {4, 1024, 1, 1}; +// int64_t ne2[4] = {4, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 1629595.6250 +// main: optimized e = 698169.1250 + +// int64_t ne1[4] = {32, 1024, 1, 1}; +// int64_t ne2[4] = {32, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 8146770.5000 +// main: optimized e = 651119.1250