From e4422e299c10c7e84c8e987770ef40d31905a76b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Apr 2023 18:15:39 +0300 Subject: [PATCH] ggml : better PERF prints + support "LLAMA_PERF=1 make" --- Makefile | 4 ++++ ggml.c | 4 ++-- llama.cpp | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index b297959..0c7b654 100644 --- a/Makefile +++ b/Makefile @@ -117,6 +117,10 @@ ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg endif +ifdef LLAMA_PERF + CFLAGS += -DGGML_PERF + CXXFLAGS += -DGGML_PERF +endif ifneq ($(filter aarch64%,$(UNAME_M)),) CFLAGS += -mcpu=native CXXFLAGS += -mcpu=native diff --git a/ggml.c b/ggml.c index 3ee2d08..23dae2d 100644 --- a/ggml.c +++ b/ggml.c @@ -11239,7 +11239,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { perf_total_per_op_us[node->op] += node->perf_time_us; - GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, @@ -11253,7 +11253,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * node = cgraph->leafs[i]; - GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n", + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", i, node->ne[0], node->ne[1], GGML_OP_LABEL[node->op]); diff --git a/llama.cpp b/llama.cpp index 34327ec..8c1d657 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1250,9 +1250,11 @@ static bool llama_eval_internal( ggml_build_forward_expand(&gf, inpL); ggml_graph_compute (ctx0, &gf); +#ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) // requires GGML_PERF to be defined - //ggml_graph_print(&gf); + ggml_graph_print(&gf); +#endif // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) {