mirror of
https://git.adityakumar.xyz/llama.cpp.git
synced 2024-11-09 23:29:44 +00:00
Fix usage of F16C intrinsics in AVX code (#563)
* Fix usage of F16C intrinsics in AVX code when F16C is not defined
This commit is contained in:
parent
7b8dbcb78b
commit
a6bdc47cba
1 changed files with 24 additions and 1 deletions
25
ggml.c
25
ggml.c
|
@ -1122,13 +1122,36 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
|
||||||
#define GGML_F16_EPR 8
|
#define GGML_F16_EPR 8
|
||||||
|
|
||||||
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
||||||
// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32
|
|
||||||
|
|
||||||
#define GGML_F32Cx8 __m256
|
#define GGML_F32Cx8 __m256
|
||||||
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
|
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
|
||||||
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
|
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
|
||||||
|
|
||||||
|
#if defined(__F16C__)
|
||||||
|
// the _mm256_cvt intrinsics require F16C
|
||||||
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
|
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
|
||||||
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
||||||
|
#else
|
||||||
|
// Fallback half-precision load for AVX builds where the F16C conversion
// intrinsics are unavailable.
//
// Reads 8 consecutive ggml_fp16_t values starting at `x`, widens each one to
// float with GGML_FP16_TO_FP32, and returns them packed in a single __m256.
// `x` does not need any particular alignment; the scratch buffer is loaded
// with an unaligned load.
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
    float widened[8];

    for (int lane = 0; lane < 8; ++lane) {
        widened[lane] = GGML_FP16_TO_FP32(x[lane]);
    }

    return _mm256_loadu_ps(widened);
}
|
||||||
|
// Fallback half-precision store for AVX builds where the F16C conversion
// intrinsics are unavailable.
//
// Spills the 8 float lanes of `y` to a scratch buffer, narrows each one to
// ggml_fp16_t, and writes the results to the 8 consecutive elements starting
// at `x`. `x` does not need any particular alignment.
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++) {
        // arr[] holds fp32 values and the destination is fp16, so the
        // conversion must be FP32 -> FP16 (the previous FP16_TO_FP32 call
        // converted in the wrong direction and stored garbage).
        x[i] = GGML_FP32_TO_FP16(arr[i]);
    }
}
|
||||||
|
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
|
||||||
|
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
||||||
#define GGML_F32Cx8_ADD _mm256_add_ps
|
#define GGML_F32Cx8_ADD _mm256_add_ps
|
||||||
#define GGML_F32Cx8_MUL _mm256_mul_ps
|
#define GGML_F32Cx8_MUL _mm256_mul_ps
|
||||||
|
|
Loading…
Reference in a new issue