mirror of
https://git.adityakumar.xyz/llama.cpp.git
synced 2024-11-09 15:29:43 +00:00
llama : support more diverse tokenizers? (#2420)
* supporting more diverse tokenizers * Update llama.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
d73b8d48b4
commit
ee1b497c98
1 changed files with 3 additions and 1 deletions
|
@ -1924,7 +1924,9 @@ struct llama_tokenizer {
|
|||
if (token == vocab_.token_to_id.end()) {
|
||||
// output any symbols that did not form tokens as bytes.
|
||||
for (int j = 0; j < (int) symbol.n; ++j) {
|
||||
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
||||
// NOTE: old version, before #2420 - not sure what are the implications of this
|
||||
//llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
||||
llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
|
||||
output.push_back(token_id);
|
||||
}
|
||||
} else {
|
||||
|
|
Loading…
Reference in a new issue