23 lines
598 B
Go

package codec
import "github.com/dlclark/regexp2"
// NewCl100kBase returns a newly allocated Codec for the "cl100k_base"
// encoding.
//
// The returned Codec carries the package-level cl100kBaseVocab table, a
// freshly compiled regexp2 pattern used to split input text, and the five
// special tokens with their fixed ids.
//
// Every call compiles the pattern again and allocates a new special-token
// map; the vocabulary value is the same package-level one each time.
func NewCl100kBase() *Codec {
	// Run the vocabulary initializer through cl100kBaseVocabOnce before
	// cl100kBaseVocab is read below. Both are declared elsewhere in the
	// package (presumably a sync.Once guarding a one-time load — confirm).
	cl100kBaseVocabOnce.Do(cl100kBaseVocabInit)

	// Special tokens and their ids. The ids are not contiguous:
	// "<|endofprompt|>" is 100276 while the others are 100257-100260.
	special := map[string]uint{
		"<|endoftext|>":   100257,
		"<|fim_prefix|>":  100258,
		"<|fim_middle|>":  100259,
		"<|fim_suffix|>":  100260,
		"<|endofprompt|>": 100276,
	}

	// The pattern uses a negative lookahead, (?!\S), which is why it is
	// compiled with regexp2 rather than the standard library's RE2-based
	// regexp package. The alternatives are tried in order:
	//   - case-insensitive contraction suffixes ('s, 't, 're, 've, 'm, 'll, 'd)
	//   - a run of letters, optionally preceded by one character that is not
	//     a letter, digit, CR or LF
	//   - one to three digits
	//   - a run of non-space, non-letter, non-digit characters, optionally
	//     preceded by a single space and followed by any CR/LF characters
	//   - optional whitespace followed by one or more CR/LF characters
	//   - whitespace that is not followed by a non-whitespace character
	//   - any remaining whitespace
	const pattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	codec := new(Codec)
	codec.name = "cl100k_base"
	codec.vocabulary = cl100kBaseVocab
	// Must-style constructor: by convention it panics on an invalid pattern
	// instead of returning an error.
	codec.splitRegexp = regexp2.MustCompile(pattern, regexp2.None)
	codec.specialTokens = special
	return codec
}