23 lines
598 B
Go
23 lines
598 B
Go
package codec
|
|
|
|
import "github.com/dlclark/regexp2"
|
|
|
|
func NewCl100kBase() *Codec {
|
|
cl100kBaseVocabOnce.Do(cl100kBaseVocabInit)
|
|
|
|
splitRegexp := regexp2.MustCompile(`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, regexp2.None)
|
|
|
|
return &Codec{
|
|
name: "cl100k_base",
|
|
vocabulary: cl100kBaseVocab,
|
|
splitRegexp: splitRegexp,
|
|
specialTokens: map[string]uint{
|
|
"<|endoftext|>": 100257,
|
|
"<|fim_prefix|>": 100258,
|
|
"<|fim_middle|>": 100259,
|
|
"<|fim_suffix|>": 100260,
|
|
"<|endofprompt|>": 100276,
|
|
},
|
|
}
|
|
}
|