22 lines
667 B
Go

package codec
import "github.com/dlclark/regexp2"
// NewO200kBase returns a new Codec named "o200k_base".
//
// It first invokes o200kBaseVocabOnce.Do(o200kBaseVocabInit) and only then
// reads o200kBaseVocab, so the vocabulary attached to the Codec is whatever
// that initializer left in place. The split pattern is compiled with regexp2
// on every call; MustCompile panics if the pattern does not compile.
func NewO200kBase() *Codec {
	// Split pattern for this encoding. It is runtime data: keep it
	// byte-for-byte identical, since any change alters how text is split.
	const pattern = `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	// Run the vocabulary initializer before o200kBaseVocab is read below.
	o200kBaseVocabOnce.Do(o200kBaseVocabInit)

	re := regexp2.MustCompile(pattern, regexp2.None)

	// Special token strings and the ids assigned to them.
	special := map[string]uint{
		"<|endoftext|>":   199999,
		"<|endofprompt|>": 200018,
	}

	return &Codec{
		name:          "o200k_base",
		vocabulary:    o200kBaseVocab,
		splitRegexp:   re,
		specialTokens: special,
	}
}