22 lines
667 B
Go
22 lines
667 B
Go
package codec
|
|
|
|
import "github.com/dlclark/regexp2"
|
|
|
|
func NewO200kBase() *Codec {
|
|
o200kBaseVocabOnce.Do(o200kBaseVocabInit)
|
|
|
|
splitRegexp := regexp2.MustCompile(
|
|
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
|
regexp2.None)
|
|
|
|
return &Codec{
|
|
name: "o200k_base",
|
|
vocabulary: o200kBaseVocab,
|
|
splitRegexp: splitRegexp,
|
|
specialTokens: map[string]uint{
|
|
"<|endoftext|>": 199999,
|
|
"<|endofprompt|>": 200018,
|
|
},
|
|
}
|
|
}
|