22 lines
509 B
Go
22 lines
509 B
Go
package codec
|
|
|
|
import "github.com/dlclark/regexp2"
|
|
|
|
func NewP50kEdit() *Codec {
|
|
p50kBaseVocabOnce.Do(p50kBaseVocabInit)
|
|
|
|
splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None)
|
|
|
|
return &Codec{
|
|
name: "p50k_edit",
|
|
vocabulary: p50kBaseVocab,
|
|
splitRegexp: splitRegexp,
|
|
specialTokens: map[string]uint{
|
|
"<|endoftext|>": 50256,
|
|
"<|fim_prefix|>": 50281,
|
|
"<|fim_middle|>": 50282,
|
|
"<|fim_suffix|>": 50283,
|
|
},
|
|
}
|
|
}
|