package tokenizer // Package tokenizer provides functions for encoding and decoding text using // different tokenization schemes. // // Encoding Formats // // The following encoding formats are supported: // - Cl100kBase // - R50kBase // - P50kBase // - P50kEdit // // Alternatively you can request a tokenizer using OpenAI's model name, the // following OpenAI models are supported: // - O1Preview // - O1Mini // - GPT4 // - GPT35Turbo // - TextEmbeddingAda002 // - TextDavinci003 // - TextDavinci002 // - CodeDavinci002 // - CodeDavinci001 // - CodeCushman002 // - CodeCushman001 // - DavinciCodex // - CushmanCodex // - TextDavinci001 // - TextCurie001 // - TextBabbage001 // - TextAda001 // - Davinci // - Curie // - Babbage // - Ada // - TextSimilarityDavinci001 // - TextSimilarityCurie001 // - TextSimilarityBabbage001 // - TextSimilarityAda001 // - TextSearchDavinciDoc001 // - TextSearchCurieDoc001 // - TextSearchAdaDoc001 // - TextSearchBabbageDoc001 // - CodeSearchBabbageCode001 // - CodeSearchAdaCode001 // - TextDavinciEdit001 // - CodeDavinciEdit001 // // Usage Example // // Here is an example of how to encode a string using the `ForModel` function: // // package main // // import ( // "fmt" // "github.com/tiktoken-go/tokenizer" // ) // // func main() { // enc, err := tokenizer.Get(tokenizer.Cl100kBase) // if err != nil { // panic("oh oh") // } // // // this should print a list of token ids // ids, token, _ := enc.Encode("supercalifragilistic") // fmt.Println(ids) // // // this should print the original string back // text, _ := enc.Decode(ids) // fmt.Println(text) //} import ( "errors" "strings" "github.com/tiktoken-go/tokenizer/codec" ) var ( ErrModelNotSupported = errors.New("model not supported") ErrEncodingNotSupported = errors.New("encoding not supported") ) type Codec interface { GetName() string Count(string) (int, error) Encode(string) ([]uint, []string, error) Decode([]uint) (string, error) } type Model string const ( O1Preview Model = "o1-preview" O1Mini Model = "o1-mini" O3Mini Model = "o3-mini" GPT4o Model = "gpt-4o" GPT4 Model = "gpt-4" GPT35Turbo Model = "gpt-3.5-turbo" GPT35 Model = "gpt-3.5" TextEmbeddingAda002 Model = "text-embedding-ada-002" TextDavinci003 Model = "text-davinci-003" TextDavinci002 Model = "text-davinci-002" CodeDavinci002 Model = "code-davinci-002" CodeDavinci001 Model = "code-davinci-001" CodeCushman002 Model = "code-cushman-002" CodeCushman001 Model = "code-cushman-001" DavinciCodex Model = "davinci-codex" CushmanCodex Model = "cushman-codex" TextDavinci001 Model = "text-davinci-001" TextCurie001 Model = "text-curie-001" TextBabbage001 Model = "text-babbage-001" TextAda001 Model = "text-ada-001" Davinci Model = "davinci" Curie Model = "curie" Babbage Model = "babbage" Ada Model = "ada" TextSimilarityDavinci001 Model = "text-similarity-davinci-001" TextSimilarityCurie001 Model = "text-similarity-curie-001" TextSimilarityBabbage001 Model = "text-similarity-babbage-001" TextSimilarityAda001 Model = "text-similarity-ada-001" TextSearchDavinciDoc001 Model = "text-search-davinci-doc-001" TextSearchCurieDoc001 Model = "text-search-curie-doc-001" TextSearchAdaDoc001 Model = "text-search-ada-doc-001" TextSearchBabbageDoc001 Model = "text-search-babbage-doc-001" CodeSearchBabbageCode001 Model = "code-search-babbage-code-001" CodeSearchAdaCode001 Model = "code-search-ada-code-001" TextDavinciEdit001 Model = "text-davinci-edit-001" CodeDavinciEdit001 Model = "code-davinci-edit-001" GPT2 Model = "gpt2" ) type Encoding string const ( GPT2Enc Encoding = "gpt2" R50kBase Encoding = "r50k_base" P50kBase Encoding = "p50k_base" P50kEdit Encoding = "p50k_edit" Cl100kBase Encoding = "cl100k_base" O200kBase Encoding = "o200k_base" ) var modelPrefixToEncoding map[Model]Encoding = map[Model]Encoding{ "o1-": O200kBase, // chat "chatgpt-4o-": O200kBase, "gpt-4o-": O200kBase, "gpt-4-": Cl100kBase, "gpt-3.5-turbo-": Cl100kBase, "gpt-35-turbo-": Cl100kBase, // fine-tuned "ft:gpt-4": Cl100kBase, "ft:gpt-3.5-turbo": Cl100kBase, "ft:davinci-002": Cl100kBase, "ft:babbage-002": Cl100kBase, } // Get returns a new instance of a Codec implementation based on the specified // encoding format. The returned Codec instance can be used to encode (tokenize) // and decode (reassemble) text. If the specified encoding is not supported, // an error is returned. func Get(encoding Encoding) (Codec, error) { switch encoding { case O200kBase: return codec.NewO200kBase(), nil case Cl100kBase: return codec.NewCl100kBase(), nil case R50kBase: return codec.NewR50kBase(), nil case P50kBase: return codec.NewP50kBase(), nil case P50kEdit: return codec.NewP50kEdit(), nil default: return nil, ErrEncodingNotSupported } } // ForModel returns a new instance of a Codec implementation based on the // specified OpenAI model. If the specified model is not supported, an error // is returned. func ForModel(model Model) (Codec, error) { switch model { case O1Preview, O1Mini, GPT4o, O3Mini: return Get(O200kBase) case GPT4, GPT35, GPT35Turbo, TextEmbeddingAda002: return Get(Cl100kBase) case TextDavinci003, TextDavinci002, CodeDavinci001, CodeDavinci002, CodeCushman002, CodeCushman001, DavinciCodex, CushmanCodex: return Get(P50kBase) case TextDavinci001, TextCurie001, TextBabbage001, TextAda001, Davinci, Curie, Babbage, Ada, TextSimilarityDavinci001, TextSimilarityCurie001, TextSimilarityBabbage001, TextSimilarityAda001, TextSearchDavinciDoc001, TextSearchCurieDoc001, TextSearchAdaDoc001, TextSearchBabbageDoc001, CodeSearchBabbageCode001, CodeSearchAdaCode001: return Get(R50kBase) case TextDavinciEdit001, CodeDavinciEdit001: return Get(P50kEdit) case GPT2: return Get(GPT2Enc) default: for prefix, enc := range modelPrefixToEncoding { if strings.HasPrefix(string(model), string(prefix)) { return Get(enc) } } return nil, ErrModelNotSupported } }