113 lines
5.9 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package tokenizer
import (
"errors"
"fmt"
"sync"
"github.com/tiktoken-go/tokenizer"
)
// sampleText is the source text that prompts are generated from: the opening
// of chapter 1 of Moby Dick. init encodes it once into encodedText, and
// GeneratePrompt builds prompts by truncating or repeating that token
// sequence. The literal is runtime data: editing it changes the tokens that
// every generated prompt is made of.
const sampleText = `Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking peoples hats off—then, I account it high time to get to sea as soon as I can. This is my substitute for pistol and ball. With a philosophical flourish Cato throws himself upon his sword; I quietly take to the ship. There is nothing surprising in this. If they but knew it, almost all men in their degree, some time or other, cherish very nearly the same feelings towards the ocean with me. There now is your insular city of the Manhattoes, belted round by wharves as Indian isles by coral reefs—commerce surrounds it with her surf. Right and left, the streets take you waterward. Its extreme downtown is the battery, where that noble mole is washed by waves, and cooled by breezes, which a few hours previous were out of sight of land. Look at the crowds of water-gazers there. Circumambulate the city of a dreamy Sabbath afternoon. Go from Corlears Hook to Coenties Slip, and from thence, by Whitehall, northward. What do you see?—Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries. Some leaning against the spiles; some seated upon the pier-heads; some looking over the bulwarks of ships from China; some high aloft in the rigging, as if striving to get a still better seaward peep. 
But these are all landsmen; of week days pent up in lath and plaster—tied to counters, nailed to benches, worried by ledgers. How then is this? Are the green fields gone? What do they here? But look! here come more crowds, pacing straight for the water, and seemingly bound for a dive. Strange! Nothing will content them but the extremest limit of the land; loitering under the shady lee of yonder warehouses will not suffice. No. They must get just as nigh the water as they possibly can without falling in. And there they stand—miles of them—leagues. Inlanders all, they come from lanes and alleys, streets and avenues—north, east, south, and west. Yet here they all unite. Tell me, does the magnetic virtue of the needles of the compasses of all those ships attract them thither?`
// Package-level tokenizer state. codec, encodedText and initErr are written
// only by init; the exported functions read them afterwards.
var (
// codec is the tokenizer obtained in init from tokenizer.Get(tokenizer.Cl100kBase).
codec tokenizer.Codec
// encodedText holds the token IDs produced by encoding sampleText in init.
encodedText []uint
// initErr is the wrapped error recorded by init, or nil if init succeeded.
initErr error
// codecMutex is held around the codec.Decode call in GeneratePrompt and the
// codec.Encode call in CountTokensInText.
codecMutex sync.Mutex // Mutex to protect concurrent access to the codec
)
// init obtains the tokenizer.Cl100kBase codec and encodes sampleText once at
// package load. Failures do not abort the program: they are wrapped and
// stored in initErr so the exported functions can report them to callers.
func init() {
	c, err := tokenizer.Get(tokenizer.Cl100kBase)
	// Publish whatever Get returned, even on failure, so the package-level
	// codec always mirrors the result of the lookup.
	codec = c
	if err != nil {
		initErr = fmt.Errorf("failed to get tokenizer codec: %w", err)
		return
	}

	tokens, _, err := c.Encode(sampleText)
	if err != nil {
		initErr = fmt.Errorf("failed to encode sample text: %w", err)
		return
	}
	encodedText = tokens
}
// GeneratePrompt generates a text prompt with approximately the target number
// of tokens.
//
// The prompt is decoded from exactly targetTokenCount token IDs taken from the
// pre-encoded sample text: the first targetTokenCount tokens when the sample
// is long enough, otherwise the sample sequence repeated end to end with the
// final repetition cut short.
//
// An error is returned when package initialization failed, when the codec is
// nil, when targetTokenCount is not positive, when the sample text encoded to
// zero tokens, or when decoding the selected tokens fails.
func GeneratePrompt(targetTokenCount int) (string, error) {
	// Surface any failure recorded by init before touching the codec.
	if initErr != nil {
		return "", fmt.Errorf("tokenizer initialization failed: %w", initErr)
	}
	if codec == nil {
		// Normally covered by the initErr check above; kept as a second guard.
		return "", errors.New("tokenizer codec is not initialized")
	}
	if targetTokenCount <= 0 {
		return "", errors.New("target token count must be positive")
	}
	if len(encodedText) == 0 {
		// With no sample tokens the repeat loop below could never make
		// progress and would spin forever, so fail explicitly instead.
		return "", errors.New("sample text produced no tokens")
	}

	var selectedTokens []uint
	if targetTokenCount <= len(encodedText) {
		// The sample is long enough: take its first N tokens.
		selectedTokens = encodedText[:targetTokenCount]
	} else {
		// Repeat the sample until the target is met, trimming the last copy.
		selectedTokens = make([]uint, 0, targetTokenCount)
		for len(selectedTokens) < targetTokenCount {
			chunk := encodedText
			if remaining := targetTokenCount - len(selectedTokens); remaining < len(chunk) {
				chunk = chunk[:remaining]
			}
			selectedTokens = append(selectedTokens, chunk...)
		}
	}

	// Serialize access to the shared codec; the deferred unlock also runs on
	// the error path.
	codecMutex.Lock()
	defer codecMutex.Unlock()

	prompt, err := codec.Decode(selectedTokens)
	if err != nil {
		return "", fmt.Errorf("failed to decode %d selected tokens: %w", len(selectedTokens), err)
	}
	return prompt, nil
}
// CountTokensInText reports how many tokens the package codec produces when
// encoding text.
//
// It returns 0 and an error when package initialization failed, when the
// codec is nil, or when encoding text fails.
func CountTokensInText(text string) (int, error) {
	switch {
	case initErr != nil:
		return 0, fmt.Errorf("tokenizer initialization failed: %w", initErr)
	case codec == nil:
		return 0, errors.New("tokenizer codec is not initialized")
	}

	// Encode goes through codecMutex, the same lock GeneratePrompt holds
	// around Decode; the deferred unlock also covers the error return.
	codecMutex.Lock()
	defer codecMutex.Unlock()

	tokenIDs, _, encodeErr := codec.Encode(text)
	if encodeErr != nil {
		return 0, fmt.Errorf("failed to encode text for token counting: %w", encodeErr)
	}
	return len(tokenIDs), nil
}
// CheckInitStatus returns any error that occurred during package initialization.
// The value is initErr unchanged: nil when init obtained the codec and encoded
// the sample text, otherwise the wrapped error that init recorded.
func CheckInitStatus() error {
return initErr
}