/**
 * Print the tokens of a tokenizer dump held in the clipboard, 20 per row.
 *
 * Reads the clipboard text, parses it as JSON, and takes the `tokens` array of
 * the first element. Word-break markers are turned into plain spaces, then
 * each row is logged as `<index of the row's first token> <joined tokens>`.
 *
 * @returns {Promise<void>} Resolves once every row has been logged.
 * @throws {SyntaxError} If the clipboard text is not valid JSON.
 * @throws {TypeError} If the parsed value has no `[0].tokens` array.
 */
async function printTokens() {
    const tokensPerLine = 20;
    const clipboardText = await navigator.clipboard.readText();
    // clip uses </w> for a word break, T5 uses ▁, GLM uses Ġ.
    // All three are normalised here; previously ▁ was only handled at the very
    // start of a row, so T5 output kept raw ▁ markers between words.
    const tokens = JSON.parse(clipboardText)[0].tokens.map((token) => token.replace(/(?:<\/w>|▁|Ġ)/g, ' '));
    for (let index = 0; index < tokens.length; index += tokensPerLine) {
        const row = tokens.slice(index, index + tokensPerLine).join('');
        // Collapse a literal ‧ that sits next to whitespace into a single space.
        console.log(`${index} ${row.replace(/(?:\s‧|‧\s)/g, ' ')}`);
    }
}
  • Or do all the tokenizing in the browser: https://sd-tokenizer.rocker.boo/

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment