// 118630931025 — update 0.59.6
// Commit e537e58b, created on 2025-04-29 (commit history)
package tokenizer

// Manual smoke test for HuggingfaceTokenizer.
//
// Loads the tokenizer vocabulary from a hard-coded tokenizer.json path and
// prints the results of three scenarios to stdout:
//   test1 - encode a mixed Chinese/English string, then decode it back.
//   test2 - the same text with "<|im_end|>" appended, decoded both with and
//           without skip_special_tokens.
//   test3 - render a system + user conversation through apply_chat_template.
//
// Always returns 0; nothing is asserted, the output is meant to be inspected
// by eye.
main(): Int64 {
    // NOTE(review): the path is relative, so the program presumably has to be
    // started from the directory that contains ./download - confirm.
    let vocab_path = "./download/Qwen2-0.5B-Instruct/tokenizer.json"
    let system_prompt = "You are a helpful assistant."
    let tokenizer = HuggingfaceTokenizer()
    tokenizer.load_vocab(file_path: vocab_path)

    // -- test1: plain-text encode/decode round trip -- //
    println("===== test1 ===== ")
    let raw_text = "世界你好,hello world!"
    let encode_tokens1 = tokenizer.encode(raw_text)
    println("encode_tokens: ${encode_tokens1}")
    let decode_str1 = tokenizer.decode(encode_tokens1, skip_special_tokens: false)
    println("decode_str: ${decode_str1}")
    println("===== ===== ===== ")

    // -- test2: text followed by "<|im_end|>" -- //
    // The same token sequence is decoded twice so that the effect of
    // skip_special_tokens can be compared side by side.
    println("===== test2 ===== ")
    let raw_text2 = raw_text + "<|im_end|>"
    let encode_tokens2 = tokenizer.encode(raw_text2)
    println("encode_tokens: ${encode_tokens2}")
    let decode_str2 = tokenizer.decode(encode_tokens2, skip_special_tokens: false)
    println("decode_str(with special)    : ${decode_str2}")
    let decode_str3 = tokenizer.decode(encode_tokens2, skip_special_tokens: true)
    println("decode_str(without special) : ${decode_str3}")
    println("===== ===== ===== ")

    // -- test3: chat template rendering -- //
    println("===== test3 ===== ")
    let messages = ArrayList<Message>([
        Message(RoleType.System, system_prompt),
        Message(RoleType.User, raw_text)
    ])
    let new_text = tokenizer.apply_chat_template(messages, add_generation_prompt: true)
    println("new_text:\n ${new_text}")
    return 0
}