internal/tokenizer/tokenizer_test.go

//
//  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
//

package tokenizer

import (
	"strings"
	"testing"
)

// saveEngineType snapshots the package-level engineTypeProvider and hands back
// a closure that reinstates that snapshot. A test that swaps the provider
// should call this first and run the returned closure when it finishes, so the
// change does not leak into other tests.
func saveEngineType() func() {
	saved := engineTypeProvider
	restore := func() {
		engineTypeProvider = saved
	}
	return restore
}

// ---------------------------------------------------------------------------
// NumTokensFromString tests
// ---------------------------------------------------------------------------

// TestNumTokensFromString_Empty checks that the empty string yields a token
// count of zero.
func TestNumTokensFromString_Empty(t *testing.T) {
	count := NumTokensFromString("")
	if count == 0 {
		return
	}
	t.Errorf("expected 0 for empty string, got %d", count)
}

// TestNumTokensFromString_Positive checks that a non-empty ASCII input and a
// non-empty Chinese input each produce a token count greater than zero.
func TestNumTokensFromString_Positive(t *testing.T) {
	samples := []string{"hello world", "你好世界"}
	for i := range samples {
		text := samples[i]
		count := NumTokensFromString(text)
		if count > 0 {
			continue
		}
		t.Errorf("NumTokensFromString(%q) = %d, want >0", text, count)
	}
}

// TestNumTokensFromString_VariedInputs runs one subtest per input category
// (several scripts, emoji, digits, punctuation, control whitespace, mixed text)
// and requires a token count greater than zero for each.
func TestNumTokensFromString_VariedInputs(t *testing.T) {
	type sample struct {
		name  string
		input string
	}
	cases := []sample{
		{name: "ascii letters", input: "hello world"},
		{name: "chinese characters", input: "你好世界"},
		{name: "japanese characters", input: "こんにちは世界"},
		{name: "korean characters", input: "안녕하세요세계"},
		{name: "emoji", input: "👋 hello 🌍"},
		{name: "numbers only", input: "1234567890"},
		{name: "special chars", input: "a+b=c; d!=e"},
		{name: "newlines and tabs", input: "line1\nline2\tindented"},
		{name: "mixed content", input: "RAGFlow 是一款 开源的 RAG (Retrieval-Augmented Generation) 引擎"},
	}
	for _, tc := range cases {
		// Copy the input so the subtest closure does not depend on the loop variable.
		text := tc.input
		t.Run(tc.name, func(t *testing.T) {
			if count := NumTokensFromString(text); count <= 0 {
				t.Errorf("NumTokensFromString(%q) = %d, want >0", text, count)
			}
		})
	}
}

// TestNumTokensFromString_Consistency calls NumTokensFromString twice on the
// same input and requires both calls to report the same count.
func TestNumTokensFromString_Consistency(t *testing.T) {
	for _, text := range []string{"hello world", "你好世界", "a+b=c; d!=e"} {
		a := NumTokensFromString(text)
		b := NumTokensFromString(text)
		if a == b {
			continue
		}
		t.Errorf("NumTokensFromString(%q) is not consistent: %d vs %d", text, a, b)
	}
}

// TestNumTokensFromString_LongString feeds a sentence repeated 200 times and
// requires a token count greater than zero.
func TestNumTokensFromString_LongString(t *testing.T) {
	const sentence = "the quick brown fox jumps over the lazy dog. "
	if count := NumTokensFromString(strings.Repeat(sentence, 200)); count <= 0 {
		t.Errorf("NumTokensFromString(long_string) = %d, want >0", count)
	}
}

// TestNumTokensFromString_WhitespaceOnly exercises whitespace-only inputs.
//
// A BPE encoder is expected to emit tokens for whitespace, but whether it does
// depends on the encoder behind NumTokensFromString, so a zero count is only
// logged rather than treated as a failure. A negative count can never be a
// valid token count and fails the test.
func TestNumTokensFromString_WhitespaceOnly(t *testing.T) {
	for _, s := range []string{" ", "\t", "\n", "   "} {
		got := NumTokensFromString(s)
		switch {
		case got < 0:
			t.Errorf("NumTokensFromString(%q) = %d, want >=0", s, got)
		case got == 0:
			// Informational only: surfaces the zero count in verbose test output.
			t.Logf("NumTokensFromString(%q) = %d", s, got)
		}
	}
}

// ---------------------------------------------------------------------------
// RegisterEngineType tests
// ---------------------------------------------------------------------------

// TestRegisterEngineType_Basic registers a provider and checks that
// engineTypeProvider then returns the registered value. The previous provider
// is put back when the test returns.
func TestRegisterEngineType_Basic(t *testing.T) {
	// saveEngineType runs now (taking the snapshot); its result runs on return.
	defer saveEngineType()()

	const want = "infinity"
	RegisterEngineType(func() string { return want })

	got := engineTypeProvider()
	if got != want {
		t.Errorf("expected 'infinity', got %q", got)
	}
}

// TestRegisterEngineType_Overwrite registers two providers in a row and checks
// that the later registration is the one engineTypeProvider reflects. The
// previous provider is put back when the test returns.
func TestRegisterEngineType_Overwrite(t *testing.T) {
	// saveEngineType runs now (taking the snapshot); its result runs on return.
	defer saveEngineType()()

	for _, name := range []string{"first", "second"} {
		name := name // per-iteration copy for the closure below
		RegisterEngineType(func() string { return name })
	}

	got := engineTypeProvider()
	if got != "second" {
		t.Errorf("expected 'second', got %q", got)
	}
}

// ---------------------------------------------------------------------------
// Tokenize tests
// ---------------------------------------------------------------------------

// TestTokenize_InfinityEngine sets the engine type to "infinity" and expects
// Tokenize to return every input unchanged and without an error. The previous
// provider is put back when the test returns.
func TestTokenize_InfinityEngine(t *testing.T) {
	defer saveEngineType()()
	RegisterEngineType(func() string { return "infinity" })

	for _, text := range []string{"hello world", "你好 世界", "", "a single word"} {
		out, err := Tokenize(text)
		if err != nil {
			t.Errorf("Tokenize(%q) unexpected error: %v", text, err)
		}
		// The output is compared even after an error, so both problems get reported.
		if out == text {
			continue
		}
		t.Errorf("Tokenize(%q) = %q, want %q", text, out, text)
	}
}

// TestTokenize_PoolNotInitialized expects Tokenize to return an error when the
// engine type is not "infinity" and no pool has been set up. The previous
// provider is put back when the test returns.
func TestTokenize_PoolNotInitialized(t *testing.T) {
	defer saveEngineType()()
	// An empty engine type is not "infinity", so Tokenize takes the pool path.
	RegisterEngineType(func() string { return "" })

	if _, err := Tokenize("hello world"); err == nil {
		t.Error("expected error when pool is not initialized, got nil")
	}
}

// ---------------------------------------------------------------------------
// FineGrainedTokenize tests
// ---------------------------------------------------------------------------

// TestFineGrainedTokenize_InfinityEngine sets the engine type to "infinity"
// and expects FineGrainedTokenize to return every input unchanged and without
// an error. The previous provider is put back when the test returns.
func TestFineGrainedTokenize_InfinityEngine(t *testing.T) {
	defer saveEngineType()()
	RegisterEngineType(func() string { return "infinity" })

	for _, text := range []string{"hello world", "测试 分词", ""} {
		out, err := FineGrainedTokenize(text)
		if err != nil {
			t.Errorf("FineGrainedTokenize(%q) unexpected error: %v", text, err)
		}
		// The output is compared even after an error, so both problems get reported.
		if out == text {
			continue
		}
		t.Errorf("FineGrainedTokenize(%q) = %q, want %q", text, out, text)
	}
}

// TestFineGrainedTokenize_PoolNotInitialized expects FineGrainedTokenize to
// return an error when the engine type is not "infinity" and no pool has been
// set up. The previous provider is put back when the test returns.
func TestFineGrainedTokenize_PoolNotInitialized(t *testing.T) {
	defer saveEngineType()()
	RegisterEngineType(func() string { return "" })

	if _, err := FineGrainedTokenize("hello world"); err == nil {
		t.Error("expected error when pool is not initialized, got nil")
	}
}

// ---------------------------------------------------------------------------
// Error-path tests for functions that require the pool
// ---------------------------------------------------------------------------

// TestTokenizeWithPosition_PoolNotInitialized expects TokenizeWithPosition to
// return an error. It relies on the pool not having been initialized when the
// test runs.
func TestTokenizeWithPosition_PoolNotInitialized(t *testing.T) {
	if _, err := TokenizeWithPosition("hello world"); err == nil {
		t.Error("expected error when pool is not initialized, got nil")
	}
}

// TestAnalyze_PoolNotInitialized expects Analyze to return an error. It relies
// on the pool not having been initialized when the test runs.
func TestAnalyze_PoolNotInitialized(t *testing.T) {
	if _, err := Analyze("hello world"); err == nil {
		t.Error("expected error when pool is not initialized, got nil")
	}
}

// TestGetTermFreq_PoolNotInitialized expects GetTermFreq to report 0. It relies
// on the pool not having been initialized when the test runs.
func TestGetTermFreq_PoolNotInitialized(t *testing.T) {
	freq := GetTermFreq("hello")
	if freq == 0 {
		return
	}
	t.Errorf("expected 0 when pool is not initialized, got %d", freq)
}

// TestGetTermTag_PoolNotInitialized expects GetTermTag to report an empty tag.
// It relies on the pool not having been initialized when the test runs.
func TestGetTermTag_PoolNotInitialized(t *testing.T) {
	tag := GetTermTag("hello")
	if tag == "" {
		return
	}
	t.Errorf("expected empty string when pool is not initialized, got %q", tag)
}

// ---------------------------------------------------------------------------
// Global state tests
// ---------------------------------------------------------------------------

// TestGetPoolStats_Nil checks the stats reported while no pool exists: the
// result must be non-nil, must carry an "initialized" entry, and that entry
// must be the boolean false.
//
// It relies on the pool not having been initialized when the test runs.
func TestGetPoolStats_Nil(t *testing.T) {
	stats := GetPoolStats()
	if stats == nil {
		t.Fatal("GetPoolStats returned nil")
	}
	raw, ok := stats["initialized"]
	if !ok {
		t.Fatal("missing 'initialized' key")
	}
	// Use the comma-ok form so an unexpected value type is reported as a test
	// failure rather than panicking.
	initialized, isBool := raw.(bool)
	if !isBool {
		t.Fatalf("'initialized' has type %T, want bool", raw)
	}
	if initialized {
		t.Error("expected initialized=false when pool is nil")
	}
}

// TestIsInitialized_Default expects IsInitialized to report false. It relies on
// the pool not having been initialized when the test runs.
func TestIsInitialized_Default(t *testing.T) {
	initialized := IsInitialized()
	if !initialized {
		return
	}
	t.Error("expected IsInitialized() = false when pool is not initialized")
}

// TestClose_Nil calls Close once and makes no assertions: the test passes as
// long as the call returns without panicking. It is meant to run while no pool
// has been initialized.
func TestClose_Nil(t *testing.T) {
	Close()
}

// TestClose_NilGlobalPool calls Close and converts any panic into an ordinary
// test failure. The test does not reset the pool itself, so it covers whatever
// pool state exists when it runs.
func TestClose_NilGlobalPool(t *testing.T) {
	defer func() {
		p := recover()
		if p == nil {
			return
		}
		t.Errorf("Close() panicked: %v", p)
	}()
	Close()
}
Implement OpenAI chat completions in GO (#16177) ### What problem does this PR solve? Implement OpenAI chat completions in GO POST /api/v1/openai/<chat_id>/chat/completions OpenAI chat cli: internal/development.md ### Type of change - [x] Refactoring 2026-06-18 18:07:27 +08:00			`//`
			`// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`//`

			`package tokenizer`

			`import (`
			`"strings"`
			`"testing"`
			`)`

			`// saveEngineType saves the current engineTypeProvider and returns a function`
			`// to restore it. Use this when a test modifies the engine type to avoid`
			`// leaking global state between tests.`
			`func saveEngineType() func() {`
			`original := engineTypeProvider`
			`return func() { engineTypeProvider = original }`
			`}`

			`// ---------------------------------------------------------------------------`
			`// NumTokensFromString tests`
			`// ---------------------------------------------------------------------------`

			`func TestNumTokensFromString_Empty(t *testing.T) {`
			`if got := NumTokensFromString(""); got != 0 {`
			`t.Errorf("expected 0 for empty string, got %d", got)`
			`}`
			`}`

			`func TestNumTokensFromString_Positive(t *testing.T) {`
			`for _, s := range []string{"hello world", "你好世界"} {`
			`if got := NumTokensFromString(s); got <= 0 {`
			`t.Errorf("NumTokensFromString(%q) = %d, want >0", s, got)`
			`}`
			`}`
			`}`

			`func TestNumTokensFromString_VariedInputs(t *testing.T) {`
			`tests := []struct {`
			`name string`
			`input string`
			`}{`
			`{"ascii letters", "hello world"},`
			`{"chinese characters", "你好世界"},`
			`{"japanese characters", "こんにちは世界"},`
			`{"korean characters", "안녕하세요세계"},`
			`{"emoji", "👋 hello 🌍"},`
			`{"numbers only", "1234567890"},`
			`{"special chars", "a+b=c; d!=e"},`
			`{"newlines and tabs", "line1\nline2\tindented"},`
			`{"mixed content", "RAGFlow 是一款开源的 RAG (Retrieval-Augmented Generation) 引擎"},`
			`}`
			`for _, tt := range tests {`
			`t.Run(tt.name, func(t *testing.T) {`
			`got := NumTokensFromString(tt.input)`
			`if got <= 0 {`
			`t.Errorf("NumTokensFromString(%q) = %d, want >0", tt.input, got)`
			`}`
			`})`
			`}`
			`}`

			`func TestNumTokensFromString_Consistency(t *testing.T) {`
			`inputs := []string{"hello world", "你好世界", "a+b=c; d!=e"}`
			`for _, s := range inputs {`
			`first := NumTokensFromString(s)`
			`second := NumTokensFromString(s)`
			`if first != second {`
			`t.Errorf("NumTokensFromString(%q) is not consistent: %d vs %d", s, first, second)`
			`}`
			`}`
			`}`

			`func TestNumTokensFromString_LongString(t *testing.T) {`
			`long := strings.Repeat("the quick brown fox jumps over the lazy dog. ", 200)`
			`got := NumTokensFromString(long)`
			`if got <= 0 {`
			`t.Errorf("NumTokensFromString(long_string) = %d, want >0", got)`
			`}`
			`}`

			`func TestNumTokensFromString_WhitespaceOnly(t *testing.T) {`
			`for _, s := range []string{" ", "\t", "\n", " "} {`
			`got := NumTokensFromString(s)`
			`// Whitespace strings should still produce tokens in BPE encoding`
			`if got == 0 {`
			`t.Logf("NumTokensFromString(%q) = %d", s, got)`
			`}`
			`}`
			`}`

			`// ---------------------------------------------------------------------------`
			`// RegisterEngineType tests`
			`// ---------------------------------------------------------------------------`

			`func TestRegisterEngineType_Basic(t *testing.T) {`
			`restore := saveEngineType()`
			`defer restore()`

			`RegisterEngineType(func() string { return "infinity" })`
			`if got := engineTypeProvider(); got != "infinity" {`
			`t.Errorf("expected 'infinity', got %q", got)`
			`}`
			`}`

			`func TestRegisterEngineType_Overwrite(t *testing.T) {`
			`restore := saveEngineType()`
			`defer restore()`

			`RegisterEngineType(func() string { return "first" })`
			`RegisterEngineType(func() string { return "second" })`
			`if got := engineTypeProvider(); got != "second" {`
			`t.Errorf("expected 'second', got %q", got)`
			`}`
			`}`

			`// ---------------------------------------------------------------------------`
			`// Tokenize tests`
			`// ---------------------------------------------------------------------------`

			`func TestTokenize_InfinityEngine(t *testing.T) {`
			`restore := saveEngineType()`
			`defer restore()`
			`RegisterEngineType(func() string { return "infinity" })`

			`inputs := []string{"hello world", "你好世界", "", "a single word"}`
			`for _, input := range inputs {`
			`got, err := Tokenize(input)`
			`if err != nil {`
			`t.Errorf("Tokenize(%q) unexpected error: %v", input, err)`
			`}`
			`if got != input {`
			`t.Errorf("Tokenize(%q) = %q, want %q", input, got, input)`
			`}`
			`}`
			`}`

			`func TestTokenize_PoolNotInitialized(t *testing.T) {`
			`restore := saveEngineType()`
			`defer restore()`
			`// Ensure engine type is not "infinity" so we hit the pool path`
			`RegisterEngineType(func() string { return "" })`

			`_, err := Tokenize("hello world")`
			`if err == nil {`
			`t.Error("expected error when pool is not initialized, got nil")`
			`}`
			`}`

			`// ---------------------------------------------------------------------------`
			`// FineGrainedTokenize tests`
			`// ---------------------------------------------------------------------------`

			`func TestFineGrainedTokenize_InfinityEngine(t *testing.T) {`
			`restore := saveEngineType()`
			`defer restore()`
			`RegisterEngineType(func() string { return "infinity" })`

			`inputs := []string{"hello world", "测试分词", ""}`
			`for _, input := range inputs {`
			`got, err := FineGrainedTokenize(input)`
			`if err != nil {`
			`t.Errorf("FineGrainedTokenize(%q) unexpected error: %v", input, err)`
			`}`
			`if got != input {`
			`t.Errorf("FineGrainedTokenize(%q) = %q, want %q", input, got, input)`
			`}`
			`}`
			`}`

			`func TestFineGrainedTokenize_PoolNotInitialized(t *testing.T) {`
			`restore := saveEngineType()`
			`defer restore()`
			`RegisterEngineType(func() string { return "" })`

			`_, err := FineGrainedTokenize("hello world")`
			`if err == nil {`
			`t.Error("expected error when pool is not initialized, got nil")`
			`}`
			`}`

			`// ---------------------------------------------------------------------------`
			`// Error-path tests for functions that require the pool`
			`// ---------------------------------------------------------------------------`

			`func TestTokenizeWithPosition_PoolNotInitialized(t *testing.T) {`
			`_, err := TokenizeWithPosition("hello world")`
			`if err == nil {`
			`t.Error("expected error when pool is not initialized, got nil")`
			`}`
			`}`

			`func TestAnalyze_PoolNotInitialized(t *testing.T) {`
			`_, err := Analyze("hello world")`
			`if err == nil {`
			`t.Error("expected error when pool is not initialized, got nil")`
			`}`
			`}`

			`func TestGetTermFreq_PoolNotInitialized(t *testing.T) {`
			`got := GetTermFreq("hello")`
			`if got != 0 {`
			`t.Errorf("expected 0 when pool is not initialized, got %d", got)`
			`}`
			`}`

			`func TestGetTermTag_PoolNotInitialized(t *testing.T) {`
			`got := GetTermTag("hello")`
			`if got != "" {`
			`t.Errorf("expected empty string when pool is not initialized, got %q", got)`
			`}`
			`}`

			`// ---------------------------------------------------------------------------`
			`// Global state tests`
			`// ---------------------------------------------------------------------------`

			`func TestGetPoolStats_Nil(t *testing.T) {`
			`// Note: globalPool is nil by default in unit tests (pool not initialized)`
			`stats := GetPoolStats()`
			`if stats == nil {`
			`t.Fatal("GetPoolStats returned nil")`
			`}`
			`init, ok := stats["initialized"]`
			`if !ok {`
			`t.Fatal("missing 'initialized' key")`
			`}`
			`if init.(bool) {`
			`t.Error("expected initialized=false when pool is nil")`
			`}`
			`}`

			`func TestIsInitialized_Default(t *testing.T) {`
			`if IsInitialized() {`
			`t.Error("expected IsInitialized() = false when pool is not initialized")`
			`}`
			`}`

			`func TestClose_Nil(t *testing.T) {`
			`// Close should be safe to call with nil globalPool`
			`Close() // no panic = pass`
			`}`

			`func TestClose_NilGlobalPool(t *testing.T) {`
			`// Call Close directly after ensuring globalPool is nil`
			`// (concurrent test may have initialized it, so handle gracefully)`
			`defer func() {`
			`if r := recover(); r != nil {`
			`t.Errorf("Close() panicked: %v", r)`
			`}`
			`}()`
			`Close()`
			`}`