mirror of
https://github.com/vegu-ai/talemate.git
synced 2026-05-18 05:05:39 +02:00
0.37.0 - **Director Planning** — Multi-step todo lists in director chat plus a Generate long progress action for multi-beat scene arcs. - **Auto Narration** — Unified auto-narration replacing the old Narrate after Dialogue toggle, with a chance slider and weighted action mix. - **LLM Prompt Templates Manager** — Dedicated UI tab for viewing, creating, editing, and deleting prompt templates. - **Character Folders** — Collapsible folders in the World Editor character list, synced across linked scenes. - **OpenAI Compatible TTS** — Connect any number of OpenAI-compatible TTS servers in parallel. - **KoboldCpp TTS Auto-Setup** — KoboldCpp clients with a TTS model loaded register themselves as a TTS backend. - **Model Testing Harness** — Bundled scene that runs basic capability tests against any connected LLM. Plus 27 improvements and 28 bug fixes
352 lines
12 KiB
Python
352 lines
12 KiB
Python
"""Tests for talemate.util.__init__ pure helpers.
|
|
|
|
Covers count_tokens, limit_tokens, chunk_items_by_tokens, remove_substring_names,
|
|
select_best_texts_by_keyword, clean_id, and slugify.
|
|
"""
|
|
|
|
from talemate.scene_message import SceneMessage
|
|
from talemate.util import (
|
|
chunk_items_by_tokens,
|
|
clean_id,
|
|
count_tokens,
|
|
limit_tokens,
|
|
remove_substring_names,
|
|
select_best_texts_by_keyword,
|
|
slugify,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# count_tokens
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_count_tokens_empty_string_is_zero():
    """An empty string must count as zero tokens."""
    token_total = count_tokens("")
    assert token_total == 0
|
|
|
|
|
|
def test_count_tokens_string_returns_positive_count():
    """A short string yields at least one token; a longer one yields more."""
    short_count = count_tokens("hello world")
    # A non-empty ASCII phrase must produce a positive count.
    assert short_count >= 1

    # Extending the phrase must strictly increase the count.
    long_count = count_tokens("hello world goodbye world hello again")
    assert long_count > short_count
|
|
|
|
|
|
def test_count_tokens_list_sums_components():
    """Counting a list of strings equals adding up each string's own count."""
    parts = ["alpha", "beta gamma", "delta"]

    per_item_total = 0
    for part in parts:
        per_item_total += count_tokens(part)

    assert count_tokens(parts) == per_item_total
|
|
|
|
|
|
def test_count_tokens_empty_list_is_zero():
    """An empty list must count as zero tokens."""
    nothing = []
    assert count_tokens(nothing) == 0
|
|
|
|
|
|
def test_count_tokens_scene_message_uses_str():
    """A SceneMessage counts the same as the plain text it was built from."""
    scene_message = SceneMessage(message="hello world")

    actual = count_tokens(scene_message)
    expected = count_tokens("hello world")

    assert actual == expected
|
|
|
|
|
|
def test_count_tokens_unknown_type_returns_zero():
    """Unsupported input types (int, dict, None) must count as zero tokens."""
    unsupported_values = (12345, {"a": 1}, None)
    for unsupported in unsupported_values:
        assert count_tokens(unsupported) == 0
|
|
|
|
|
|
def test_count_tokens_nested_list():
    """A list containing a nested list counts as the sum of all leaf strings."""
    nested = [["alpha", "beta"], "gamma"]

    leaf_words = ("alpha", "beta", "gamma")
    expected = sum(count_tokens(word) for word in leaf_words)

    assert count_tokens(nested) == expected
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# limit_tokens
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_limit_tokens_under_limit_returns_input_unchanged():
    """Text well inside the token limit is returned exactly as given."""
    text = "line one\nline two\nline three"
    generous_limit = 1000  # far more than three short lines need

    limited = limit_tokens(text, generous_limit)

    assert limited == text
|
|
|
|
|
|
def test_limit_tokens_drops_trailing_lines_until_under_limit():
    """Over-budget text is shortened from the end until it fits the limit."""
    lines = []
    for index in range(20):
        lines.append(f"sentence number {index} in this paragraph")
    text = "\n".join(lines)

    # Budget is half of the list-based token count of all lines.
    budget = count_tokens(lines) // 2

    trimmed = limit_tokens(text, budget)
    kept = trimmed.split("\n")

    # The lines that remain must fit the budget (counted the same list-based way).
    assert count_tokens(kept) <= budget
    # Only the tail may be removed, so the result is a prefix of the input.
    assert text.startswith(trimmed)
    # The opening line survives.
    assert kept[0] == lines[0]
    # At least one line was actually removed.
    assert len(kept) < len(lines)
|
|
|
|
|
|
def test_limit_tokens_returns_empty_when_first_line_exceeds_limit():
    """With a zero-token limit no line fits, so the result is empty."""
    text = "this is a fairly long line of words\nanother line"
    zero_budget = 0  # nothing can fit, not even the first line

    assert limit_tokens(text, zero_budget) == ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# chunk_items_by_tokens
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_chunk_items_by_tokens_groups_items_under_limit():
    """Several small items that fit the limit together form a single chunk."""
    items = ["one", "two", "three", "four"]

    # The limit is generous compared with four one-word items.
    packed = [*chunk_items_by_tokens(items, max_tokens=100)]

    assert packed == [items]
|
|
|
|
|
|
def test_chunk_items_by_tokens_starts_new_chunk_when_full():
    """A new chunk is opened once the next item would overflow the limit."""
    items = ["alpha beta", "gamma delta", "epsilon zeta"]

    # Budget equals the token count of the largest single item.
    max_tokens = max(map(count_tokens, items))

    chunks = [*chunk_items_by_tokens(items, max_tokens=max_tokens)]

    # All items are present, in order, once the chunks are concatenated.
    flattened = []
    for chunk in chunks:
        flattened.extend(chunk)
    assert flattened == items

    # Every chunk respects the budget.
    assert all(count_tokens(chunk) <= max_tokens for chunk in chunks)

    # The items cannot all share one chunk.
    assert len(chunks) >= 2
|
|
|
|
|
|
def test_chunk_items_by_tokens_oversized_item_yielded_alone():
    """An item bigger than max_tokens is emitted in a chunk by itself."""
    long_text = " ".join("word" for _ in range(200))  # far above the limit
    items = ["small one", long_text, "small two"]

    chunks = [*chunk_items_by_tokens(items, max_tokens=5)]

    # Exactly one chunk consists solely of the oversized item.
    solo_chunk_count = sum(1 for chunk in chunks if chunk == [long_text])
    assert solo_chunk_count == 1

    # Nothing is lost or reordered.
    flattened = []
    for chunk in chunks:
        flattened.extend(chunk)
    assert flattened == items
|
|
|
|
|
|
def test_chunk_items_by_tokens_filters_empty_when_filter_empty_true():
    """By default, empty, whitespace-only and None items are dropped."""
    items = ["one", "", " ", "two", None]

    packed = [*chunk_items_by_tokens(items, max_tokens=100)]

    # Only the two real words remain, in a single chunk.
    assert packed == [["one", "two"]]
|
|
|
|
|
|
def test_chunk_items_by_tokens_keeps_empty_when_filter_empty_false():
    """With filter_empty=False, empty items are passed through untouched."""
    items = ["one", "", "two"]

    flattened = []
    for chunk in chunk_items_by_tokens(items, max_tokens=100, filter_empty=False):
        flattened.extend(chunk)

    assert flattened == items
|
|
|
|
|
|
def test_chunk_items_by_tokens_empty_input_yields_nothing():
    """No chunks are produced for empty input or input that filters to nothing."""
    from_empty = [*chunk_items_by_tokens([], max_tokens=100)]
    assert from_empty == []

    # Every entry here is removed by the default empty-item filter.
    from_blanks = [*chunk_items_by_tokens(["", " ", None], max_tokens=100)]
    assert from_blanks == []
|
|
|
|
|
|
def test_chunk_items_by_tokens_custom_count_fn():
    """A caller-supplied count_fn replaces the default token counter."""

    def flat_cost(_):
        # Every item costs exactly one unit regardless of content.
        return 1

    items = ["a", "bb", "ccc", "dddd"]

    chunks = [*chunk_items_by_tokens(items, max_tokens=2, count_fn=flat_cost)]

    # A budget of two with unit cost pairs the items up.
    assert chunks == [["a", "bb"], ["ccc", "dddd"]]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# remove_substring_names
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_remove_substring_names_empty_input():
    """An empty list of names yields an empty list."""
    cleaned = remove_substring_names([])
    assert cleaned == []
|
|
|
|
|
|
def test_remove_substring_names_drops_substring_of_longer_name():
    """A short name that is a whole word of a longer name is removed."""
    names = ["julia", "julia smith"]

    cleaned = remove_substring_names(names)

    assert cleaned == ["julia smith"]
|
|
|
|
|
|
def test_remove_substring_names_preserves_original_order():
    """Names that survive keep the relative order they had in the input."""
    names = ["julia smith", "bob", "julia"]

    cleaned = remove_substring_names(names)

    # "julia" is removed; the other two stay in their input order.
    assert cleaned == ["julia smith", "bob"]
|
|
|
|
|
|
def test_remove_substring_names_does_not_match_partial_words():
    """A name that is only a fragment of another word ('jul' in 'julia') is kept."""
    names = ["jul", "julia"]

    result = remove_substring_names(names)

    # Both names must survive because 'jul' is not a whole word of 'julia'.
    for expected_name in ("jul", "julia"):
        assert expected_name in result
|
|
|
|
|
|
def test_remove_substring_names_case_insensitive():
    """Whole-word matching ignores letter case."""
    names = ["JULIA", "Julia Smith"]

    # 'JULIA' matches the word 'Julia' in the longer name despite the case.
    cleaned = remove_substring_names(names)

    assert cleaned == ["Julia Smith"]
|
|
|
|
|
|
def test_remove_substring_names_skips_blank_entries():
    """Blank entries are excluded and normal substring removal still applies."""
    names = [" ", "alice", "alice cooper"]

    cleaned = remove_substring_names(names)

    assert "alice" not in cleaned
    assert "alice cooper" in cleaned
    assert " " not in cleaned
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# select_best_texts_by_keyword
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_select_best_texts_by_keyword_empty_returns_empty():
    """With no texts to choose from, the selection is empty."""
    selection = select_best_texts_by_keyword([], "anything", 100)
    assert selection == []
|
|
|
|
|
|
def test_select_best_texts_by_keyword_no_keyword_returns_input():
    """An empty keyword returns the texts as they were passed in."""
    texts = ["one", "two"]

    selection = select_best_texts_by_keyword(texts, "", 100)

    assert selection == texts
|
|
|
|
|
|
def test_select_best_texts_by_keyword_filters_texts_without_keyword():
    """Texts that never mention the keyword are excluded from the result."""
    texts = [
        "Alice walked into the room.",
        "The weather is nice today.",
        "Alice loves to read books.",
    ]

    selection = select_best_texts_by_keyword(texts, "alice", max_token_length=200)

    # Every selected text mentions the keyword (case-insensitively).
    assert all("alice" in chosen.lower() for chosen in selection)
    # Exactly the two "Alice" sentences are selected.
    assert len(selection) == 2
|
|
|
|
|
|
def test_select_best_texts_by_keyword_orders_by_score_desc():
    """Texts are returned with the most keyword occurrences first."""
    once = "Alice mentioned once."
    thrice = "Alice and Alice and Alice all spoke."  # 3 occurrences
    twice = "Alice went to Alice's house."  # 2 occurrences
    texts = [once, thrice, twice]

    result = select_best_texts_by_keyword(texts, "alice", max_token_length=1000)

    # Expected ranking: three occurrences, then two, then one.
    expected_ranking = (thrice, twice, once)
    for position, expected_text in enumerate(expected_ranking):
        assert result[position] == expected_text
|
|
|
|
|
|
def test_select_best_texts_by_keyword_respects_token_budget():
    """A tight token budget selects some, but not all, of the matching texts."""
    texts = []
    for index in range(50):
        texts.append(f"alice {index} alice")

    # The budget is too small for all 50 texts, so only some may be chosen.
    result = select_best_texts_by_keyword(
        texts,
        "alice",
        max_token_length=20,
        chunk_size_ratio=1.0,
    )

    selected_count = len(result)
    assert 0 < selected_count < 50
|
|
|
|
|
|
def test_select_best_texts_by_keyword_whole_word_match_only():
    """A text with a whole-word keyword is selected; a text without it is not."""
    with_keyword = "alicewonder is not alice"  # one whole-word "alice"
    without_keyword = "the cat is here"  # none
    texts = [with_keyword, without_keyword]

    selection = select_best_texts_by_keyword(texts, "alice", max_token_length=200)

    assert selection == ["alicewonder is not alice"]
|
|
|
|
|
|
def test_select_best_texts_by_keyword_skips_blank_texts():
    """Empty and whitespace-only texts never appear in the selection."""
    # Only string blanks are used here; None entries are deliberately not tested.
    texts_clean = ["", " ", "alice spoke"]

    selection = select_best_texts_by_keyword(
        texts_clean, "alice", max_token_length=200
    )

    assert selection == ["alice spoke"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# clean_id
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_clean_id_removes_special_characters():
    """Punctuation and symbols are stripped out of the identifier."""
    cleaned = clean_id("hello!@#$%^&*()world")
    assert cleaned == "helloworld"
|
|
|
|
|
|
def test_clean_id_preserves_allowed_characters():
    """Letters, digits, underscores, dashes and spaces pass through unchanged."""
    raw_id = "Foo_Bar-Baz 123"
    assert clean_id(raw_id) == "Foo_Bar-Baz 123"
|
|
|
|
|
|
def test_clean_id_empty_input():
    """An empty identifier stays empty."""
    cleaned = clean_id("")
    assert cleaned == ""
|
|
|
|
|
|
def test_clean_id_only_special_characters():
    """An identifier made solely of symbols is reduced to an empty string."""
    cleaned = clean_id("@#$%")
    assert cleaned == ""
|
|
|
|
|
|
def test_clean_id_unicode_dropped():
    """A non-ASCII letter such as 'é' is removed from the identifier."""
    cleaned = clean_id("café 42")
    assert cleaned == "caf 42"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# slugify
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_slugify_basic_label():
    """A mixed-case, space-separated label becomes a lowercase dashed slug."""
    slug = slugify("My vLLM Local")
    assert slug == "my-vllm-local"
|
|
|
|
|
|
def test_slugify_only_separators_returns_empty():
    """Input made only of spaces and underscores produces an empty slug."""
    slug = slugify(" ___ ")
    assert slug == ""
|
|
|
|
|
|
def test_slugify_strips_punctuation_runs():
    """Trailing punctuation does not leave a dangling dash in the slug."""
    slug = slugify("Voice 1!")
    assert slug == "voice-1"
|
|
|
|
|
|
def test_slugify_collapses_consecutive_specials():
    """A run of several special characters collapses to a single dash."""
    slug = slugify("a b!!!c")
    assert slug == "a-b-c"
|
|
|
|
|
|
def test_slugify_empty_input():
    """An empty string produces an empty slug."""
    slug = slugify("")
    assert slug == ""
|
|
|
|
|
|
def test_slugify_none_safe():
    """Passing None yields an empty slug instead of raising."""
    slug = slugify(None)
    assert slug == ""
|
|
|
|
|
|
def test_slugify_already_slug():
    """A string that is already a valid slug is returned unchanged."""
    existing_slug = "already-a-slug"
    assert slugify(existing_slug) == "already-a-slug"
|
|
|
|
|
|
def test_slugify_strips_leading_trailing_dashes():
    """Dashes at the start and end of the input are trimmed from the slug."""
    slug = slugify("-foo-bar-")
    assert slug == "foo-bar"
|