Files
talemate/tests/test_utils_data.py
vegu-ai-tools 072cd7fd12 linting
2025-09-27 15:53:37 +03:00

720 lines
23 KiB
Python

import os
import pytest
import json
import yaml
from unittest.mock import MagicMock
import talemate.util.data
from talemate.util.data import (
fix_faulty_json,
extract_json,
extract_json_v2,
extract_yaml_v2,
extract_data_auto,
extract_data,
JSONEncoder,
DataParsingError,
fix_yaml_colon_in_strings,
fix_faulty_yaml,
)
# Helper for locating the fixture files used by the tests in this module
def get_test_data_path(filename):
    """Return the path of ``filename`` inside this module's test-data folder.

    The folder is ``data/util/data`` relative to the directory that
    contains this test file.
    """
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    relative_parts = ("data", "util", "data", filename)
    return os.path.join(tests_dir, *relative_parts)
@pytest.fixture
def mock_client_and_prompt():
    """Provide a mocked client/prompt pair for the extract_data_auto tests.

    While the fixture is active,
    ``talemate.util.data.extract_data_with_ai_fallback`` is replaced by a
    local stand-in that ignores the client and prompt class: it wraps the
    text in a fenced code block and passes it to ``extract_data``. The
    original function is put back once the test is done.

    Yields:
        A ``(client, prompt_cls)`` tuple of ``MagicMock`` objects.
    """
    data_module = talemate.util.data

    async def mock_extract_with_ai(client, text, prompt_cls, schema_format):
        # Re-fence the text so the regular code-block extractor can handle it
        fenced = f"```{schema_format}\n{text}\n```"
        return extract_data(fenced, schema_format)

    # Remember the real implementation, then install the stand-in
    saved_impl = data_module.extract_data_with_ai_fallback
    data_module.extract_data_with_ai_fallback = mock_extract_with_ai

    yield MagicMock(), MagicMock()

    # Teardown: reinstate the real implementation
    data_module.extract_data_with_ai_fallback = saved_impl
def test_json_encoder():
    """JSONEncoder should serialise an unsupported object via its ``str()`` form."""

    class CustomObject:
        def __str__(self):
            return "CustomObject"

    payload = {"obj": CustomObject()}
    serialized = json.dumps(payload, cls=JSONEncoder)

    # The custom instance must have been emitted as its string representation
    assert serialized == '{"obj": "CustomObject"}'
def test_fix_faulty_json():
    """Check fix_faulty_json on adjacent objects and on trailing commas."""
    # Adjacent objects: a comma is expected between them. The result is only
    # valid JSON once it is wrapped in list brackets, which the test does by hand.
    repaired = fix_faulty_json('{"a": 1}{"b": 2}')
    assert repaired == '{"a": 1},{"b": 2}'
    assert json.loads(f"[{repaired}]") == [{"a": 1}, {"b": 2}]

    # Trailing comma at the end of an object
    trailing_in_object = fix_faulty_json('{"a": 1, "b": 2,}')
    assert json.loads(trailing_in_object) == {"a": 1, "b": 2}

    # Trailing comma at the end of an array
    trailing_in_array = fix_faulty_json('{"a": [1, 2, 3,]}')
    assert json.loads(trailing_in_array) == {"a": [1, 2, 3]}
def test_extract_json():
    """Check extract_json pulls a leading JSON value off the front of a string."""
    # (input text, expected JSON substring, expected parsed value)
    cases = [
        # Plain object followed by text
        (
            '{"name": "test", "value": 42} and some text',
            '{"name": "test", "value": 42}',
            {"name": "test", "value": 42},
        ),
        # Array followed by text
        ("[1, 2, 3] and some text", "[1, 2, 3]", [1, 2, 3]),
        # Leading whitespace before the object
        (
            ' {"name": "test"} and some text',
            '{"name": "test"}',
            {"name": "test"},
        ),
    ]

    for text, expected_str, expected_obj in cases:
        json_str, obj = extract_json(text)
        assert json_str == expected_str
        assert obj == expected_obj

    # Text that does not start with JSON is rejected
    with pytest.raises(ValueError):
        extract_json("This is not JSON")
def test_extract_json_v2_valid():
    """Check extract_json_v2 on the valid-JSON fixture with code blocks."""
    fixture_path = get_test_data_path("valid_json.txt")
    with open(fixture_path, "r") as f:
        text = f.read()

    result = extract_json_v2(text)

    # Two unique objects are expected (per the fixture, the third is a duplicate)
    assert len(result) == 2

    expected_objects = (
        {
            "name": "Test Object",
            "properties": {"id": 1, "active": True},
            "tags": ["test", "json", "parsing"],
        },
        {"name": "Simple Object", "value": 42},
    )
    for expected in expected_objects:
        assert expected in result
def test_extract_json_v2_invalid():
    """extract_json_v2 must raise DataParsingError for the invalid-JSON fixture."""
    fixture_path = get_test_data_path("invalid_json.txt")
    with open(fixture_path, "r") as f:
        text = f.read()

    # Extraction is expected to fail with the module's parsing error
    with pytest.raises(DataParsingError):
        extract_json_v2(text)
def test_extract_json_v2_faulty():
    """Check extract_json_v2 on the faulty-but-possibly-fixable JSON fixture.

    Two outcomes are accepted: extraction succeeds with at least one
    object, or it raises DataParsingError because some of the JSON is too
    broken to repair.
    """
    with open(get_test_data_path("faulty_json.txt"), "r") as f:
        text = f.read()

    try:
        result = extract_json_v2(text)
    except DataParsingError:
        # Acceptable: some JSON in the fixture may be beyond repair
        return

    # Extraction succeeded, so something must have been recovered
    assert len(result) > 0
def test_data_parsing_error():
    """DataParsingError should expose its message and data, and stringify to the message."""
    message = "Test error message"
    test_data = '{"broken": "json"'

    error = DataParsingError(message, test_data)

    assert error.message == message
    assert error.data == test_data
    assert str(error) == message
def test_extract_json_v2_multiple():
    """Check extract_json_v2 de-duplicates objects from the multiple-JSON fixture."""
    with open(get_test_data_path("multiple_json.txt"), "r") as f:
        text = f.read()

    result = extract_json_v2(text)

    # Per the fixture: 5 objects in total, of which 3 are unique
    assert len(result) == 3

    first_object = {
        "id": 1,
        "name": "First Object",
        "tags": ["one", "first", "primary"],
    }
    second_object = {"id": 2, "name": "Second Object", "tags": ["two", "second"]}
    third_object = {
        "id": 3,
        "name": "Third Object",
        "metadata": {"created": "2023-01-01", "version": 1.0},
        "active": True,
    }
    for expected in (first_object, second_object, third_object):
        assert expected in result

    # No ID may occur more than once in the extracted objects
    seen_ids = [obj["id"] for obj in result]
    for id_val in dict.fromkeys(seen_ids):
        count = seen_ids.count(id_val)
        assert count == 1, (
            f"Object with ID {id_val} appears {count} times (should be 1)"
        )
def test_extract_yaml_v2_valid():
    """Check extract_yaml_v2 on the valid-YAML fixture with code blocks."""
    fixture_path = get_test_data_path("valid_yaml.txt")
    with open(fixture_path, "r") as f:
        text = f.read()

    result = extract_yaml_v2(text)

    # Two unique objects are expected (per the fixture, the third is a duplicate)
    assert len(result) == 2

    expected_objects = (
        {
            "name": "Test Object",
            "properties": {"id": 1, "active": True},
            "tags": ["test", "yaml", "parsing"],
        },
        {"simple_name": "Simple Object", "value": 42},
    )
    for expected in expected_objects:
        assert expected in result
def test_extract_yaml_v2_invalid():
    """extract_yaml_v2 must raise DataParsingError for the invalid-YAML fixture."""
    fixture_path = get_test_data_path("invalid_yaml.txt")
    with open(fixture_path, "r") as f:
        text = f.read()

    # Extraction is expected to fail with the module's parsing error
    with pytest.raises(DataParsingError):
        extract_yaml_v2(text)
def test_extract_yaml_v2_multiple():
    """Check extract_yaml_v2 de-duplicates objects from the multiple-YAML fixture."""
    with open(get_test_data_path("multiple_yaml.txt"), "r") as f:
        text = f.read()

    result = extract_yaml_v2(text)

    # Per the fixture: 5 objects in total, of which 3 are unique
    assert len(result) == 3

    # Index by ID so each object can be checked individually
    objects_by_id = {obj["id"]: obj for obj in result}

    first = objects_by_id[1]
    assert first["name"] == "First Object"
    assert first["tags"] == ["one", "first", "primary"]

    second = objects_by_id[2]
    assert second["name"] == "Second Object"
    assert second["tags"] == ["two", "second"]

    # The date under "metadata" is parsed by YAML into a date object, so only
    # the presence of the key is checked here.
    third = objects_by_id[3]
    assert third["name"] == "Third Object"
    assert third["active"] is True
    assert "created" in third["metadata"]

    # No ID may occur more than once in the extracted objects
    seen_ids = [obj["id"] for obj in result]
    for id_val in dict.fromkeys(seen_ids):
        count = seen_ids.count(id_val)
        assert count == 1, (
            f"Object with ID {id_val} appears {count} times (should be 1)"
        )
def test_extract_yaml_v2_multiple_documents():
    """Check extract_yaml_v2 on several YAML documents inside one code block."""
    fixture_path = get_test_data_path("multiple_yaml_documents.txt")
    with open(fixture_path, "r") as f:
        test_data = f.read()

    result = extract_yaml_v2(test_data)

    # All three documents must come back
    assert len(result) == 3

    objects_by_id = {obj["id"]: obj for obj in result}

    first = objects_by_id[1]
    assert first["name"] == "First Document"
    assert "first" in first["tags"]

    second = objects_by_id[2]
    assert second["name"] == "Second Document"
    assert "secondary" in second["tags"]

    third = objects_by_id[3]
    assert third["name"] == "Third Document"
    assert third["active"] is True
def test_extract_yaml_v2_without_separators():
    """Check extract_yaml_v2 on several YAML documents that lack ``---`` separators."""
    fixture_path = get_test_data_path("multiple_yaml_without_separators.txt")
    with open(fixture_path, "r") as f:
        test_data = f.read()

    result = extract_yaml_v2(test_data)

    # All three documents must come back
    assert len(result) == 3

    # Index by name so each document can be checked individually
    docs_by_name = {doc["name"]: doc for doc in result}

    assert "First Document" in docs_by_name
    first = docs_by_name["First Document"]
    assert first["id"] == 1
    assert "first" in first["tags"]

    assert "Second Document" in docs_by_name
    second = docs_by_name["Second Document"]
    assert second["id"] == 2
    assert "secondary" in second["tags"]

    assert "Third Document" in docs_by_name
    third = docs_by_name["Third Document"]
    assert third["id"] == 3
    assert third["active"] is True
def test_extract_json_v2_multiple_objects():
    """Check extract_json_v2 on several JSON objects inside one code block."""
    fixture_path = get_test_data_path("multiple_json_objects.txt")
    with open(fixture_path, "r") as f:
        test_data = f.read()

    result = extract_json_v2(test_data)

    # All three objects must come back
    assert len(result) == 3

    objects_by_id = {obj["id"]: obj for obj in result}

    first = objects_by_id[1]
    assert first["name"] == "First Object"
    assert first["type"] == "test"

    second = objects_by_id[2]
    assert second["name"] == "Second Object"
    assert second["values"] == [1, 2, 3]

    third = objects_by_id[3]
    assert third["name"] == "Third Object"
    assert third["active"] is True
    assert third["metadata"]["created"] == "2023-05-15"
def test_fix_yaml_colon_in_strings():
    """Check fix_yaml_colon_in_strings makes YAML with unquoted colons parseable."""
    with open(get_test_data_path("yaml_with_colons.txt"), "r") as f:
        fixture_text = f.read()

    # Take the content of the first fenced block and drop a leading "yaml" tag
    block_body = fixture_text.split("```")[1]
    if block_body.startswith("yaml"):
        block_body = block_body[4:].strip()

    fixed_yaml = fix_yaml_colon_in_strings(block_body)

    # The repaired text must load, with structure and content intact
    parsed = yaml.safe_load(fixed_yaml)
    first_call = parsed["calls"][0]
    assert first_call["name"] == "act"
    assert first_call["arguments"]["name"] == "Kaira"
    assert (
        "I can see you're scared, Elmer" in first_call["arguments"]["instructions"]
    )
def test_fix_faulty_yaml():
    """Check fix_faulty_yaml repairs a YAML list whose items contain colons."""
    with open(get_test_data_path("yaml_list_with_colons.txt"), "r") as f:
        fixture_text = f.read()

    # Take the content of the first fenced block and drop a leading "yaml" tag
    block_body = fixture_text.split("```")[1]
    if block_body.startswith("yaml"):
        block_body = block_body[4:].strip()

    fixed_yaml = fix_faulty_yaml(block_body)

    # The repaired text must load, with structure and content intact
    parsed = yaml.safe_load(fixed_yaml)
    instructions = parsed["instructions_list"]
    assert len(instructions) == 2

    # Each item is expected to be the full string, colons included
    assert "Run to the door" in instructions[0]
    assert "Wait for me!" in instructions[0]
    assert "Look around" in instructions[1]
    assert "Is there another way out?" in instructions[1]
def test_extract_yaml_v2_with_colons():
    """Check extract_yaml_v2 handles YAML code blocks with problematic colons in strings."""
    with open(get_test_data_path("yaml_block_with_colons.txt"), "r") as f:
        text = f.read()

    result = extract_yaml_v2(text)

    # Two YAML objects are expected
    assert len(result) == 2

    # Tell the two objects apart by their top-level key; if several objects
    # share a key, the last one seen is kept.
    found = {"calls": None, "instructions_list": None}
    for candidate in result:
        if "calls" in candidate:
            found["calls"] = candidate
            continue
        if "instructions_list" in candidate:
            found["instructions_list"] = candidate
    calls_obj = found["calls"]
    instructions_obj = found["instructions_list"]

    assert calls_obj is not None, "Could not find the 'calls' object"
    assert instructions_obj is not None, "Could not find the 'instructions_list' object"

    # First object: structure of the single call
    first_call = calls_obj["calls"][0]
    assert first_call["name"] == "act"
    assert first_call["arguments"]["name"] == "Kaira"

    # The text around the problematic colon must survive
    instructions = first_call["arguments"]["instructions"]
    assert "Speak in a calm, soothing tone and say:" in instructions
    assert "I can see you're scared, Elmer" in instructions

    # Second object: the two list items
    items = instructions_obj["instructions_list"]
    assert len(items) == 2
    assert "Run to the door" in items[0]
    assert "Wait for me!" in items[0]
    assert "Look around" in items[1]
    assert "Is there another way out?" in items[1]
@pytest.mark.asyncio
async def test_extract_data_auto_mixed_formats(mock_client_and_prompt):
    """Check extract_data_auto on text mixing JSON and YAML code blocks."""
    client, prompt_cls = mock_client_and_prompt

    fixture_path = get_test_data_path("mixed_formats.txt")
    with open(fixture_path, "r") as f:
        mixed_text = f.read()

    result = await extract_data_auto(mixed_text, client, prompt_cls)

    # All three objects must come back
    assert len(result) == 3

    objects_by_id = {obj["id"]: obj for obj in result}

    json_obj = objects_by_id[1]
    assert json_obj["name"] == "JSON Object"
    assert json_obj["type"] == "json"

    yaml_obj = objects_by_id[2]
    assert yaml_obj["name"] == "YAML Object"
    assert yaml_obj["type"] == "yaml"
    assert "test" in yaml_obj["tags"]

    second_json = objects_by_id[3]
    assert second_json["name"] == "Second JSON"
    assert second_json["active"] is True
@pytest.mark.asyncio
async def test_extract_data_auto_untyped_codeblocks(mock_client_and_prompt):
    """Check extract_data_auto on untyped code blocks, using ``schema_format`` as the default."""
    # --- JSON as the default format ---
    with open(get_test_data_path("untyped_codeblocks_json.txt"), "r") as f:
        json_text = f.read()

    client, prompt_cls = mock_client_and_prompt
    json_result = await extract_data_auto(
        json_text, client, prompt_cls, schema_format="json"
    )
    assert len(json_result) == 2
    json_names = {obj["name"] for obj in json_result}
    assert "Untyped JSON" in json_names
    assert "Another JSON" in json_names

    # --- YAML as the default format ---
    with open(get_test_data_path("untyped_codeblocks_yaml.txt"), "r") as f:
        yaml_text = f.read()

    yaml_result = await extract_data_auto(
        yaml_text, client, prompt_cls, schema_format="yaml"
    )
    assert len(yaml_result) == 2
    yaml_names = {obj["name"] for obj in yaml_result}
    assert "Untyped YAML" in yaml_names
    assert "Another YAML" in yaml_names
@pytest.mark.asyncio
async def test_extract_data_auto_bare_codeblock(mock_client_and_prompt):
    """Check extract_data_auto when the whole input is a single code block."""
    client, prompt_cls = mock_client_and_prompt

    # --- Input consisting only of a JSON code block ---
    json_codeblock = """```json
{"name": "Bare JSON", "id": 123, "active": true}
```"""
    json_result = await extract_data_auto(json_codeblock, client, prompt_cls)
    assert len(json_result) == 1
    json_obj = json_result[0]
    assert json_obj["name"] == "Bare JSON"
    assert json_obj["id"] == 123

    # --- Input consisting only of a YAML code block ---
    yaml_codeblock = """```yaml
name: Bare YAML
id: 456
active: false
tags:
- bare
- yaml
```"""
    yaml_result = await extract_data_auto(yaml_codeblock, client, prompt_cls)
    assert len(yaml_result) == 1
    yaml_obj = yaml_result[0]
    assert yaml_obj["name"] == "Bare YAML"
    assert yaml_obj["id"] == 456
    assert "bare" in yaml_obj["tags"]
@pytest.mark.asyncio
async def test_extract_data_auto_raw_data(mock_client_and_prompt):
    """Check extract_data_auto on raw data structures that are not in code blocks.

    The format is selected through ``schema_format`` since there is no
    fenced block to carry a language tag.
    """
    client, prompt_cls = mock_client_and_prompt

    # --- Raw JSON ---
    raw_json = '{"name": "Raw JSON", "value": 100}'
    result = await extract_data_auto(raw_json, client, prompt_cls, schema_format="json")
    assert len(result) == 1
    assert result[0]["name"] == "Raw JSON"
    assert result[0]["value"] == 100

    # --- Raw YAML ---
    # "created" and "version" must be indented beneath "metadata"; at column 0
    # they would be top-level keys and "metadata" would load as null, which
    # the nested assertion below could never satisfy.
    raw_yaml = """name: Raw YAML
value: 200
metadata:
  created: 2023-01-01
  version: 1.0"""
    result = await extract_data_auto(raw_yaml, client, prompt_cls, schema_format="yaml")
    assert len(result) == 1
    assert result[0]["name"] == "Raw YAML"
    assert result[0]["value"] == 200
    # The YAML parser turns date strings into date objects, so compare via str()
    assert str(result[0]["metadata"]["created"]) == "2023-01-01"
@pytest.mark.asyncio
async def test_extract_data_auto_empty_codeblocks(mock_client_and_prompt):
    """Check extract_data_auto skips empty code blocks in the fixture."""
    fixture_path = get_test_data_path("empty_codeblocks.txt")
    with open(fixture_path, "r") as f:
        text_with_empty = f.read()

    client, prompt_cls = mock_client_and_prompt
    result = await extract_data_auto(text_with_empty, client, prompt_cls)

    # Only the two non-empty blocks contribute objects
    assert len(result) == 2
    objects_by_id = {obj["id"]: obj for obj in result}
    assert objects_by_id[1]["name"] == "Valid"
    assert objects_by_id[2]["name"] == "Valid YAML"
@pytest.mark.asyncio
async def test_extract_data_auto_malformed_blocks(mock_client_and_prompt):
    """Check extract_data_auto skips a malformed block and keeps the valid ones."""
    client, prompt_cls = mock_client_and_prompt

    # Three JSON blocks; the middle one is cut off mid-object
    text_with_malformed = """
Valid JSON:
```json
{"name": "Valid", "id": 1}
```
Malformed JSON:
```json
{"name": "Broken", "id":
```
Another valid JSON:
```json
{"name": "Also Valid", "id": 2}
```
"""

    result = await extract_data_auto(text_with_malformed, client, prompt_cls)

    # The two valid objects are extracted; the malformed one is dropped
    assert len(result) == 2
    extracted_names = {obj["name"] for obj in result}
    assert "Valid" in extracted_names
    assert "Also Valid" in extracted_names
    assert "Broken" not in extracted_names  # Should be skipped
@pytest.mark.asyncio
async def test_extract_data_auto_repairs_faulty_json(mock_client_and_prompt):
    """Check extract_data_auto repairs the faulty JSON blocks in the fixture."""
    fixture_path = get_test_data_path("faulty_json_repairable.txt")
    with open(fixture_path, "r") as f:
        text_with_faulty = f.read()

    client, prompt_cls = mock_client_and_prompt
    result = await extract_data_auto(text_with_faulty, client, prompt_cls)

    # Expected: two objects from the first block (after repair), one from the second
    assert len(result) == 3

    # Objects without a "name" key are ignored when collecting names
    extracted_names = {obj["name"] for obj in result if "name" in obj}
    assert "Test" in extracted_names
    assert "Another" in extracted_names
@pytest.mark.asyncio
async def test_extract_data_auto_yml_identifier(mock_client_and_prompt):
    """Check extract_data_auto treats a ``yml`` fence tag as YAML."""
    client, prompt_cls = mock_client_and_prompt

    # "enabled" and "timeout" must be indented beneath "config"; at column 0
    # they would be top-level keys and "config" would load as null, which the
    # nested assertion below could never satisfy.
    yml_text = """
Data with yml extension:
```yml
name: YML Test
id: 123
config:
  enabled: true
  timeout: 30
```
"""

    result = await extract_data_auto(yml_text, client, prompt_cls)

    assert len(result) == 1
    parsed = result[0]
    assert parsed["name"] == "YML Test"
    assert parsed["id"] == 123
    assert parsed["config"]["enabled"] is True
@pytest.mark.asyncio
async def test_extract_data_auto_invalid_raw_data(mock_client_and_prompt):
    """Check extract_data_auto raises DataParsingError for invalid raw data.

    Covers both ``schema_format="json"`` and ``schema_format="yaml"`` and
    checks the format-specific text in the error message.
    """
    # Unpack outside the ``pytest.raises`` blocks so each block contains only
    # the call that is expected to raise.
    client, prompt_cls = mock_client_and_prompt

    # --- Invalid raw JSON (object is cut off) ---
    invalid_json = '{"name": "Broken JSON", "id":'
    with pytest.raises(DataParsingError) as exc_info:
        await extract_data_auto(invalid_json, client, prompt_cls, schema_format="json")
    assert "Failed to parse raw JSON data" in str(exc_info.value)

    # --- Invalid raw YAML ---
    invalid_yaml = """name: Broken YAML
- invalid: structure
without: proper indentation"""
    with pytest.raises(DataParsingError) as exc_info:
        await extract_data_auto(invalid_yaml, client, prompt_cls, schema_format="yaml")
    assert "Failed to parse raw YAML data" in str(exc_info.value)
@pytest.mark.asyncio
async def test_extract_data_auto_unsupported_format(mock_client_and_prompt):
    """Check extract_data_auto raises DataParsingError for an unsupported ``schema_format``."""
    # Unpack outside the ``pytest.raises`` block so the block contains only
    # the call that is expected to raise.
    client, prompt_cls = mock_client_and_prompt
    text = '{"name": "test"}'

    with pytest.raises(DataParsingError) as exc_info:
        await extract_data_auto(text, client, prompt_cls, schema_format="xml")

    # The message names the requested format in upper case
    assert "Failed to parse raw XML data" in str(exc_info.value)
@pytest.mark.asyncio
async def test_extract_data_auto_multiple_objects_in_single_block(
    mock_client_and_prompt,
):
    """Check extract_data_auto on several JSON objects inside one code block."""
    client, prompt_cls = mock_client_and_prompt

    # One fenced block holding three objects, one per line
    multiple_json = """
```json
{"id": 1, "name": "First"}
{"id": 2, "name": "Second"}
{"id": 3, "name": "Third"}
```
"""

    result = await extract_data_auto(multiple_json, client, prompt_cls)

    assert len(result) == 3
    objects_by_id = {obj["id"]: obj for obj in result}
    expected_names = ((1, "First"), (2, "Second"), (3, "Third"))
    for object_id, expected_name in expected_names:
        assert objects_by_id[object_id]["name"] == expected_name