diff --git a/.github/actions/spell-check/expect.txt b/.github/actions/spell-check/expect.txt index 1f450a10e6..7c1b9f65dd 100644 --- a/.github/actions/spell-check/expect.txt +++ b/.github/actions/spell-check/expect.txt @@ -2180,4 +2180,4 @@ Zoneszonabletester Zoomin zoomit ZOOMITX -Zorder \ No newline at end of file +Zorder diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs index 11c3113dac..99a6af73af 100644 --- a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs @@ -117,7 +117,11 @@ public class FuzzyMatcherComparisonTests ["_a", "_a"], ["a_", "a_"], ["-a", "-a"], - ["a-", "a-"] + ["a-", "a-"], + ["🐿️", "🐿️"], // Squirrel emoji + ["\U0001F44D", "\U0001F44D\U0001F3FB"], // Base thumbs-up vs thumbs-up with LIGHT skin tone modifier + ["\U0001F44D\U0001F3FB", "\U0001F44D\U0001F3FB"], // Thumbs-up with LIGHT skin tone vs itself (exact same sequence) + ["\U0001F44D\U0001F3FB", "\U0001F44D\U0001F3FF"], // Thumbs-up with LIGHT skin tone vs thumbs-up with DARK skin tone ]; [TestMethod] diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComplexEmojiTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComplexEmojiTests.cs new file mode 100644 index 0000000000..f418402aed --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComplexEmojiTests.cs @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherComplexEmojiTests +{ + [TestMethod] + [Ignore("For now this is not supported")] + public void Mismatch_DifferentSkinTone_PartialMatch() + { + // "👍🏻" (Light) vs "👍🏿" (Dark) + // They share the base "👍". + const string needle = "👍🏻"; + const string haystack = "👍🏿"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + // Should have a positive score because of the base emoji match + Assert.IsTrue(result.Score > 0, "Expected partial match based on base emoji"); + + // Should match only the base emoji "👍" (U+1F44D, a surrogate pair = 2 UTF-16 chars) + Assert.AreEqual(2, result.Positions.Count, "Expected match on base emoji only"); + } +} diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherEmojiTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherEmojiTests.cs new file mode 100644 index 0000000000..623325f3fc --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherEmojiTests.cs @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherEmojiTests +{ + [TestMethod] + public void ExactMatch_SimpleEmoji_ReturnsScore() + { + const string needle = "🚀"; + const string haystack = "Launch 🚀 sequence"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for simple emoji"); + + // 🚀 (U+1F680) is one code point encoded as a surrogate pair, i.e. 2 UTF-16 chars + Assert.AreEqual(2, result.Positions.Count, "Expected 2 matched characters positions for the emoji"); + } + + [TestMethod] + public void ExactMatch_SkinTone_ReturnsScore() + { + const string needle = "👍🏽"; // Medium skin tone + const string haystack = "Thumbs up 👍🏽 here"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for emoji with skin tone"); + + // 👍🏽 is 4 chars: U+1F44D (2 chars) + U+1F3FD (2 chars) + Assert.AreEqual(4, result.Positions.Count, "Expected 4 matched characters positions for the emoji with modifier"); + } + + [TestMethod] + public void ZWJSequence_Family_Match() + { + const string needle = "👨‍👩‍👧‍👦"; // Family: Man, Woman, Girl, Boy + const string haystack = "Emoji 👨‍👩‍👧‍👦 Test"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for ZWJ sequence"); + + // The sequence is 7 code points: 4 emoji joined by 3 ZWJ (U+200D). + // Man (2) + ZWJ (1) + Woman (2) + ZWJ (1) + Girl (2) + ZWJ (1) + Boy (2) = 11 UTF-16 chars. + // The exact position count is not asserted; only check that something matched. 
+ Assert.IsTrue(result.Positions.Count > 0); + } + + [TestMethod] + public void Flags_Match() + { + const string needle = "🇺🇸"; // US Flag (Regional Indicator U + Regional Indicator S) + const string haystack = "USA 🇺🇸"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for flag emoji"); + + // 2 code points, each encoded as a surrogate pair in UTF-16: + // U+1F1FA (REGIONAL INDICATOR SYMBOL LETTER U) -> 2 chars + // U+1F1F8 (REGIONAL INDICATOR SYMBOL LETTER S) -> 2 chars + // Total 4 chars. + Assert.AreEqual(4, result.Positions.Count); + } + + [TestMethod] + public void Emoji_MixedWithText_Search() + { + const string needle = "t🌮o"; // "t" + taco + "o" + const string haystack = "taco 🌮 on tuesday"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0); + } +} diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherNormalizationTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherNormalizationTests.cs new file mode 100644 index 0000000000..ccc5174f00 --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherNormalizationTests.cs @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherNormalizationTests +{ + [TestMethod] + public void Normalization_ShouldBeLengthPreserving_GermanEszett() + { + // "Straße" (6 chars) + // Standard "SS" expansion would change length to 7. 
+ // Our normalizer must preserve length. + var input = "Straße"; + var expectedLength = input.Length; + + // Case 1: Remove Diacritics = true + var normalized = Fold(input, removeDiacritics: true); + Assert.AreEqual(expectedLength, normalized.Length, "Normalization (removeDiacritics=true) must be length preserving for 'Straße'"); + + // Verify expected mapping: ß stays ß (1 char; it is not expanded to "SS") + Assert.AreEqual("STRAßE", normalized); + + // Case 2: Remove Diacritics = false + var normalizedKeep = Fold(input, removeDiacritics: false); + Assert.AreEqual(expectedLength, normalizedKeep.Length, "Normalization (removeDiacritics=false) must be length preserving for 'Straße'"); + + // ß maps to ß in invariant culture (length 1) + Assert.AreEqual("STRAßE", normalizedKeep); + } + + [TestMethod] + public void Normalization_ShouldBeLengthPreserving_CommonDiacritics() + { + var input = "Crème Brûlée"; + var expected = "CREME BRULEE"; + + var normalized = Fold(input, removeDiacritics: true); + + Assert.AreEqual(input.Length, normalized.Length); + Assert.AreEqual(expected, normalized); + } + + [TestMethod] + public void Normalization_ShouldBeLengthPreserving_MixedComposed() + { + // "Ångström" -> A + ring, o + umlaut /* #no-spell-check-line */ + var input = "Ångström"; /* #no-spell-check-line */ + var expected = "ANGSTROM"; + + var normalized = Fold(input, removeDiacritics: true); + + Assert.AreEqual(input.Length, normalized.Length); + Assert.AreEqual(expected, normalized); + } + + [TestMethod] + public void Normalization_ShouldNormalizeSlashes() + { + var input = @"Folder\File.txt"; + var expected = "FOLDER/FILE.TXT"; + + var normalized = Fold(input, removeDiacritics: true); + + Assert.AreEqual(input.Length, normalized.Length); + Assert.AreEqual(expected, normalized); + } + + private string Fold(string input, bool removeDiacritics) + { + return FuzzyStringMatcher.Folding.FoldForComparison(input, removeDiacritics); + } +} diff --git 
a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherUnicodeGarbageTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherUnicodeGarbageTests.cs new file mode 100644 index 0000000000..4532f19b71 --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherUnicodeGarbageTests.cs @@ -0,0 +1,223 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherUnicodeGarbageTests +{ + [TestMethod] + public void UnpairedHighSurrogateInNeedle_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "\uD83D"; // high surrogate (unpaired) + const string haystack = "abc"; + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void UnpairedLowSurrogateInNeedle_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "\uDC00"; // low surrogate (unpaired) + const string haystack = "abc"; + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void UnpairedHighSurrogateInHaystack_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "a"; + const string haystack = "a\uD83D" + "bc"; // inject unpaired high surrogate + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void UnpairedLowSurrogateInHaystack_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "a"; + const string haystack = "a\uDC00" + 
"bc"; // inject unpaired low surrogate + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void MixedSurrogatesAndMarks_RemoveDiacritics_ShouldNotThrow() + { + // "Garbage smoothie": unpaired surrogate + combining mark + emoji surrogate pair + const string needle = "a\uD83D\u0301"; // 'a' + unpaired high surrogate + combining acute + const string haystack = "a\u0301 \U0001F600"; // 'a' + combining acute + space + 😀 (valid pair) + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void ValidEmojiSurrogatePair_RemoveDiacritics_ShouldNotThrow_AndCanMatch() + { + // 😀 U+1F600 encoded as surrogate pair in UTF-16 + const string needle = "\U0001F600"; + const string haystack = "x \U0001F600 y"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + + // Keep assertions minimal: just ensure it doesn't act like "no match". + // NOTE(review): this assumes a non-match yields Score == 0 - confirm against FuzzyStringMatcher. + Assert.IsTrue(result.Score > 0, "Expected emoji to produce a match score > 0."); + Assert.IsTrue(result.Positions.Count > 0, "Expected at least one matched position."); + } + + [TestMethod] + public void DiacriticStripping_StillWorks_OnBMPNonSurrogate() + { + // This is a regression guard: we fixed surrogates; don't break diacritic stripping. + // "é" should fold like "e" when removeDiacritics=true. 
+ const string needle = "cafe"; + const string haystack = "CAFÉ"; + + var withDiacriticsRemoved = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + + var withoutDiacriticsRemoved = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: false); + + Assert.IsTrue(withDiacriticsRemoved.Score >= withoutDiacriticsRemoved.Score, "Removing diacritics should not make matching worse for 'CAFÉ' vs 'cafe'."); + Assert.IsTrue(withDiacriticsRemoved.Score > 0, "Expected a match when diacritics are removed."); + } + + [TestMethod] + public void RandomUtf16Garbage_RemoveDiacritics_ShouldNotThrow() + { + // Deterministic pseudo-random "UTF-16 garbage", including surrogates. + // This is a quick fuzz-lite test that’s stable across runs. + var s1 = MakeDeterministicGarbage(seed: 1234, length: 512); + var s2 = MakeDeterministicGarbage(seed: 5678, length: 1024); + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + s1, + s2, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void RandomUtf16Garbage_NoDiacritics_ShouldNotThrow() + { + var s1 = MakeDeterministicGarbage(seed: 42, length: 512); + var s2 = MakeDeterministicGarbage(seed: 43, length: 1024); + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + s1, + s2, + allowNonContiguousMatches: true, + removeDiacritics: false); + } + + [TestMethod] + public void HighSurrogateAtEndOfHaystack_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "a"; + const string haystack = "abc\uD83D"; // Ends with high surrogate + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void ComplexEmojiSequence_RemoveDiacritics_ShouldNotThrow() + { + // Haystack holds the family ZWJ sequence (Man, Woman, Girl, Boy); the needle is only its first component, Man (U+1F468). + // U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466 + const string 
needle = "\U0001F468"; + const string haystack = "Info: \U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466 family"; + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void NullOrEmptyInputs_ShouldNotThrow() + { + // Empty needle + var result1 = FuzzyStringMatcher.ScoreFuzzyWithPositions(string.Empty, "abc", true, true); + Assert.AreEqual(0, result1.Score); + + // Empty haystack + var result2 = FuzzyStringMatcher.ScoreFuzzyWithPositions("abc", string.Empty, true, true); + Assert.AreEqual(0, result2.Score); + + // Null haystack (passed with the null-forgiving operator to get past nullable annotations) + var result3 = FuzzyStringMatcher.ScoreFuzzyWithPositions("abc", null!, true, true); + Assert.AreEqual(0, result3.Score); + } + + [TestMethod] + public void VeryLongStrings_ShouldNotThrow() + { + var needle = new string('a', 100); + var haystack = new string('b', 10000) + needle + new string('c', 10000); + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + private static string MakeDeterministicGarbage(int seed, int length) + { + // LCG for deterministic generation without Random’s platform/version surprises. + var x = (uint)seed; + var chars = length <= 2048 ? stackalloc char[length] : new char[length]; + + for (var i = 0; i < chars.Length; i++) + { + // LCG: x = (a*x + c) mod 2^32 + x = unchecked((1664525u * x) + 1013904223u); + + // Take top 16 bits as UTF-16 code unit (includes surrogates). 
+ chars[i] = (char)(x >> 16); + } + + return new string(chars); + } +} diff --git a/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs b/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs index 40491970b3..a4b7084555 100644 --- a/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs +++ b/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs @@ -643,7 +643,7 @@ public static class FuzzyStringMatcher // ============================================================ // Folding: slash normalization + upper case + optional diacritics stripping - private static class Folding + internal static class Folding { // Cache maps an upper case char to its diacritics-stripped upper case char. // '\0' means "not cached yet". @@ -820,6 +820,13 @@ public static class FuzzyStringMatcher return upper; } + // Emoji and other astral symbols come through as surrogate pairs in UTF-16. + // We process char-by-char, so each half of a pair is seen on its own: return any surrogate code unit unchanged. + if (char.IsSurrogate(upper)) + { + return upper; + } + var cached = StripCacheUpper[upper]; if (cached != '\0') {