From 49cc504d94f65651efeb81d2f324e4d46e3bb838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Pol=C3=A1=C5=A1ek?= Date: Mon, 2 Feb 2026 19:30:00 +0100 Subject: [PATCH] CmdPal: Improve fuzzy matcher Unicode and emoji robustness (#45275) ## Summary of the Pull Request Add comprehensive unit tests for emoji, ZWJ sequences, skin tone modifiers, and UTF-16 edge cases (unpaired surrogates, combining marks, random garbage). Update matcher logic to skip normalization of lone surrogates, preventing errors with malformed Unicode. Expand comparison test data to cover emoji scenarios. Adds regression guards for diacritic handling and surrogate processing. Fixes #45246 introduced in #44809. ## PR Checklist - [x] Closes: #45246 - [ ] **Communication:** I've discussed this with core contributors already. If the work hasn't been agreed, this work might be rejected - [ ] **Tests:** Added/updated and all pass - [ ] **Localization:** All end-user-facing strings can be localized - [ ] **Dev docs:** Added/updated - [ ] **New binaries:** Added on the required places - [ ] [JSON for signing](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ESRPSigning_core.json) for new binaries - [ ] [WXS for installer](https://github.com/microsoft/PowerToys/blob/main/installer/PowerToysSetup/Product.wxs) for new binaries and localization folder - [ ] [YML for CI pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ci/templates/build-powertoys-steps.yml) for new test projects - [ ] [YML for signed pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/release.yml) - [ ] **Documentation updated:** If checked, please file a pull request on [our docs repo](https://github.com/MicrosoftDocs/windows-uwp/tree/docs/hub/powertoys) and link it here: #xxx ## Detailed Description of the Pull Request / Additional comments ## Validation Steps Performed --- .github/actions/spell-check/expect.txt | 2 +- .../FuzzyMatcherComparisonTests.cs | 6 +- 
.../FuzzyMatcherComplexEmojiTests.cs | 29 +++ .../FuzzyMatcherEmojiTests.cs | 83 +++++++ .../FuzzyMatcherNormalizationTests.cs | 78 ++++++ .../FuzzyMatcherUnicodeGarbageTests.cs | 223 ++++++++++++++++++ .../FuzzyStringMatcher.cs | 9 +- 7 files changed, 427 insertions(+), 3 deletions(-) create mode 100644 src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComplexEmojiTests.cs create mode 100644 src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherEmojiTests.cs create mode 100644 src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherNormalizationTests.cs create mode 100644 src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherUnicodeGarbageTests.cs diff --git a/.github/actions/spell-check/expect.txt b/.github/actions/spell-check/expect.txt index 1f450a10e6..7c1b9f65dd 100644 --- a/.github/actions/spell-check/expect.txt +++ b/.github/actions/spell-check/expect.txt @@ -2180,4 +2180,4 @@ Zoneszonabletester Zoomin zoomit ZOOMITX -Zorder \ No newline at end of file +Zorder diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs index 11c3113dac..99a6af73af 100644 --- a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComparisonTests.cs @@ -117,7 +117,11 @@ public class FuzzyMatcherComparisonTests ["_a", "_a"], ["a_", "a_"], ["-a", "-a"], - ["a-", "a-"] + ["a-", "a-"], + ["🐿️", "🐿️"], // Squirrel emoji + ["\U0001F44D", "\U0001F44D\U0001F3FB"], // Base thumbs-up vs thumbs-up with LIGHT skin tone modifier + ["\U0001F44D\U0001F3FB", "\U0001F44D\U0001F3FB"], // Thumbs-up with LIGHT skin tone vs 
itself (exact same sequence) + ["\U0001F44D\U0001F3FB", "\U0001F44D\U0001F3FF"], // Thumbs-up with LIGHT skin tone vs thumbs-up with DARK skin tone ]; [TestMethod] diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComplexEmojiTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComplexEmojiTests.cs new file mode 100644 index 0000000000..f418402aed --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherComplexEmojiTests.cs @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherComplexEmojiTests +{ + [TestMethod] + [Ignore("For now this is not supported")] + public void Mismatch_DifferentSkinTone_PartialMatch() + { + // "👍🏻" (Light) vs "👍🏿" (Dark) + // They share the base "👍". 
+ const string needle = "👍🏻"; + const string haystack = "👍🏿"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + // Should have a positive score because of the base emoji match + Assert.IsTrue(result.Score > 0, "Expected partial match based on base emoji"); + + // Should match the base emoji (2 chars) + Assert.AreEqual(2, result.Positions.Count, "Expected match on base emoji only"); + } +} diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherEmojiTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherEmojiTests.cs new file mode 100644 index 0000000000..623325f3fc --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherEmojiTests.cs @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherEmojiTests +{ + [TestMethod] + public void ExactMatch_SimpleEmoji_ReturnsScore() + { + const string needle = "🚀"; + const string haystack = "Launch 🚀 sequence"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for simple emoji"); + + // 🚀 is 2 chars (surrogates) + Assert.AreEqual(2, result.Positions.Count, "Expected 2 matched characters positions for the emoji"); + } + + [TestMethod] + public void ExactMatch_SkinTone_ReturnsScore() + { + const string needle = "👍🏽"; // Medium skin tone + const string haystack = "Thumbs up 👍🏽 here"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for emoji with skin tone"); + + // 👍🏽 is 4 chars: U+1F44D (2 chars) + U+1F3FD (2 chars) + Assert.AreEqual(4, result.Positions.Count, "Expected 4 matched characters positions for the emoji with modifier"); + } + + [TestMethod] + public void ZWJSequence_Family_Match() + { + const string needle = "👨‍👩‍👧‍👦"; // Family: Man, Woman, Girl, Boy + const string haystack = "Emoji 👨‍👩‍👧‍👦 Test"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for ZWJ sequence"); + + // This sequence is 7 code points but 11 UTF-16 chars: + // Man (2) + ZWJ (1) + Woman (2) + ZWJ (1) + Girl (2) + ZWJ (1) + Boy (2) = 11 chars. + // The exact count is not pinned here; only require a positive score and at least one position. 
+ Assert.IsTrue(result.Positions.Count > 0); + } + + [TestMethod] + public void Flags_Match() + { + const string needle = "🇺🇸"; // US Flag (Regional Indicator U + Regional Indicator S) + const string haystack = "USA 🇺🇸"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0, "Expected match for flag emoji"); + + // 2 code points, each encoded as a surrogate pair: + // U+1F1FA (REGIONAL INDICATOR SYMBOL LETTER U) -> 2 chars + // U+1F1F8 (REGIONAL INDICATOR SYMBOL LETTER S) -> 2 chars + // Total 4 chars. + Assert.AreEqual(4, result.Positions.Count); + } + + [TestMethod] + public void Emoji_MixedWithText_Search() + { + const string needle = "t🌮o"; // "t" + taco + "o" + const string haystack = "taco 🌮 on tuesday"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions(needle, haystack, allowNonContiguousMatches: true); + + Assert.IsTrue(result.Score > 0); + } +} diff --git a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherNormalizationTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherNormalizationTests.cs new file mode 100644 index 0000000000..ccc5174f00 --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherNormalizationTests.cs @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherNormalizationTests +{ + [TestMethod] + public void Normalization_ShouldBeLengthPreserving_GermanEszett() + { + // "Straße" (6 chars) + // Standard "SS" expansion would change length to 7. 
+ // Our normalizer must preserve length. + var input = "Straße"; + var expectedLength = input.Length; + + // Case 1: Remove Diacritics = true + var normalized = Fold(input, removeDiacritics: true); + Assert.AreEqual(expectedLength, normalized.Length, "Normalization (removeDiacritics=true) must be length preserving for 'Straße'"); + + // Verify expected mapping: ß -> ß (length 1) + Assert.AreEqual("STRAßE", normalized); + + // Case 2: Remove Diacritics = false + var normalizedKeep = Fold(input, removeDiacritics: false); + Assert.AreEqual(expectedLength, normalizedKeep.Length, "Normalization (removeDiacritics=false) must be length preserving for 'Straße'"); + + // ß maps to ß in invariant culture (length 1) + Assert.AreEqual("STRAßE", normalizedKeep); + } + + [TestMethod] + public void Normalization_ShouldBeLengthPreserving_CommonDiacritics() + { + var input = "Crème Brûlée"; + var expected = "CREME BRULEE"; + + var normalized = Fold(input, removeDiacritics: true); + + Assert.AreEqual(input.Length, normalized.Length); + Assert.AreEqual(expected, normalized); + } + + [TestMethod] + public void Normalization_ShouldBeLengthPreserving_MixedComposed() + { + // "Ångström" -> A + ring, o + umlaut /* #no-spell-check-line */ + var input = "Ångström"; /* #no-spell-check-line */ + var expected = "ANGSTROM"; + + var normalized = Fold(input, removeDiacritics: true); + + Assert.AreEqual(input.Length, normalized.Length); + Assert.AreEqual(expected, normalized); + } + + [TestMethod] + public void Normalization_ShouldNormalizeSlashes() + { + var input = @"Folder\File.txt"; + var expected = "FOLDER/FILE.TXT"; + + var normalized = Fold(input, removeDiacritics: true); + + Assert.AreEqual(input.Length, normalized.Length); + Assert.AreEqual(expected, normalized); + } + + private string Fold(string input, bool removeDiacritics) + { + return FuzzyStringMatcher.Folding.FoldForComparison(input, removeDiacritics); + } +} diff --git 
a/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherUnicodeGarbageTests.cs b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherUnicodeGarbageTests.cs new file mode 100644 index 0000000000..4532f19b71 --- /dev/null +++ b/src/modules/cmdpal/Tests/Microsoft.CommandPalette.Extensions.Toolkit.UnitTests/FuzzyMatcherUnicodeGarbageTests.cs @@ -0,0 +1,223 @@ +// Copyright (c) Microsoft Corporation +// The Microsoft Corporation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests; + +[TestClass] +public sealed class FuzzyMatcherUnicodeGarbageTests +{ + [TestMethod] + public void UnpairedHighSurrogateInNeedle_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "\uD83D"; // high surrogate (unpaired) + const string haystack = "abc"; + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void UnpairedLowSurrogateInNeedle_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "\uDC00"; // low surrogate (unpaired) + const string haystack = "abc"; + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void UnpairedHighSurrogateInHaystack_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "a"; + const string haystack = "a\uD83D" + "bc"; // inject unpaired high surrogate + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void UnpairedLowSurrogateInHaystack_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "a"; + const string haystack = "a\uDC00" + 
"bc"; // inject unpaired low surrogate + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void MixedSurrogatesAndMarks_RemoveDiacritics_ShouldNotThrow() + { + // "Garbage smoothie": unpaired surrogate + combining mark + emoji surrogate pair + const string needle = "a\uD83D\u0301"; // 'a' + unpaired high surrogate + combining acute + const string haystack = "a\u0301 \U0001F600"; // 'a' + combining acute + space + 😀 (valid pair) + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void ValidEmojiSurrogatePair_RemoveDiacritics_ShouldNotThrow_AndCanMatch() + { + // 😀 U+1F600 encoded as surrogate pair in UTF-16 + const string needle = "\U0001F600"; + const string haystack = "x \U0001F600 y"; + + var result = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + + // Keep assertions minimal: just ensure it doesn't act like "no match". + // If your API returns score=0 for no match, this is stable. + Assert.IsTrue(result.Score > 0, "Expected emoji to produce a match score > 0."); + Assert.IsTrue(result.Positions.Count > 0, "Expected at least one matched position."); + } + + [TestMethod] + public void DiacriticStripping_StillWorks_OnBMPNonSurrogate() + { + // This is a regression guard: we fixed surrogates; don't break diacritic stripping. + // "é" should fold like "e" when removeDiacritics=true. 
+ const string needle = "cafe"; + const string haystack = "CAFÉ"; + + var withDiacriticsRemoved = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + + var withoutDiacriticsRemoved = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: false); + + Assert.IsTrue(withDiacriticsRemoved.Score >= withoutDiacriticsRemoved.Score, "Removing diacritics should not make matching worse for 'CAFÉ' vs 'cafe'."); + Assert.IsTrue(withDiacriticsRemoved.Score > 0, "Expected a match when diacritics are removed."); + } + + [TestMethod] + public void RandomUtf16Garbage_RemoveDiacritics_ShouldNotThrow() + { + // Deterministic pseudo-random "UTF-16 garbage", including surrogates. + // This is a quick fuzz-lite test that’s stable across runs. + var s1 = MakeDeterministicGarbage(seed: 1234, length: 512); + var s2 = MakeDeterministicGarbage(seed: 5678, length: 1024); + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + s1, + s2, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void RandomUtf16Garbage_NoDiacritics_ShouldNotThrow() + { + var s1 = MakeDeterministicGarbage(seed: 42, length: 512); + var s2 = MakeDeterministicGarbage(seed: 43, length: 1024); + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + s1, + s2, + allowNonContiguousMatches: true, + removeDiacritics: false); + } + + [TestMethod] + public void HighSurrogateAtEndOfHaystack_RemoveDiacritics_ShouldNotThrow() + { + const string needle = "a"; + const string haystack = "abc\uD83D"; // Ends with high surrogate + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void ComplexEmojiSequence_RemoveDiacritics_ShouldNotThrow() + { + // Family: Man, Woman, Girl, Boy + // U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466 + const string 
needle = "\U0001F468"; + const string haystack = "Info: \U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466 family"; + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + [TestMethod] + public void NullOrEmptyInputs_ShouldNotThrow() + { + // Empty needle + var result1 = FuzzyStringMatcher.ScoreFuzzyWithPositions(string.Empty, "abc", true, true); + Assert.AreEqual(0, result1.Score); + + // Empty haystack + var result2 = FuzzyStringMatcher.ScoreFuzzyWithPositions("abc", string.Empty, true, true); + Assert.AreEqual(0, result2.Score); + + // Null haystack + var result3 = FuzzyStringMatcher.ScoreFuzzyWithPositions("abc", null!, true, true); + Assert.AreEqual(0, result3.Score); + } + + [TestMethod] + public void VeryLongStrings_ShouldNotThrow() + { + var needle = new string('a', 100); + var haystack = new string('b', 10000) + needle + new string('c', 10000); + + _ = FuzzyStringMatcher.ScoreFuzzyWithPositions( + needle, + haystack, + allowNonContiguousMatches: true, + removeDiacritics: true); + } + + private static string MakeDeterministicGarbage(int seed, int length) + { + // LCG for deterministic generation without Random’s platform/version surprises. + var x = (uint)seed; + var chars = length <= 2048 ? stackalloc char[length] : new char[length]; + + for (var i = 0; i < chars.Length; i++) + { + // LCG: x = (a*x + c) mod 2^32 + x = unchecked((1664525u * x) + 1013904223u); + + // Take top 16 bits as UTF-16 code unit (includes surrogates). 
+ chars[i] = (char)(x >> 16); + } + + return new string(chars); + } +} diff --git a/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs b/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs index 40491970b3..a4b7084555 100644 --- a/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs +++ b/src/modules/cmdpal/extensionsdk/Microsoft.CommandPalette.Extensions.Toolkit/FuzzyStringMatcher.cs @@ -643,7 +643,7 @@ public static class FuzzyStringMatcher // ============================================================ // Folding: slash normalization + upper case + optional diacritics stripping - private static class Folding + internal static class Folding { // Cache maps an upper case char to its diacritics-stripped upper case char. // '\0' means "not cached yet". @@ -820,6 +820,13 @@ public static class FuzzyStringMatcher return upper; } + // Emoji and other astral symbols come through as surrogate pairs in UTF-16. + // We process char-by-char, so never try to normalize a lone surrogate. + if (char.IsSurrogate(upper)) + { + return upper; + } + var cached = StripCacheUpper[upper]; if (cached != '\0') {