CmdPal: Improve fuzzy matcher Unicode and emoji robustness (#45275)

## Summary of the Pull Request

Add comprehensive unit tests for emoji, ZWJ sequences, skin tone
modifiers, and UTF-16 edge cases (unpaired surrogates, combining marks,
random garbage). Update matcher logic to skip normalization of lone
surrogates, preventing errors with malformed Unicode. Expand comparison
test data to cover emoji scenarios. Adds regression guards for diacritic
handling and surrogate processing.

Fixes #45246, a regression introduced in #44809.

<!-- Please review the items on the PR checklist before submitting-->
## PR Checklist

- [x] Closes: #45246
<!-- - [ ] Closes: #yyy (add separate lines for additional resolved
issues) -->
- [ ] **Communication:** I've discussed this with core contributors
already. If the work hasn't been agreed, this work might be rejected
- [ ] **Tests:** Added/updated and all pass
- [ ] **Localization:** All end-user-facing strings can be localized
- [ ] **Dev docs:** Added/updated
- [ ] **New binaries:** Added on the required places
- [ ] [JSON for
signing](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ESRPSigning_core.json)
for new binaries
- [ ] [WXS for
installer](https://github.com/microsoft/PowerToys/blob/main/installer/PowerToysSetup/Product.wxs)
for new binaries and localization folder
- [ ] [YML for CI
pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ci/templates/build-powertoys-steps.yml)
for new test projects
- [ ] [YML for signed
pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/release.yml)
- [ ] **Documentation updated:** If checked, please file a pull request
on [our docs
repo](https://github.com/MicrosoftDocs/windows-uwp/tree/docs/hub/powertoys)
and link it here: #xxx

<!-- Provide a more detailed description of the PR, other things fixed,
or any additional comments/features here -->
## Detailed Description of the Pull Request / Additional comments

<!-- Describe how you validated the behavior. Add automated tests
wherever possible, but list manual validation steps taken as well -->
## Validation Steps Performed
This commit is contained in:
Jiří Polášek
2026-02-02 19:30:00 +01:00
committed by GitHub
parent 18c6d6b0f3
commit 49cc504d94
7 changed files with 427 additions and 3 deletions

View File

@@ -117,7 +117,11 @@ public class FuzzyMatcherComparisonTests
["_a", "_a"],
["a_", "a_"],
["-a", "-a"],
["a-", "a-"]
["a-", "a-"],
["🐿️", "🐿️"], // Squirrel emoji
["\U0001F44D", "\U0001F44D\U0001F3FB"], // Base thumbs-up vs thumbs-up with LIGHT skin tone modifier
["\U0001F44D\U0001F3FB", "\U0001F44D\U0001F3FB"], // Thumbs-up with LIGHT skin tone vs itself (exact same sequence)
["\U0001F44D\U0001F3FB", "\U0001F44D\U0001F3FF"], // Thumbs-up with LIGHT skin tone vs thumbs-up with DARK skin tone
];
[TestMethod]

View File

@@ -0,0 +1,29 @@
// Copyright (c) Microsoft Corporation
// The Microsoft Corporation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests;
[TestClass]
public sealed class FuzzyMatcherComplexEmojiTests
{
    /// <summary>
    /// Describes the desired outcome when two thumbs-up emoji differ only in their
    /// skin tone modifier: the shared base emoji should still produce a partial match.
    /// The test is currently skipped via <c>[Ignore]</c>.
    /// </summary>
    [TestMethod]
    [Ignore("For now this is not supported")]
    public void Mismatch_DifferentSkinTone_PartialMatch()
    {
        // Query is thumbs-up + light skin tone; candidate is thumbs-up + dark skin tone.
        const string lightThumbsUp = "👍🏻";
        const string darkThumbsUp = "👍🏿";

        var match = FuzzyStringMatcher.ScoreFuzzyWithPositions(lightThumbsUp, darkThumbsUp, allowNonContiguousMatches: true);

        // The common base emoji alone should be enough for a positive score...
        Assert.IsTrue(match.Score > 0, "Expected partial match based on base emoji");

        // ...and only its two UTF-16 code units (one surrogate pair) should be reported.
        Assert.AreEqual(2, match.Positions.Count, "Expected match on base emoji only");
    }
}

View File

@@ -0,0 +1,83 @@
// Copyright (c) Microsoft Corporation
// The Microsoft Corporation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests;
[TestClass]
public sealed class FuzzyMatcherEmojiTests
{
    /// <summary>
    /// A single emoji used as the query is found inside surrounding ASCII text.
    /// </summary>
    [TestMethod]
    public void ExactMatch_SimpleEmoji_ReturnsScore()
    {
        const string rocket = "🚀";
        const string sentence = "Launch 🚀 sequence";

        var match = FuzzyStringMatcher.ScoreFuzzyWithPositions(rocket, sentence, allowNonContiguousMatches: true);

        Assert.IsTrue(match.Score > 0, "Expected match for simple emoji");

        // The rocket is one surrogate pair, i.e. two UTF-16 code units.
        Assert.AreEqual(2, match.Positions.Count, "Expected 2 matched characters positions for the emoji");
    }

    /// <summary>
    /// An emoji followed by a skin tone modifier matches the identical sequence in the candidate.
    /// </summary>
    [TestMethod]
    public void ExactMatch_SkinTone_ReturnsScore()
    {
        const string thumbsUpMedium = "👍🏽"; // Medium skin tone
        const string sentence = "Thumbs up 👍🏽 here";

        var match = FuzzyStringMatcher.ScoreFuzzyWithPositions(thumbsUpMedium, sentence, allowNonContiguousMatches: true);

        Assert.IsTrue(match.Score > 0, "Expected match for emoji with skin tone");

        // U+1F44D (2 code units) + U+1F3FD (2 code units) = 4 code units.
        Assert.AreEqual(4, match.Positions.Count, "Expected 4 matched characters positions for the emoji with modifier");
    }

    /// <summary>
    /// A multi-person emoji built from a zero-width-joiner (ZWJ) sequence produces a match.
    /// </summary>
    [TestMethod]
    public void ZWJSequence_Family_Match()
    {
        const string family = "👨‍👩‍👧‍👦"; // Family: Man, Woman, Girl, Boy
        const string sentence = "Emoji 👨‍👩‍👧‍👦 Test";

        var match = FuzzyStringMatcher.ScoreFuzzyWithPositions(family, sentence, allowNonContiguousMatches: true);

        Assert.IsTrue(match.Score > 0, "Expected match for ZWJ sequence");

        // The sequence is 7 code points: Man (2) + ZWJ (1) + Woman (2) + ZWJ (1) + Girl (2) + ZWJ (1) + Boy (2),
        // which adds up to 11 UTF-16 code units. The exact count is deliberately not pinned here;
        // the test only requires that some positions are reported.
        Assert.IsTrue(match.Positions.Count > 0);
    }

    /// <summary>
    /// A flag emoji (a pair of regional indicator symbols) matches itself inside the candidate.
    /// </summary>
    [TestMethod]
    public void Flags_Match()
    {
        const string usFlag = "🇺🇸"; // US Flag (Regional Indicator U + Regional Indicator S)
        const string sentence = "USA 🇺🇸";

        var match = FuzzyStringMatcher.ScoreFuzzyWithPositions(usFlag, sentence, allowNonContiguousMatches: true);

        Assert.IsTrue(match.Score > 0, "Expected match for flag emoji");

        // Two code points, each encoded as a surrogate pair:
        //   U+1F1FA (REGIONAL INDICATOR SYMBOL LETTER U) -> 2 code units
        //   U+1F1F8 (REGIONAL INDICATOR SYMBOL LETTER S) -> 2 code units
        // for a total of 4 code units.
        Assert.AreEqual(4, match.Positions.Count);
    }

    /// <summary>
    /// A query that interleaves letters with an emoji still scores against mixed text.
    /// </summary>
    [TestMethod]
    public void Emoji_MixedWithText_Search()
    {
        const string query = "t🌮o"; // "t" + taco + "o"
        const string sentence = "taco 🌮 on tuesday";

        var match = FuzzyStringMatcher.ScoreFuzzyWithPositions(query, sentence, allowNonContiguousMatches: true);

        Assert.IsTrue(match.Score > 0);
    }
}

View File

@@ -0,0 +1,78 @@
// Copyright (c) Microsoft Corporation
// The Microsoft Corporation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests;
[TestClass]
public sealed class FuzzyMatcherNormalizationTests
{
    /// <summary>
    /// Folding "Straße" must not expand the sharp s into two characters, with or
    /// without diacritic removal.
    /// </summary>
    [TestMethod]
    public void Normalization_ShouldBeLengthPreserving_GermanEszett()
    {
        // "Straße" (6 chars)
        // Standard "SS" expansion would change length to 7.
        // Our normalizer must preserve length.
        var input = "Straße";
        var expectedLength = input.Length;

        // Case 1: Remove Diacritics = true
        var normalized = Fold(input, removeDiacritics: true);
        Assert.AreEqual(expectedLength, normalized.Length, "Normalization (removeDiacritics=true) must be length preserving for 'Straße'");

        // Verify expected mapping: ß -> ß (length 1)
        Assert.AreEqual("STRAßE", normalized);

        // Case 2: Remove Diacritics = false
        var normalizedKeep = Fold(input, removeDiacritics: false);
        Assert.AreEqual(expectedLength, normalizedKeep.Length, "Normalization (removeDiacritics=false) must be length preserving for 'Straße'");

        // The expected string keeps ß as a single character here as well.
        Assert.AreEqual("STRAßE", normalizedKeep);
    }

    /// <summary>
    /// Accented Latin letters fold to their upper-case base letters without changing the length.
    /// </summary>
    [TestMethod]
    public void Normalization_ShouldBeLengthPreserving_CommonDiacritics()
    {
        var input = "Crème Brûlée";
        var expected = "CREME BRULEE";

        var normalized = Fold(input, removeDiacritics: true);

        Assert.AreEqual(input.Length, normalized.Length);
        Assert.AreEqual(expected, normalized);
    }

    /// <summary>
    /// Letters carrying a ring or an umlaut fold to their upper-case base letters without changing the length.
    /// </summary>
    [TestMethod]
    public void Normalization_ShouldBeLengthPreserving_MixedComposed()
    {
        // "Ångström" -> A + ring, o + umlaut /* #no-spell-check-line */
        var input = "Ångström"; /* #no-spell-check-line */
        var expected = "ANGSTROM";

        var normalized = Fold(input, removeDiacritics: true);

        Assert.AreEqual(input.Length, normalized.Length);
        Assert.AreEqual(expected, normalized);
    }

    /// <summary>
    /// Backslashes are expected to come out as forward slashes, alongside upper-casing.
    /// </summary>
    [TestMethod]
    public void Normalization_ShouldNormalizeSlashes()
    {
        var input = @"Folder\File.txt";
        var expected = "FOLDER/FILE.TXT";

        var normalized = Fold(input, removeDiacritics: true);

        Assert.AreEqual(input.Length, normalized.Length);
        Assert.AreEqual(expected, normalized);
    }

    /// <summary>
    /// Thin wrapper around <c>FuzzyStringMatcher.Folding.FoldForComparison</c> so the tests read tersely.
    /// Static because it touches no instance state.
    /// </summary>
    /// <param name="input">Text to fold.</param>
    /// <param name="removeDiacritics">Forwarded unchanged to the folding routine.</param>
    /// <returns>The folded string returned by the matcher's folding routine.</returns>
    private static string Fold(string input, bool removeDiacritics)
        => FuzzyStringMatcher.Folding.FoldForComparison(input, removeDiacritics);
}

View File

@@ -0,0 +1,223 @@
// Copyright (c) Microsoft Corporation
// The Microsoft Corporation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace Microsoft.CommandPalette.Extensions.Toolkit.UnitTests;
[TestClass]
public sealed class FuzzyMatcherUnicodeGarbageTests
{
    /// <summary>A lone high surrogate as the whole needle must not make the matcher throw.</summary>
    [TestMethod]
    public void UnpairedHighSurrogateInNeedle_RemoveDiacritics_ShouldNotThrow()
    {
        const string loneHighSurrogate = "\uD83D"; // high surrogate (unpaired)

        ScoreAndDiscard(loneHighSurrogate, "abc", removeDiacritics: true);
    }

    /// <summary>A lone low surrogate as the whole needle must not make the matcher throw.</summary>
    [TestMethod]
    public void UnpairedLowSurrogateInNeedle_RemoveDiacritics_ShouldNotThrow()
    {
        const string loneLowSurrogate = "\uDC00"; // low surrogate (unpaired)

        ScoreAndDiscard(loneLowSurrogate, "abc", removeDiacritics: true);
    }

    /// <summary>A lone high surrogate in the middle of the haystack must not make the matcher throw.</summary>
    [TestMethod]
    public void UnpairedHighSurrogateInHaystack_RemoveDiacritics_ShouldNotThrow()
    {
        const string corruptedText = "a\uD83D" + "bc"; // inject unpaired high surrogate

        ScoreAndDiscard("a", corruptedText, removeDiacritics: true);
    }

    /// <summary>A lone low surrogate in the middle of the haystack must not make the matcher throw.</summary>
    [TestMethod]
    public void UnpairedLowSurrogateInHaystack_RemoveDiacritics_ShouldNotThrow()
    {
        const string corruptedText = "a\uDC00" + "bc"; // inject unpaired low surrogate

        ScoreAndDiscard("a", corruptedText, removeDiacritics: true);
    }

    /// <summary>
    /// Mixes an unpaired surrogate, a combining mark and a valid surrogate pair in one scenario.
    /// </summary>
    [TestMethod]
    public void MixedSurrogatesAndMarks_RemoveDiacritics_ShouldNotThrow()
    {
        // "Garbage smoothie": unpaired surrogate + combining mark + emoji surrogate pair
        const string query = "a\uD83D\u0301"; // 'a' + unpaired high surrogate + combining acute
        const string text = "a\u0301 \U0001F600"; // 'a' + combining acute + space + 😀 (valid pair)

        ScoreAndDiscard(query, text, removeDiacritics: true);
    }

    /// <summary>A well-formed emoji surrogate pair still matches when diacritics are removed.</summary>
    [TestMethod]
    public void ValidEmojiSurrogatePair_RemoveDiacritics_ShouldNotThrow_AndCanMatch()
    {
        // 😀 U+1F600 encoded as surrogate pair in UTF-16
        const string grinningFace = "\U0001F600";
        const string text = "x \U0001F600 y";

        var match = FuzzyStringMatcher.ScoreFuzzyWithPositions(
            grinningFace,
            text,
            allowNonContiguousMatches: true,
            removeDiacritics: true);

        // Deliberately loose assertions: the only requirement is that this is not reported as "no match".
        Assert.IsTrue(match.Score > 0, "Expected emoji to produce a match score > 0.");
        Assert.IsTrue(match.Positions.Count > 0, "Expected at least one matched position.");
    }

    /// <summary>
    /// Regression guard: the surrogate handling must not break diacritic stripping for
    /// ordinary (non-surrogate) characters.
    /// </summary>
    [TestMethod]
    public void DiacriticStripping_StillWorks_OnBMPNonSurrogate()
    {
        // "é" should fold like "e" when removeDiacritics=true.
        const string plainQuery = "cafe";
        const string accentedText = "CAFÉ";

        var stripped = FuzzyStringMatcher.ScoreFuzzyWithPositions(
            plainQuery,
            accentedText,
            allowNonContiguousMatches: true,
            removeDiacritics: true);

        var unstripped = FuzzyStringMatcher.ScoreFuzzyWithPositions(
            plainQuery,
            accentedText,
            allowNonContiguousMatches: true,
            removeDiacritics: false);

        Assert.IsTrue(stripped.Score >= unstripped.Score, "Removing diacritics should not make matching worse for 'CAFÉ' vs 'cafe'.");
        Assert.IsTrue(stripped.Score > 0, "Expected a match when diacritics are removed.");
    }

    /// <summary>
    /// Fuzz-lite check with diacritic removal enabled: two reproducible pseudo-random
    /// UTF-16 strings (surrogates included) are matched against each other.
    /// </summary>
    [TestMethod]
    public void RandomUtf16Garbage_RemoveDiacritics_ShouldNotThrow()
    {
        // Fixed seeds keep the input stable across runs.
        var garbageNeedle = MakeDeterministicGarbage(seed: 1234, length: 512);
        var garbageHaystack = MakeDeterministicGarbage(seed: 5678, length: 1024);

        ScoreAndDiscard(garbageNeedle, garbageHaystack, removeDiacritics: true);
    }

    /// <summary>Same fuzz-lite check as above, but with diacritic removal disabled.</summary>
    [TestMethod]
    public void RandomUtf16Garbage_NoDiacritics_ShouldNotThrow()
    {
        var garbageNeedle = MakeDeterministicGarbage(seed: 42, length: 512);
        var garbageHaystack = MakeDeterministicGarbage(seed: 43, length: 1024);

        ScoreAndDiscard(garbageNeedle, garbageHaystack, removeDiacritics: false);
    }

    /// <summary>A haystack whose last code unit is a dangling high surrogate must not make the matcher throw.</summary>
    [TestMethod]
    public void HighSurrogateAtEndOfHaystack_RemoveDiacritics_ShouldNotThrow()
    {
        const string truncatedText = "abc\uD83D"; // Ends with high surrogate

        ScoreAndDiscard("a", truncatedText, removeDiacritics: true);
    }

    /// <summary>Searching for one member of a ZWJ family sequence must not make the matcher throw.</summary>
    [TestMethod]
    public void ComplexEmojiSequence_RemoveDiacritics_ShouldNotThrow()
    {
        // Family: Man, Woman, Girl, Boy
        // U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466
        const string man = "\U0001F468";
        const string text = "Info: \U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466 family";

        ScoreAndDiscard(man, text, removeDiacritics: true);
    }

    /// <summary>Empty or null inputs are expected to score zero instead of throwing.</summary>
    [TestMethod]
    public void NullOrEmptyInputs_ShouldNotThrow()
    {
        // Empty needle
        var emptyNeedle = FuzzyStringMatcher.ScoreFuzzyWithPositions(string.Empty, "abc", true, true);
        Assert.AreEqual(0, emptyNeedle.Score);

        // Empty haystack
        var emptyHaystack = FuzzyStringMatcher.ScoreFuzzyWithPositions("abc", string.Empty, true, true);
        Assert.AreEqual(0, emptyHaystack.Score);

        // Null haystack
        var nullHaystack = FuzzyStringMatcher.ScoreFuzzyWithPositions("abc", null!, true, true);
        Assert.AreEqual(0, nullHaystack.Score);
    }

    /// <summary>A 100-character needle buried in a haystack of 20,100 characters must not make the matcher throw.</summary>
    [TestMethod]
    public void VeryLongStrings_ShouldNotThrow()
    {
        var longNeedle = new string('a', 100);
        var longHaystack = new string('b', 10000) + longNeedle + new string('c', 10000);

        ScoreAndDiscard(longNeedle, longHaystack, removeDiacritics: true);
    }

    /// <summary>
    /// Invokes the matcher with non-contiguous matching enabled and throws the result away.
    /// Used by the tests that only care that no exception escapes.
    /// </summary>
    /// <param name="needle">Search text handed to the matcher.</param>
    /// <param name="haystack">Candidate text handed to the matcher.</param>
    /// <param name="removeDiacritics">Forwarded unchanged to the matcher.</param>
    private static void ScoreAndDiscard(string needle, string haystack, bool removeDiacritics)
    {
        _ = FuzzyStringMatcher.ScoreFuzzyWithPositions(
            needle,
            haystack,
            allowNonContiguousMatches: true,
            removeDiacritics: removeDiacritics);
    }

    /// <summary>
    /// Produces a reproducible string of arbitrary UTF-16 code units from a linear
    /// congruential generator, so the output does not depend on <c>Random</c>'s
    /// platform- or version-specific behavior.
    /// </summary>
    /// <param name="seed">Initial generator state.</param>
    /// <param name="length">Number of UTF-16 code units to generate.</param>
    /// <returns>A string of <paramref name="length"/> code units, possibly containing unpaired surrogates.</returns>
    private static string MakeDeterministicGarbage(int seed, int length)
    {
        const int MaxStackChars = 2048;
        const uint Multiplier = 1664525u;
        const uint Increment = 1013904223u;

        var state = (uint)seed;

        // Small buffers live on the stack; larger ones fall back to a heap array.
        var buffer = length <= MaxStackChars ? stackalloc char[length] : new char[length];

        var index = 0;
        while (index < buffer.Length)
        {
            // LCG step: state = (a * state + c) mod 2^32
            state = unchecked((Multiplier * state) + Increment);

            // The top 16 bits become the next UTF-16 code unit (surrogates included).
            buffer[index] = (char)(state >> 16);
            index++;
        }

        return new string(buffer);
    }
}