diff --git a/.github/actions/spell-check/allow/code.txt b/.github/actions/spell-check/allow/code.txt
index a7d02dcb21..3e7341d5c3 100644
--- a/.github/actions/spell-check/allow/code.txt
+++ b/.github/actions/spell-check/allow/code.txt
@@ -330,6 +330,9 @@ HHH
 riday
 YYY
 
+# Unicode
+precomposed
+
 # GitHub issue/PR commands
 azp
 feedbackhub
diff --git a/src/modules/powerrename/lib/PowerRenameRegEx.cpp b/src/modules/powerrename/lib/PowerRenameRegEx.cpp
index 567df48606..aabf838a7a 100644
--- a/src/modules/powerrename/lib/PowerRenameRegEx.cpp
+++ b/src/modules/powerrename/lib/PowerRenameRegEx.cpp
@@ -11,6 +11,48 @@
 using std::conditional_t;
 using std::regex_error;
 
+/// <summary>
+/// Sanitizes the input string by replacing non-breaking spaces (U+00A0) with regular
+/// spaces and normalizes it to Unicode NFC (precomposed) form.
+/// </summary>
+/// <param name="input">The input wide string to sanitize and normalize. If empty, it is
+/// returned unchanged.</param>
+/// <returns>The sanitized, NFC-normalized string with no trailing null characters. If
+/// normalization fails, the sanitized string (non-breaking spaces replaced) is returned
+/// as-is.</returns>
+static std::wstring SanitizeAndNormalize(const std::wstring& input)
+{
+    if (input.empty())
+    {
+        return input;
+    }
+
+    std::wstring sanitized = input;
+    // Replace non-breaking spaces (U+00A0) with regular spaces (U+0020).
+    std::replace(sanitized.begin(), sanitized.end(), L'\u00A0', L' ');
+
+    // With an empty destination NormalizeString returns only an ESTIMATE of the required
+    // size (terminator included); the exact length comes from the call that writes.
+    int size = NormalizeString(NormalizationC, sanitized.c_str(), -1, nullptr, 0);
+    std::wstring normalized;
+    for (int attempt = 0; size > 0 && attempt < 4; ++attempt)
+    {
+        normalized.resize(size);
+        const int written = NormalizeString(NormalizationC, sanitized.c_str(), -1, &normalized[0], size);
+        if (written > 0)
+        {
+            // The count includes the null terminator; keep only the actual characters.
+            normalized.resize(written - 1);
+            return normalized;
+        }
+
+        // ERROR_INSUFFICIENT_BUFFER reports the negated size to retry with; else give up.
+        size = (GetLastError() == ERROR_INSUFFICIENT_BUFFER) ? -written : 0;
+    }
+
+    return sanitized; // Normalization failed: fall back to the sanitized text.
+}
+
 IFACEMETHODIMP_(ULONG)
 CPowerRenameRegEx::AddRef()
 {
@@ -94,18 +136,20 @@ IFACEMETHODIMP CPowerRenameRegEx::PutSearchTerm(_In_ PCWSTR searchTerm, bool for
     HRESULT hr = S_OK;
     if (searchTerm)
     {
+        std::wstring normalizedSearchTerm = SanitizeAndNormalize(searchTerm);
+
         CSRWExclusiveAutoLock lock(&m_lock);
-        if (m_searchTerm == nullptr || lstrcmp(searchTerm, m_searchTerm) != 0)
+        if (m_searchTerm == nullptr || lstrcmp(normalizedSearchTerm.c_str(), m_searchTerm) != 0)
         {
             changed = true;
             CoTaskMemFree(m_searchTerm);
-            if (lstrcmp(searchTerm, L"") == 0)
+            if (normalizedSearchTerm.empty())
             {
                 m_searchTerm = NULL;
             }
             else
             {
-                hr = SHStrDup(searchTerm, &m_searchTerm);
+                hr = SHStrDup(normalizedSearchTerm.c_str(), &m_searchTerm);
             }
         }
     }
@@ -238,17 +282,19 @@ IFACEMETHODIMP CPowerRenameRegEx::PutReplaceTerm(_In_ PCWSTR replaceTerm, bool f
     HRESULT hr = S_OK;
     if (replaceTerm)
     {
+        std::wstring normalizedReplaceTerm = SanitizeAndNormalize(replaceTerm);
+
         CSRWExclusiveAutoLock lock(&m_lock);
-        if (m_replaceTerm == nullptr || lstrcmp(replaceTerm, m_RawReplaceTerm.c_str()) != 0)
+        if (m_replaceTerm == nullptr || lstrcmp(normalizedReplaceTerm.c_str(), m_RawReplaceTerm.c_str()) != 0)
         {
             changed = true;
             CoTaskMemFree(m_replaceTerm);
-            m_RawReplaceTerm = replaceTerm;
+            m_RawReplaceTerm = normalizedReplaceTerm;
 
             if ((m_flags & RandomizeItems) || (m_flags & EnumerateItems))
                 hr = _OnEnumerateOrRandomizeItemsChanged();
             else
-                hr = SHStrDup(replaceTerm, &m_replaceTerm);
+                hr = SHStrDup(normalizedReplaceTerm.c_str(), &m_replaceTerm);
         }
     }
 
@@ -397,7 +443,10 @@ HRESULT CPowerRenameRegEx::Replace(_In_ PCWSTR source, _Outptr_ PWSTR* result, u
     {
         return hr;
     }
-    std::wstring res = source;
+
+    std::wstring normalizedSource = SanitizeAndNormalize(source);
+
+    std::wstring res = normalizedSource;
     try
     {
         // TODO: creating the regex could be costly. May want to cache this.
@@ -438,9 +487,8 @@ HRESULT CPowerRenameRegEx::Replace(_In_ PCWSTR source, _Outptr_ PWSTR* result, u
             }
         }
 
-        std::wstring sourceToUse;
+        std::wstring sourceToUse = normalizedSource;
         sourceToUse.reserve(MAX_PATH);
-        sourceToUse = source;
 
         std::wstring searchTerm(m_searchTerm);
         std::wstring replaceTerm;
@@ -536,7 +584,7 @@ HRESULT CPowerRenameRegEx::Replace(_In_ PCWSTR source, _Outptr_ PWSTR* result, u
             replaceTerm = regex_replace(replaceTerm, zeroGroupRegex, L"$1$$$0");
             replaceTerm = regex_replace(replaceTerm, otherGroupsRegex, L"$1$0$4");
 
-            res = RegexReplaceDispatch[_useBoostLib](source, m_searchTerm, replaceTerm, m_flags & MatchAllOccurrences, isCaseInsensitive);
+            res = RegexReplaceDispatch[_useBoostLib](sourceToUse, m_searchTerm, replaceTerm, m_flags & MatchAllOccurrences, isCaseInsensitive);
 
             // Use regex search to determine if a match exists. This is the basis for incrementing
             // the counter.
@@ -669,17 +717,17 @@ PowerRenameLib::MetadataType CPowerRenameRegEx::_GetMetadataTypeFromFlags() cons
 {
     if (m_flags & MetadataSourceXMP)
         return PowerRenameLib::MetadataType::XMP;
-    
+
     // Default to EXIF
     return PowerRenameLib::MetadataType::EXIF;
 }
 
-// Interface method implementation 
+// Interface method implementation
 IFACEMETHODIMP CPowerRenameRegEx::GetMetadataType(_Out_ PowerRenameLib::MetadataType* metadataType)
 {
     if (metadataType == nullptr)
         return E_POINTER;
-    
+
     *metadataType = _GetMetadataTypeFromFlags();
     return S_OK;
 }
@@ -689,5 +737,3 @@ PowerRenameLib::MetadataType CPowerRenameRegEx::GetMetadataType() const
 {
     return _GetMetadataTypeFromFlags();
 }
-
-
diff --git a/src/modules/powerrename/unittests/CommonRegExTests.h b/src/modules/powerrename/unittests/CommonRegExTests.h
index 1b0ad30b92..4dc078e9b1 100644
--- a/src/modules/powerrename/unittests/CommonRegExTests.h
+++ b/src/modules/powerrename/unittests/CommonRegExTests.h
@@ -647,6 +647,54 @@ TEST_METHOD(VerifyCounterIncrementsWhenResultIsUnchanged)
     CoTaskMemFree(result);
 }
 
+// Checks, for the given flags, that Replace matches across Unicode forms and NBSP/space.
+void VerifyNormalizationHelper(DWORD flags)
+{
+    CComPtr<IPowerRenameRegEx> renameRegEx;
+    Assert::IsTrue(CPowerRenameRegEx::s_CreateInstance(&renameRegEx) == S_OK);
+    Assert::IsTrue(renameRegEx->PutFlags(flags) == S_OK);
+
+    // 1. Unicode Normalization: NFD source with NFC search term.
+    PWSTR result = nullptr;
+    unsigned long index = 0;
+
+    // Source: "Test" + U+0438 (Cyrillic small letter i) + U+0306 (combining breve).
+    std::wstring sourceNFD = L"Test\u0438\u0306";
+    // Search: "Test" + U+0439 (Cyrillic small letter short i, the precomposed form).
+    std::wstring searchNFC = L"Test\u0439";
+
+    // A match should occur despite different normalization forms.
+    Assert::IsTrue(renameRegEx->PutSearchTerm(searchNFC.c_str()) == S_OK);
+    Assert::IsTrue(renameRegEx->PutReplaceTerm(L"Match") == S_OK);
+    Assert::IsTrue(renameRegEx->Replace(sourceNFD.c_str(), &result, index) == S_OK);
+    Assert::AreEqual(L"Match", result, L"Failed to match NFD source with NFC search term.");
+    CoTaskMemFree(result);
+
+    // 2. Whitespace Normalization: non-breaking space versus regular space ("Match" is reused).
+    result = nullptr;
+    index = 0;
+
+    // Source: "Hello" + non-breaking space (U+00A0) + "World".
+    std::wstring sourceNBSP = L"Hello\u00A0World";
+    // Search: "Hello" + regular space (U+0020) + "World".
+    std::wstring searchSpace = L"Hello World";
+
+    Assert::IsTrue(renameRegEx->PutSearchTerm(searchSpace.c_str()) == S_OK);
+    Assert::IsTrue(renameRegEx->Replace(sourceNBSP.c_str(), &result, index) == S_OK);
+    Assert::AreEqual(L"Match", result, L"Failed to match non-breaking space source with regular space search term.");
+    CoTaskMemFree(result);
+}
+
+TEST_METHOD(VerifyUnicodeAndWhitespaceNormalizationSimpleSearch)
+{
+    VerifyNormalizationHelper(0);
+}
+
+TEST_METHOD(VerifyUnicodeAndWhitespaceNormalizationRegex)
+{
+    VerifyNormalizationHelper(UseRegularExpressions);
+}
+
 #ifndef TESTS_PARTIAL
 };
 }