From 7b0b284d40a1f6b938695b4eee3cd0f09e668b22 Mon Sep 17 00:00:00 2001 From: Shawn Yuan <128874481+shuaiyuanxx@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:49:28 +0800 Subject: [PATCH] [Advanced Paste] Introduced image-input handling (#44021) ## Summary of the Pull Request This pull request introduces significant enhancements to the AdvancedPaste module, enabling AI-powered clipboard transformations to support both text and image data (notably for image analysis and transformation tasks), and improving error handling and clipboard tracking. The changes update the service interfaces, data models, and processing logic to handle images alongside text, and refine how the application responds to errors and clipboard state changes. [image attachment] ## PR Checklist - [ ] Closes: #xxx - [x] **Communication:** I've discussed this with core contributors already. If the work hasn't been agreed, this work might be rejected - [x] **Tests:** Added/updated and all pass - [ ] **Localization:** All end-user-facing strings can be localized - [ ] **Dev docs:** Added/updated - [ ] **New binaries:** Added on the required places - [ ] [JSON for signing](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ESRPSigning_core.json) for new binaries - [ ] [WXS for installer](https://github.com/microsoft/PowerToys/blob/main/installer/PowerToysSetup/Product.wxs) for new binaries and localization folder - [ ] [YML for CI pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ci/templates/build-powertoys-steps.yml) for new test projects - [ ] [YML for signed pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/release.yml) - [ ] **Documentation updated:** If checked, please file a pull request on [our docs repo](https://github.com/MicrosoftDocs/windows-uwp/tree/docs/hub/powertoys) and link it here: #xxx ## Detailed Description of the Pull Request / Additional comments - `ICustomActionTransformService.TransformTextAsync` is replaced by `TransformAsync`, which takes an additional `byte[] imageBytes` argument; `PasteAIRequest` gains `ImageBytes` and `ImageMimeType`. - `DataPackageHelpers.GetImageAsPngBytesAsync` returns the clipboard image encoded as PNG bytes, or `null` when no bitmap is available. - `SemanticKernelPasteProvider` and `KernelServiceBase` attach the image to the user chat message as `ImageContent` when one is present. - `PasteFormats.CustomTextTransformation` now supports `ClipboardFormat.Text | ClipboardFormat.Image`. - `KernelServiceBase` rethrows a kernel error only when there is neither an assistant response nor usable output data; otherwise it logs a warning, and when the output package has no usable data it uses the assistant's last message as the text output. - `OptionsViewModel` compares `GetClipboardSequenceNumber()` with the last seen value to detect clipboard changes. ## Validation Steps Performed --------- Signed-off-by: Shawn Yuan Signed-off-by: Shawn Yuan
(from Dev Box) --- .../AIServiceBatchIntegrationTests.cs | 2 +- .../Helpers/DataPackageHelpers.cs | 18 +++++ .../AdvancedPaste/Helpers/NativeMethods.cs | 3 + .../AdvancedPaste/Models/PasteFormats.cs | 6 +- .../CustomActionTransformService.cs | 12 ++-- .../ICustomActionTransformService.cs | 2 +- .../Services/CustomActions/PasteAIRequest.cs | 4 ++ .../SemanticKernelPasteProvider.cs | 43 +++++++---- .../Services/KernelServiceBase.cs | 71 ++++++++++++++++--- .../Services/PasteFormatExecutor.cs | 2 +- .../ViewModels/OptionsViewModel.cs | 9 +++ 11 files changed, 139 insertions(+), 33 deletions(-) diff --git a/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs b/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs index 17b8139bad..1f7829a0bd 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs @@ -144,7 +144,7 @@ public sealed class AIServiceBatchIntegrationTests switch (format) { case PasteFormats.CustomTextTransformation: - var transformResult = await services.CustomActionTransformService.TransformTextAsync(batchTestInput.Prompt, batchTestInput.Clipboard, CancellationToken.None, progress); + var transformResult = await services.CustomActionTransformService.TransformAsync(batchTestInput.Prompt, batchTestInput.Clipboard, null, CancellationToken.None, progress); return DataPackageHelpers.CreateFromText(transformResult.Content ?? 
string.Empty); case PasteFormats.KernelQuery: diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs index 2cd7554a50..f5439aecf1 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs @@ -225,6 +225,24 @@ internal static class DataPackageHelpers internal static async Task GetHtmlContentAsync(this DataPackageView dataPackageView) => dataPackageView.Contains(StandardDataFormats.Html) ? await dataPackageView.GetHtmlFormatAsync() : string.Empty; + internal static async Task GetImageAsPngBytesAsync(this DataPackageView dataPackageView) + { + var bitmap = await dataPackageView.GetImageContentAsync(); + if (bitmap == null) + { + return null; + } + + using var pngStream = new InMemoryRandomAccessStream(); + var encoder = await BitmapEncoder.CreateAsync(BitmapEncoder.PngEncoderId, pngStream); + encoder.SetSoftwareBitmap(bitmap); + await encoder.FlushAsync(); + + using var memoryStream = new MemoryStream(); + await pngStream.AsStreamForRead().CopyToAsync(memoryStream); + return memoryStream.ToArray(); + } + internal static async Task GetImageContentAsync(this DataPackageView dataPackageView) { using var stream = await dataPackageView.GetImageStreamAsync(); diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs index 6e53e9b618..08293d4be0 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs @@ -166,5 +166,8 @@ namespace AdvancedPaste.Helpers [DllImport("Shlwapi.dll", SetLastError = true, CharSet = CharSet.Unicode)] internal static extern HResult AssocQueryString(AssocF flags, AssocStr str, string pszAssoc, string pszExtra, [Out] StringBuilder pszOut, [In][Out] ref uint pcchOut); + + 
[DllImport("user32.dll", SetLastError = true)] + internal static extern uint GetClipboardSequenceNumber(); } } diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs b/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs index 99243ebb5e..1479912e66 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs @@ -46,7 +46,7 @@ public enum PasteFormats CanPreview = true, SupportedClipboardFormats = ClipboardFormat.Image, IPCKey = AdvancedPasteAdditionalActions.PropertyNames.ImageToText, - KernelFunctionDescription = "Takes an image in the clipboard and extracts all text from it using OCR.")] + KernelFunctionDescription = "Takes an image from the clipboard and extracts text using OCR. This function is intended only for explicit text extraction or OCR requests.")] ImageToText, [PasteFormatMetadata( @@ -118,8 +118,8 @@ public enum PasteFormats IconGlyph = "\uE945", RequiresAIService = true, CanPreview = true, - SupportedClipboardFormats = ClipboardFormat.Text, - KernelFunctionDescription = "Takes input instructions and transforms clipboard text (not TXT files) with these input instructions, putting the result back on the clipboard. This uses AI to accomplish the task.", + SupportedClipboardFormats = ClipboardFormat.Text | ClipboardFormat.Image, + KernelFunctionDescription = "Takes user instructions and applies them to the current clipboard content (text or image). 
Use this function for image analysis, description, or transformation tasks beyond simple OCR.", RequiresPrompt = true)] CustomTextTransformation, } diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs index 57d55492a4..05cdcbe81f 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs @@ -40,15 +40,15 @@ namespace AdvancedPaste.Services.CustomActions this.userSettings = userSettings; } - public async Task TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress progress) + public async Task TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress progress) { var pasteConfig = userSettings?.PasteAIConfiguration; var providerConfig = BuildProviderConfig(pasteConfig); - return await TransformAsync(prompt, inputText, providerConfig, cancellationToken, progress); + return await TransformAsync(prompt, inputText, imageBytes, providerConfig, cancellationToken, progress); } - private async Task TransformAsync(string prompt, string inputText, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress progress) + private async Task TransformAsync(string prompt, string inputText, byte[] imageBytes, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress progress) { ArgumentNullException.ThrowIfNull(providerConfig); @@ -57,9 +57,9 @@ namespace AdvancedPaste.Services.CustomActions return new CustomActionTransformResult(string.Empty, AIServiceUsage.None); } - if (string.IsNullOrWhiteSpace(inputText)) + if (string.IsNullOrWhiteSpace(inputText) && imageBytes is null) { - Logger.LogWarning("Clipboard has no usable text data"); + 
Logger.LogWarning("Clipboard has no usable data"); return new CustomActionTransformResult(string.Empty, AIServiceUsage.None); } @@ -80,6 +80,8 @@ namespace AdvancedPaste.Services.CustomActions { Prompt = prompt, InputText = inputText, + ImageBytes = imageBytes, + ImageMimeType = imageBytes != null ? "image/png" : null, SystemPrompt = systemPrompt, }; diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs index 1c3ecb980c..564db3fdc5 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs @@ -12,6 +12,6 @@ namespace AdvancedPaste.Services.CustomActions { public interface ICustomActionTransformService { - Task TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress progress); + Task TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress progress); } } diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs index 0e15c93e05..96dabbfa05 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs @@ -12,6 +12,10 @@ namespace AdvancedPaste.Services.CustomActions public string InputText { get; init; } + public byte[] ImageBytes { get; init; } + + public string ImageMimeType { get; init; } + public string SystemPrompt { get; init; } public AIServiceUsage Usage { get; set; } = AIServiceUsage.None; diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs 
b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs index eb2f56e01f..636d2e3e78 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs @@ -64,21 +64,13 @@ namespace AdvancedPaste.Services.CustomActions var prompt = request.Prompt; var inputText = request.InputText; - if (string.IsNullOrWhiteSpace(prompt) || string.IsNullOrWhiteSpace(inputText)) + var imageBytes = request.ImageBytes; + + if (string.IsNullOrWhiteSpace(prompt) || (string.IsNullOrWhiteSpace(inputText) && imageBytes is null)) { - throw new ArgumentException("Prompt and input text must be provided", nameof(request)); + throw new ArgumentException("Prompt and input content must be provided", nameof(request)); } - var userMessageContent = $""" - User instructions: - {prompt} - - Clipboard Content: - {inputText} - - Output: - """; - var executionSettings = CreateExecutionSettings(); var kernel = CreateKernel(); var modelId = _config.Model; @@ -102,7 +94,32 @@ namespace AdvancedPaste.Services.CustomActions var chatHistory = new ChatHistory(); chatHistory.AddSystemMessage(systemPrompt); - chatHistory.AddUserMessage(userMessageContent); + + if (imageBytes != null) + { + var collection = new ChatMessageContentItemCollection(); + if (!string.IsNullOrWhiteSpace(inputText)) + { + collection.Add(new TextContent($"Clipboard Content:\n{inputText}")); + } + + collection.Add(new ImageContent(imageBytes, request.ImageMimeType ?? 
"image/png")); + collection.Add(new TextContent($"User instructions:\n{prompt}\n\nOutput:")); + chatHistory.AddUserMessage(collection); + } + else + { + var userMessageContent = $""" + User instructions: + {prompt} + + Clipboard Content: + {inputText} + + Output: + """; + chatHistory.AddUserMessage(userMessageContent); + } var response = await chatService.GetChatMessageContentAsync(chatHistory, executionSettings, kernel, cancellationToken); chatHistory.Add(response); diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs index 47e208eb49..0d753d1ec3 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs @@ -67,12 +67,36 @@ public abstract class KernelServiceBase( LogResult(cacheUsed, isSavedQuery, kernel.GetOrAddActionChain(), usage); + var outputPackage = kernel.GetDataPackage(); + var hasUsableData = await outputPackage.GetView().HasUsableDataAsync(); + if (kernel.GetLastError() is Exception ex) { - throw ex; + // If we have an error, but the AI provided a final text response, we can ignore the error (likely a tool failure that the AI handled). + // However, if we have usable data (e.g. from a successful tool call before the error?), we might want to keep it? + // In the case of ImageToText failure, outputPackage is empty (new DataPackage), hasUsableData is false. + // So we check if there is a valid response in the chat history. + var lastMessage = chatHistory.LastOrDefault(); + bool hasAssistantResponse = lastMessage != null && lastMessage.Role == AuthorRole.Assistant && !string.IsNullOrEmpty(lastMessage.Content); + + if (!hasAssistantResponse && !hasUsableData) + { + throw ex; + } + + // If we have a response or data, we log the error but proceed. 
+ Logger.LogWarning($"Kernel operation encountered an error but proceeded with available response/data: {ex.Message}"); } - var outputPackage = kernel.GetDataPackage(); + if (!hasUsableData) + { + var lastMessage = chatHistory.LastOrDefault(); + if (lastMessage != null && lastMessage.Role == AuthorRole.Assistant && !string.IsNullOrEmpty(lastMessage.Content)) + { + outputPackage = DataPackageHelpers.CreateFromText(lastMessage.Content); + kernel.SetDataPackage(outputPackage); + } + } if (!(await outputPackage.GetView().HasUsableDataAsync())) { @@ -148,7 +172,21 @@ public abstract class KernelServiceBase( var systemPrompt = string.IsNullOrWhiteSpace(runtimeConfig.SystemPrompt) ? DefaultSystemPrompt : runtimeConfig.SystemPrompt; chatHistory.AddSystemMessage(systemPrompt); chatHistory.AddSystemMessage($"Available clipboard formats: {await kernel.GetDataFormatsAsync()}"); - chatHistory.AddUserMessage(prompt); + + var imageBytes = await kernel.GetDataPackageView().GetImageAsPngBytesAsync(); + if (imageBytes != null) + { + var collection = new ChatMessageContentItemCollection + { + new TextContent(prompt), + new ImageContent(imageBytes, "image/png"), + }; + chatHistory.AddUserMessage(collection); + } + else + { + chatHistory.AddUserMessage(prompt); + } if (ShouldModerateAdvancedAI()) { @@ -302,8 +340,16 @@ public abstract class KernelServiceBase( new ActionChainItem(PasteFormats.CustomTextTransformation, Arguments: new() { { PromptParameterName, fixedPrompt } }), async dataPackageView => { - var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken()); - var result = await _customActionTransformService.TransformTextAsync(fixedPrompt, input, kernel.GetCancellationToken(), kernel.GetProgress()); + var imageBytes = await dataPackageView.GetImageAsPngBytesAsync(); + var input = await dataPackageView.GetTextOrHtmlTextAsync(); + + if (string.IsNullOrEmpty(input) && imageBytes == null) + { + // If we have no text and no image, try to get text via 
OCR or throw if nothing exists + input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken()); + } + + var result = await _customActionTransformService.TransformAsync(fixedPrompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress()); return DataPackageHelpers.CreateFromText(result?.Content ?? string.Empty); }); @@ -313,15 +359,22 @@ public abstract class KernelServiceBase( new ActionChainItem(format, Arguments: new() { { PromptParameterName, prompt } }), async dataPackageView => { - var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken()); - string output = await GetPromptBasedOutput(format, prompt, input, kernel.GetCancellationToken(), kernel.GetProgress()); + var imageBytes = await dataPackageView.GetImageAsPngBytesAsync(); + var input = await dataPackageView.GetTextOrHtmlTextAsync(); + + if (string.IsNullOrEmpty(input) && imageBytes == null) + { + input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken()); + } + + string output = await GetPromptBasedOutput(format, prompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress()); return DataPackageHelpers.CreateFromText(output); }); - private async Task GetPromptBasedOutput(PasteFormats format, string prompt, string input, CancellationToken cancellationToken, IProgress progress) => + private async Task GetPromptBasedOutput(PasteFormats format, string prompt, string input, byte[] imageBytes, CancellationToken cancellationToken, IProgress progress) => format switch { - PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformTextAsync(prompt, input, cancellationToken, progress))?.Content ?? string.Empty, + PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformAsync(prompt, input, imageBytes, cancellationToken, progress))?.Content ?? 
string.Empty, _ => throw new ArgumentException($"Unsupported format {format} for prompt transform", nameof(format)), }; diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs index aef9e39bb9..ff64a5ad83 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs @@ -37,7 +37,7 @@ public sealed class PasteFormatExecutor(IKernelService kernelService, ICustomAct pasteFormat.Format switch { PasteFormats.KernelQuery => await _kernelService.TransformClipboardAsync(pasteFormat.Prompt, clipboardData, pasteFormat.IsSavedQuery, cancellationToken, progress), - PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformTextAsync(pasteFormat.Prompt, await clipboardData.GetClipboardTextOrThrowAsync(cancellationToken), cancellationToken, progress))?.Content ?? string.Empty), + PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformAsync(pasteFormat.Prompt, await clipboardData.GetTextOrHtmlTextAsync(), await clipboardData.GetImageAsPngBytesAsync(), cancellationToken, progress))?.Content ?? 
string.Empty), _ => await TransformHelpers.TransformAsync(format, clipboardData, cancellationToken, progress), }); } diff --git a/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs b/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs index 8edd9b76ad..b055d46457 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs @@ -45,6 +45,7 @@ namespace AdvancedPaste.ViewModels private CancellationTokenSource _pasteActionCancellationTokenSource; private string _currentClipboardHistoryId; + private uint _lastClipboardSequenceNumber; private DateTimeOffset? _currentClipboardTimestamp; private ClipboardFormat _lastClipboardFormats = ClipboardFormat.None; private bool _clipboardHistoryUnavailableLogged; @@ -455,6 +456,7 @@ namespace AdvancedPaste.ViewModels { ResetClipboardPreview(); _currentClipboardHistoryId = null; + _lastClipboardSequenceNumber = 0; _currentClipboardTimestamp = null; _lastClipboardFormats = ClipboardFormat.None; return; @@ -477,6 +479,13 @@ namespace AdvancedPaste.ViewModels { bool clipboardChanged = formatsChanged; + var currentSequenceNumber = NativeMethods.GetClipboardSequenceNumber(); + if (_lastClipboardSequenceNumber != currentSequenceNumber) + { + clipboardChanged = true; + _lastClipboardSequenceNumber = currentSequenceNumber; + } + if (Clipboard.IsHistoryEnabled()) { try