mirror of
https://github.com/microsoft/PowerToys.git
synced 2026-04-03 17:56:44 +02:00
[Advanced Paste] Introduced image-input handling (#44021)
<!-- Enter a brief description/summary of your PR here. What does it fix/what does it change/how was it tested (even manually, if necessary)? --> ## Summary of the Pull Request This pull request introduces significant enhancements to the AdvancedPaste module, enabling AI-powered clipboard transformations to support both text and image data (notably for image analysis and transformation tasks), and improving error handling and clipboard tracking. The changes update the service interfaces, data models, and processing logic to handle images alongside text, and refine how the application responds to errors and clipboard state changes. <img width="470" height="366" alt="image" src="https://github.com/user-attachments/assets/6ad011e4-a2ba-4e44-b640-739440836de6" /> <!-- Please review the items on the PR checklist before submitting--> ## PR Checklist - [ ] Closes: #xxx <!-- - [ ] Closes: #yyy (add separate lines for additional resolved issues) --> - [x] **Communication:** I've discussed this with core contributors already. If the work hasn't been agreed, this work might be rejected - [x] **Tests:** Added/updated and all pass - [ ] **Localization:** All end-user-facing strings can be localized - [ ] **Dev docs:** Added/updated - [ ] **New binaries:** Added on the required places - [ ] [JSON for signing](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ESRPSigning_core.json) for new binaries - [ ] [WXS for installer](https://github.com/microsoft/PowerToys/blob/main/installer/PowerToysSetup/Product.wxs) for new binaries and localization folder - [ ] [YML for CI pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/ci/templates/build-powertoys-steps.yml) for new test projects - [ ] [YML for signed pipeline](https://github.com/microsoft/PowerToys/blob/main/.pipelines/release.yml) - [ ] **Documentation updated:** If checked, please file a pull request on [our docs repo](https://github.com/MicrosoftDocs/windows-uwp/tree/docs/hub/powertoys) and link it here: #xxx <!-- Provide a more detailed description of the PR, other things fixed, or any additional comments/features here --> ## Detailed Description of the Pull Request / Additional comments <!-- Describe how you validated the behavior. Add automated tests wherever possible, but list manual validation steps taken as well --> ## Validation Steps Performed --------- Signed-off-by: Shawn Yuan <shuaiyuan@microsoft.com> Signed-off-by: Shawn Yuan (from Dev Box) <shuaiyuan@microsoft.com>
This commit is contained in:
@@ -144,7 +144,7 @@ public sealed class AIServiceBatchIntegrationTests
|
||||
switch (format)
|
||||
{
|
||||
case PasteFormats.CustomTextTransformation:
|
||||
var transformResult = await services.CustomActionTransformService.TransformTextAsync(batchTestInput.Prompt, batchTestInput.Clipboard, CancellationToken.None, progress);
|
||||
var transformResult = await services.CustomActionTransformService.TransformAsync(batchTestInput.Prompt, batchTestInput.Clipboard, null, CancellationToken.None, progress);
|
||||
return DataPackageHelpers.CreateFromText(transformResult.Content ?? string.Empty);
|
||||
|
||||
case PasteFormats.KernelQuery:
|
||||
|
||||
@@ -225,6 +225,24 @@ internal static class DataPackageHelpers
|
||||
internal static async Task<string> GetHtmlContentAsync(this DataPackageView dataPackageView) =>
|
||||
dataPackageView.Contains(StandardDataFormats.Html) ? await dataPackageView.GetHtmlFormatAsync() : string.Empty;
|
||||
|
||||
internal static async Task<byte[]> GetImageAsPngBytesAsync(this DataPackageView dataPackageView)
|
||||
{
|
||||
var bitmap = await dataPackageView.GetImageContentAsync();
|
||||
if (bitmap == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
using var pngStream = new InMemoryRandomAccessStream();
|
||||
var encoder = await BitmapEncoder.CreateAsync(BitmapEncoder.PngEncoderId, pngStream);
|
||||
encoder.SetSoftwareBitmap(bitmap);
|
||||
await encoder.FlushAsync();
|
||||
|
||||
using var memoryStream = new MemoryStream();
|
||||
await pngStream.AsStreamForRead().CopyToAsync(memoryStream);
|
||||
return memoryStream.ToArray();
|
||||
}
|
||||
|
||||
internal static async Task<SoftwareBitmap> GetImageContentAsync(this DataPackageView dataPackageView)
|
||||
{
|
||||
using var stream = await dataPackageView.GetImageStreamAsync();
|
||||
|
||||
@@ -166,5 +166,8 @@ namespace AdvancedPaste.Helpers
|
||||
|
||||
[DllImport("Shlwapi.dll", SetLastError = true, CharSet = CharSet.Unicode)]
|
||||
internal static extern HResult AssocQueryString(AssocF flags, AssocStr str, string pszAssoc, string pszExtra, [Out] StringBuilder pszOut, [In][Out] ref uint pcchOut);
|
||||
|
||||
[DllImport("user32.dll", SetLastError = true)]
|
||||
internal static extern uint GetClipboardSequenceNumber();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ public enum PasteFormats
|
||||
CanPreview = true,
|
||||
SupportedClipboardFormats = ClipboardFormat.Image,
|
||||
IPCKey = AdvancedPasteAdditionalActions.PropertyNames.ImageToText,
|
||||
KernelFunctionDescription = "Takes an image in the clipboard and extracts all text from it using OCR.")]
|
||||
KernelFunctionDescription = "Takes an image from the clipboard and extracts text using OCR. This function is intended only for explicit text extraction or OCR requests.")]
|
||||
ImageToText,
|
||||
|
||||
[PasteFormatMetadata(
|
||||
@@ -118,8 +118,8 @@ public enum PasteFormats
|
||||
IconGlyph = "\uE945",
|
||||
RequiresAIService = true,
|
||||
CanPreview = true,
|
||||
SupportedClipboardFormats = ClipboardFormat.Text,
|
||||
KernelFunctionDescription = "Takes input instructions and transforms clipboard text (not TXT files) with these input instructions, putting the result back on the clipboard. This uses AI to accomplish the task.",
|
||||
SupportedClipboardFormats = ClipboardFormat.Text | ClipboardFormat.Image,
|
||||
KernelFunctionDescription = "Takes user instructions and applies them to the current clipboard content (text or image). Use this function for image analysis, description, or transformation tasks beyond simple OCR.",
|
||||
RequiresPrompt = true)]
|
||||
CustomTextTransformation,
|
||||
}
|
||||
|
||||
@@ -40,15 +40,15 @@ namespace AdvancedPaste.Services.CustomActions
|
||||
this.userSettings = userSettings;
|
||||
}
|
||||
|
||||
public async Task<CustomActionTransformResult> TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress<double> progress)
|
||||
public async Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress<double> progress)
|
||||
{
|
||||
var pasteConfig = userSettings?.PasteAIConfiguration;
|
||||
var providerConfig = BuildProviderConfig(pasteConfig);
|
||||
|
||||
return await TransformAsync(prompt, inputText, providerConfig, cancellationToken, progress);
|
||||
return await TransformAsync(prompt, inputText, imageBytes, providerConfig, cancellationToken, progress);
|
||||
}
|
||||
|
||||
private async Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress<double> progress)
|
||||
private async Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, byte[] imageBytes, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress<double> progress)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(providerConfig);
|
||||
|
||||
@@ -57,9 +57,9 @@ namespace AdvancedPaste.Services.CustomActions
|
||||
return new CustomActionTransformResult(string.Empty, AIServiceUsage.None);
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(inputText))
|
||||
if (string.IsNullOrWhiteSpace(inputText) && imageBytes is null)
|
||||
{
|
||||
Logger.LogWarning("Clipboard has no usable text data");
|
||||
Logger.LogWarning("Clipboard has no usable data");
|
||||
return new CustomActionTransformResult(string.Empty, AIServiceUsage.None);
|
||||
}
|
||||
|
||||
@@ -80,6 +80,8 @@ namespace AdvancedPaste.Services.CustomActions
|
||||
{
|
||||
Prompt = prompt,
|
||||
InputText = inputText,
|
||||
ImageBytes = imageBytes,
|
||||
ImageMimeType = imageBytes != null ? "image/png" : null,
|
||||
SystemPrompt = systemPrompt,
|
||||
};
|
||||
|
||||
|
||||
@@ -12,6 +12,6 @@ namespace AdvancedPaste.Services.CustomActions
|
||||
{
|
||||
public interface ICustomActionTransformService
|
||||
{
|
||||
Task<CustomActionTransformResult> TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress<double> progress);
|
||||
Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress<double> progress);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,10 @@ namespace AdvancedPaste.Services.CustomActions
|
||||
|
||||
public string InputText { get; init; }
|
||||
|
||||
public byte[] ImageBytes { get; init; }
|
||||
|
||||
public string ImageMimeType { get; init; }
|
||||
|
||||
public string SystemPrompt { get; init; }
|
||||
|
||||
public AIServiceUsage Usage { get; set; } = AIServiceUsage.None;
|
||||
|
||||
@@ -64,21 +64,13 @@ namespace AdvancedPaste.Services.CustomActions
|
||||
|
||||
var prompt = request.Prompt;
|
||||
var inputText = request.InputText;
|
||||
if (string.IsNullOrWhiteSpace(prompt) || string.IsNullOrWhiteSpace(inputText))
|
||||
var imageBytes = request.ImageBytes;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(prompt) || (string.IsNullOrWhiteSpace(inputText) && imageBytes is null))
|
||||
{
|
||||
throw new ArgumentException("Prompt and input text must be provided", nameof(request));
|
||||
throw new ArgumentException("Prompt and input content must be provided", nameof(request));
|
||||
}
|
||||
|
||||
var userMessageContent = $"""
|
||||
User instructions:
|
||||
{prompt}
|
||||
|
||||
Clipboard Content:
|
||||
{inputText}
|
||||
|
||||
Output:
|
||||
""";
|
||||
|
||||
var executionSettings = CreateExecutionSettings();
|
||||
var kernel = CreateKernel();
|
||||
var modelId = _config.Model;
|
||||
@@ -102,7 +94,32 @@ namespace AdvancedPaste.Services.CustomActions
|
||||
|
||||
var chatHistory = new ChatHistory();
|
||||
chatHistory.AddSystemMessage(systemPrompt);
|
||||
chatHistory.AddUserMessage(userMessageContent);
|
||||
|
||||
if (imageBytes != null)
|
||||
{
|
||||
var collection = new ChatMessageContentItemCollection();
|
||||
if (!string.IsNullOrWhiteSpace(inputText))
|
||||
{
|
||||
collection.Add(new TextContent($"Clipboard Content:\n{inputText}"));
|
||||
}
|
||||
|
||||
collection.Add(new ImageContent(imageBytes, request.ImageMimeType ?? "image/png"));
|
||||
collection.Add(new TextContent($"User instructions:\n{prompt}\n\nOutput:"));
|
||||
chatHistory.AddUserMessage(collection);
|
||||
}
|
||||
else
|
||||
{
|
||||
var userMessageContent = $"""
|
||||
User instructions:
|
||||
{prompt}
|
||||
|
||||
Clipboard Content:
|
||||
{inputText}
|
||||
|
||||
Output:
|
||||
""";
|
||||
chatHistory.AddUserMessage(userMessageContent);
|
||||
}
|
||||
|
||||
var response = await chatService.GetChatMessageContentAsync(chatHistory, executionSettings, kernel, cancellationToken);
|
||||
chatHistory.Add(response);
|
||||
|
||||
@@ -67,12 +67,36 @@ public abstract class KernelServiceBase(
|
||||
|
||||
LogResult(cacheUsed, isSavedQuery, kernel.GetOrAddActionChain(), usage);
|
||||
|
||||
var outputPackage = kernel.GetDataPackage();
|
||||
var hasUsableData = await outputPackage.GetView().HasUsableDataAsync();
|
||||
|
||||
if (kernel.GetLastError() is Exception ex)
|
||||
{
|
||||
throw ex;
|
||||
// If we have an error, but the AI provided a final text response, we can ignore the error (likely a tool failure that the AI handled).
|
||||
// However, if we have usable data (e.g. from a successful tool call before the error?), we might want to keep it?
|
||||
// In the case of ImageToText failure, outputPackage is empty (new DataPackage), hasUsableData is false.
|
||||
// So we check if there is a valid response in the chat history.
|
||||
var lastMessage = chatHistory.LastOrDefault();
|
||||
bool hasAssistantResponse = lastMessage != null && lastMessage.Role == AuthorRole.Assistant && !string.IsNullOrEmpty(lastMessage.Content);
|
||||
|
||||
if (!hasAssistantResponse && !hasUsableData)
|
||||
{
|
||||
throw ex;
|
||||
}
|
||||
|
||||
// If we have a response or data, we log the error but proceed.
|
||||
Logger.LogWarning($"Kernel operation encountered an error but proceeded with available response/data: {ex.Message}");
|
||||
}
|
||||
|
||||
var outputPackage = kernel.GetDataPackage();
|
||||
if (!hasUsableData)
|
||||
{
|
||||
var lastMessage = chatHistory.LastOrDefault();
|
||||
if (lastMessage != null && lastMessage.Role == AuthorRole.Assistant && !string.IsNullOrEmpty(lastMessage.Content))
|
||||
{
|
||||
outputPackage = DataPackageHelpers.CreateFromText(lastMessage.Content);
|
||||
kernel.SetDataPackage(outputPackage);
|
||||
}
|
||||
}
|
||||
|
||||
if (!(await outputPackage.GetView().HasUsableDataAsync()))
|
||||
{
|
||||
@@ -148,7 +172,21 @@ public abstract class KernelServiceBase(
|
||||
var systemPrompt = string.IsNullOrWhiteSpace(runtimeConfig.SystemPrompt) ? DefaultSystemPrompt : runtimeConfig.SystemPrompt;
|
||||
chatHistory.AddSystemMessage(systemPrompt);
|
||||
chatHistory.AddSystemMessage($"Available clipboard formats: {await kernel.GetDataFormatsAsync()}");
|
||||
chatHistory.AddUserMessage(prompt);
|
||||
|
||||
var imageBytes = await kernel.GetDataPackageView().GetImageAsPngBytesAsync();
|
||||
if (imageBytes != null)
|
||||
{
|
||||
var collection = new ChatMessageContentItemCollection
|
||||
{
|
||||
new TextContent(prompt),
|
||||
new ImageContent(imageBytes, "image/png"),
|
||||
};
|
||||
chatHistory.AddUserMessage(collection);
|
||||
}
|
||||
else
|
||||
{
|
||||
chatHistory.AddUserMessage(prompt);
|
||||
}
|
||||
|
||||
if (ShouldModerateAdvancedAI())
|
||||
{
|
||||
@@ -302,8 +340,16 @@ public abstract class KernelServiceBase(
|
||||
new ActionChainItem(PasteFormats.CustomTextTransformation, Arguments: new() { { PromptParameterName, fixedPrompt } }),
|
||||
async dataPackageView =>
|
||||
{
|
||||
var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
|
||||
var result = await _customActionTransformService.TransformTextAsync(fixedPrompt, input, kernel.GetCancellationToken(), kernel.GetProgress());
|
||||
var imageBytes = await dataPackageView.GetImageAsPngBytesAsync();
|
||||
var input = await dataPackageView.GetTextOrHtmlTextAsync();
|
||||
|
||||
if (string.IsNullOrEmpty(input) && imageBytes == null)
|
||||
{
|
||||
// If we have no text and no image, try to get text via OCR or throw if nothing exists
|
||||
input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
|
||||
}
|
||||
|
||||
var result = await _customActionTransformService.TransformAsync(fixedPrompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress());
|
||||
return DataPackageHelpers.CreateFromText(result?.Content ?? string.Empty);
|
||||
});
|
||||
|
||||
@@ -313,15 +359,22 @@ public abstract class KernelServiceBase(
|
||||
new ActionChainItem(format, Arguments: new() { { PromptParameterName, prompt } }),
|
||||
async dataPackageView =>
|
||||
{
|
||||
var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
|
||||
string output = await GetPromptBasedOutput(format, prompt, input, kernel.GetCancellationToken(), kernel.GetProgress());
|
||||
var imageBytes = await dataPackageView.GetImageAsPngBytesAsync();
|
||||
var input = await dataPackageView.GetTextOrHtmlTextAsync();
|
||||
|
||||
if (string.IsNullOrEmpty(input) && imageBytes == null)
|
||||
{
|
||||
input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
|
||||
}
|
||||
|
||||
string output = await GetPromptBasedOutput(format, prompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress());
|
||||
return DataPackageHelpers.CreateFromText(output);
|
||||
});
|
||||
|
||||
private async Task<string> GetPromptBasedOutput(PasteFormats format, string prompt, string input, CancellationToken cancellationToken, IProgress<double> progress) =>
|
||||
private async Task<string> GetPromptBasedOutput(PasteFormats format, string prompt, string input, byte[] imageBytes, CancellationToken cancellationToken, IProgress<double> progress) =>
|
||||
format switch
|
||||
{
|
||||
PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformTextAsync(prompt, input, cancellationToken, progress))?.Content ?? string.Empty,
|
||||
PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformAsync(prompt, input, imageBytes, cancellationToken, progress))?.Content ?? string.Empty,
|
||||
_ => throw new ArgumentException($"Unsupported format {format} for prompt transform", nameof(format)),
|
||||
};
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ public sealed class PasteFormatExecutor(IKernelService kernelService, ICustomAct
|
||||
pasteFormat.Format switch
|
||||
{
|
||||
PasteFormats.KernelQuery => await _kernelService.TransformClipboardAsync(pasteFormat.Prompt, clipboardData, pasteFormat.IsSavedQuery, cancellationToken, progress),
|
||||
PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformTextAsync(pasteFormat.Prompt, await clipboardData.GetClipboardTextOrThrowAsync(cancellationToken), cancellationToken, progress))?.Content ?? string.Empty),
|
||||
PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformAsync(pasteFormat.Prompt, await clipboardData.GetTextOrHtmlTextAsync(), await clipboardData.GetImageAsPngBytesAsync(), cancellationToken, progress))?.Content ?? string.Empty),
|
||||
_ => await TransformHelpers.TransformAsync(format, clipboardData, cancellationToken, progress),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ namespace AdvancedPaste.ViewModels
|
||||
private CancellationTokenSource _pasteActionCancellationTokenSource;
|
||||
|
||||
private string _currentClipboardHistoryId;
|
||||
private uint _lastClipboardSequenceNumber;
|
||||
private DateTimeOffset? _currentClipboardTimestamp;
|
||||
private ClipboardFormat _lastClipboardFormats = ClipboardFormat.None;
|
||||
private bool _clipboardHistoryUnavailableLogged;
|
||||
@@ -455,6 +456,7 @@ namespace AdvancedPaste.ViewModels
|
||||
{
|
||||
ResetClipboardPreview();
|
||||
_currentClipboardHistoryId = null;
|
||||
_lastClipboardSequenceNumber = 0;
|
||||
_currentClipboardTimestamp = null;
|
||||
_lastClipboardFormats = ClipboardFormat.None;
|
||||
return;
|
||||
@@ -477,6 +479,13 @@ namespace AdvancedPaste.ViewModels
|
||||
{
|
||||
bool clipboardChanged = formatsChanged;
|
||||
|
||||
var currentSequenceNumber = NativeMethods.GetClipboardSequenceNumber();
|
||||
if (_lastClipboardSequenceNumber != currentSequenceNumber)
|
||||
{
|
||||
clipboardChanged = true;
|
||||
_lastClipboardSequenceNumber = currentSequenceNumber;
|
||||
}
|
||||
|
||||
if (Clipboard.IsHistoryEnabled())
|
||||
{
|
||||
try
|
||||
|
||||
Reference in New Issue
Block a user