diff --git a/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs b/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs
index 17b8139bad..1f7829a0bd 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste.UnitTests/ServicesTests/AIServiceBatchIntegrationTests.cs
@@ -144,7 +144,7 @@ public sealed class AIServiceBatchIntegrationTests
         switch (format)
         {
             case PasteFormats.CustomTextTransformation:
-                var transformResult = await services.CustomActionTransformService.TransformTextAsync(batchTestInput.Prompt, batchTestInput.Clipboard, CancellationToken.None, progress);
+                var transformResult = await services.CustomActionTransformService.TransformAsync(batchTestInput.Prompt, batchTestInput.Clipboard, null, CancellationToken.None, progress);
                 return DataPackageHelpers.CreateFromText(transformResult.Content ?? string.Empty);
 
             case PasteFormats.KernelQuery:
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs
index 2cd7554a50..f5439aecf1 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/DataPackageHelpers.cs
@@ -225,6 +225,34 @@ internal static class DataPackageHelpers
     internal static async Task GetHtmlContentAsync(this DataPackageView dataPackageView) =>
         dataPackageView.Contains(StandardDataFormats.Html) ? await dataPackageView.GetHtmlFormatAsync() : string.Empty;
 
+    /// <summary>
+    /// Encodes the image held by the data package as PNG.
+    /// </summary>
+    /// <param name="dataPackageView">The clipboard data to read the image from.</param>
+    /// <returns>The PNG-encoded bytes, or <c>null</c> when <see cref="GetImageContentAsync"/> returns no bitmap.</returns>
+    internal static async Task GetImageAsPngBytesAsync(this DataPackageView dataPackageView)
+    {
+        // The bitmap is only needed until the encoder has been flushed, so dispose it on exit.
+        using var bitmap = await dataPackageView.GetImageContentAsync();
+        if (bitmap == null)
+        {
+            return null;
+        }
+
+        using var pngStream = new InMemoryRandomAccessStream();
+        var encoder = await BitmapEncoder.CreateAsync(BitmapEncoder.PngEncoderId, pngStream);
+        encoder.SetSoftwareBitmap(bitmap);
+        await encoder.FlushAsync();
+
+        // Rewind before reading: CopyToAsync copies from the stream's current position, which the
+        // encoder may have left at the end of the data it wrote.
+        pngStream.Seek(0);
+
+        using var memoryStream = new MemoryStream();
+        await pngStream.AsStreamForRead().CopyToAsync(memoryStream);
+        return memoryStream.ToArray();
+    }
+
     internal static async Task GetImageContentAsync(this DataPackageView dataPackageView)
     {
         using var stream = await dataPackageView.GetImageStreamAsync();
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs
index 6e53e9b618..08293d4be0 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/NativeMethods.cs
@@ -166,5 +166,12 @@ namespace AdvancedPaste.Helpers
 
         [DllImport("Shlwapi.dll", SetLastError = true, CharSet = CharSet.Unicode)]
         internal static extern HResult AssocQueryString(AssocF flags, AssocStr str, string pszAssoc, string pszExtra, [Out] StringBuilder pszOut, [In][Out] ref uint pcchOut);
+
+        /// <summary>
+        /// Retrieves the clipboard sequence number for the current window station; the number changes whenever the clipboard contents change.
+        /// </summary>
+        /// <returns>The sequence number, or zero when the caller lacks clipboard access to the window station.</returns>
+        [DllImport("user32.dll")]
+        internal static extern uint GetClipboardSequenceNumber();
     }
 }
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs b/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs
index 99243ebb5e..1479912e66 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Models/PasteFormats.cs
@@ -46,7 +46,7 @@ public enum PasteFormats
         CanPreview = true,
         SupportedClipboardFormats = ClipboardFormat.Image,
         IPCKey = AdvancedPasteAdditionalActions.PropertyNames.ImageToText,
-        KernelFunctionDescription = "Takes an image in the clipboard and extracts all text from it using OCR.")]
+        KernelFunctionDescription = "Takes an image from the clipboard and extracts text using OCR. This function is intended only for explicit text extraction or OCR requests.")]
     ImageToText,
 
     [PasteFormatMetadata(
@@ -118,8 +118,8 @@ public enum PasteFormats
         IconGlyph = "\uE945",
         RequiresAIService = true,
         CanPreview = true,
-        SupportedClipboardFormats = ClipboardFormat.Text,
-        KernelFunctionDescription = "Takes input instructions and transforms clipboard text (not TXT files) with these input instructions, putting the result back on the clipboard. This uses AI to accomplish the task.",
+        SupportedClipboardFormats = ClipboardFormat.Text | ClipboardFormat.Image,
+        KernelFunctionDescription = "Takes user instructions and applies them to the current clipboard content (text or image). Use this function for image analysis, description, or transformation tasks beyond simple OCR.",
         RequiresPrompt = true)]
     CustomTextTransformation,
 }
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs
index 57d55492a4..05cdcbe81f 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/CustomActionTransformService.cs
@@ -40,15 +40,15 @@ namespace AdvancedPaste.Services.CustomActions
             this.userSettings = userSettings;
         }
 
-        public async Task TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress progress)
+        public async Task TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress progress)
         {
             var pasteConfig = userSettings?.PasteAIConfiguration;
             var providerConfig = BuildProviderConfig(pasteConfig);
 
-            return await TransformAsync(prompt, inputText, providerConfig, cancellationToken, progress);
+            return await TransformAsync(prompt, inputText, imageBytes, providerConfig, cancellationToken, progress);
         }
 
-        private async Task TransformAsync(string prompt, string inputText, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress progress)
+        private async Task TransformAsync(string prompt, string inputText, byte[] imageBytes, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress progress)
         {
             ArgumentNullException.ThrowIfNull(providerConfig);
 
@@ -57,9 +57,9 @@ namespace AdvancedPaste.Services.CustomActions
                 return new CustomActionTransformResult(string.Empty, AIServiceUsage.None);
             }
 
-            if (string.IsNullOrWhiteSpace(inputText))
+            if (string.IsNullOrWhiteSpace(inputText) && imageBytes is null)
             {
-                Logger.LogWarning("Clipboard has no usable text data");
+                Logger.LogWarning("Clipboard has no usable data");
                 return new CustomActionTransformResult(string.Empty, AIServiceUsage.None);
             }
 
@@ -80,6 +80,8 @@ namespace AdvancedPaste.Services.CustomActions
                 {
                     Prompt = prompt,
                     InputText = inputText,
+                    ImageBytes = imageBytes,
+                    ImageMimeType = imageBytes != null ? "image/png" : null,
                     SystemPrompt = systemPrompt,
                 };
 
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs
index 1c3ecb980c..564db3fdc5 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/ICustomActionTransformService.cs
@@ -12,6 +12,15 @@ namespace AdvancedPaste.Services.CustomActions
 {
     public interface ICustomActionTransformService
     {
-        Task TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress progress);
+        /// <summary>
+        /// Applies the user's instructions to the clipboard content, supplied as text, as an image, or as both.
+        /// </summary>
+        /// <param name="prompt">The instructions to apply.</param>
+        /// <param name="inputText">The clipboard text; may be null or empty when only an image is supplied.</param>
+        /// <param name="imageBytes">The clipboard image encoded as PNG, or <c>null</c> when there is no image.</param>
+        /// <param name="cancellationToken">Token that cancels the operation.</param>
+        /// <param name="progress">Receives progress notifications.</param>
+        /// <returns>The result of the transformation.</returns>
+        Task TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress progress);
     }
 }
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs
index 0e15c93e05..96dabbfa05 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/PasteAIRequest.cs
@@ -12,6 +12,16 @@ namespace AdvancedPaste.Services.CustomActions
 
         public string InputText { get; init; }
 
+        /// <summary>
+        /// Gets the encoded image sent with the request, or <c>null</c> for a text-only request.
+        /// </summary>
+        public byte[] ImageBytes { get; init; }
+
+        /// <summary>
+        /// Gets the MIME type of <see cref="ImageBytes"/>, or <c>null</c> when no image is attached.
+        /// </summary>
+        public string ImageMimeType { get; init; }
+
         public string SystemPrompt { get; init; }
 
         public AIServiceUsage Usage { get; set; } = AIServiceUsage.None;
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs
index eb2f56e01f..636d2e3e78 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/CustomActions/SemanticKernelPasteProvider.cs
@@ -64,21 +64,13 @@ namespace AdvancedPaste.Services.CustomActions
 
             var prompt = request.Prompt;
             var inputText = request.InputText;
-            if (string.IsNullOrWhiteSpace(prompt) || string.IsNullOrWhiteSpace(inputText))
+            var imageBytes = request.ImageBytes;
+
+            if (string.IsNullOrWhiteSpace(prompt) || (string.IsNullOrWhiteSpace(inputText) && imageBytes is null))
             {
-                throw new ArgumentException("Prompt and input text must be provided", nameof(request));
+                throw new ArgumentException("Prompt and input content must be provided", nameof(request));
             }
 
-            var userMessageContent = $"""
-                User instructions:
-                {prompt}
-
-                Clipboard Content:
-                {inputText}
-
-                Output:
-                """;
-
             var executionSettings = CreateExecutionSettings();
             var kernel = CreateKernel();
             var modelId = _config.Model;
@@ -102,7 +94,32 @@ namespace AdvancedPaste.Services.CustomActions
 
             var chatHistory = new ChatHistory();
             chatHistory.AddSystemMessage(systemPrompt);
-            chatHistory.AddUserMessage(userMessageContent);
+
+            if (imageBytes != null)
+            {
+                var collection = new ChatMessageContentItemCollection();
+                if (!string.IsNullOrWhiteSpace(inputText))
+                {
+                    collection.Add(new TextContent($"Clipboard Content:\n{inputText}"));
+                }
+
+                collection.Add(new ImageContent(imageBytes, request.ImageMimeType ?? "image/png"));
+                collection.Add(new TextContent($"User instructions:\n{prompt}\n\nOutput:"));
+                chatHistory.AddUserMessage(collection);
+            }
+            else
+            {
+                var userMessageContent = $"""
+                    User instructions:
+                    {prompt}
+
+                    Clipboard Content:
+                    {inputText}
+
+                    Output:
+                    """;
+                chatHistory.AddUserMessage(userMessageContent);
+            }
 
             var response = await chatService.GetChatMessageContentAsync(chatHistory, executionSettings, kernel, cancellationToken);
             chatHistory.Add(response);
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs
index 47e208eb49..0d753d1ec3 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/KernelServiceBase.cs
@@ -67,12 +67,36 @@ public abstract class KernelServiceBase(
 
         LogResult(cacheUsed, isSavedQuery, kernel.GetOrAddActionChain(), usage);
 
+        var outputPackage = kernel.GetDataPackage();
+        var hasUsableData = await outputPackage.GetView().HasUsableDataAsync();
+
+        // A non-empty final assistant message can serve as the output when no tool produced
+        // usable clipboard data, so look it up once and reuse it in both checks below.
+        var lastMessage = chatHistory.LastOrDefault();
+        var assistantReply = lastMessage != null && lastMessage.Role == AuthorRole.Assistant && !string.IsNullOrEmpty(lastMessage.Content)
+            ? lastMessage.Content
+            : null;
+
         if (kernel.GetLastError() is Exception ex)
         {
-            throw ex;
+            if (assistantReply is null && !hasUsableData)
+            {
+                // Nothing to fall back on. Rethrow via ExceptionDispatchInfo so the stack trace
+                // recorded when the error was first thrown is kept; "throw ex;" would reset it.
+                System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(ex).Throw();
+            }
+
+            // The model still answered, or a tool produced data before the failure (for example
+            // a tool call failed and the model handled it), so log the error and carry on.
+            Logger.LogWarning($"Kernel operation encountered an error but proceeded with available response/data: {ex.Message}");
         }
 
-        var outputPackage = kernel.GetDataPackage();
+        // Without tool output, promote the assistant's text reply to the output package.
+        if (!hasUsableData && assistantReply is not null)
+        {
+            outputPackage = DataPackageHelpers.CreateFromText(assistantReply);
+            kernel.SetDataPackage(outputPackage);
+        }
 
         if (!(await outputPackage.GetView().HasUsableDataAsync()))
         {
@@ -148,7 +172,21 @@ public abstract class KernelServiceBase(
         var systemPrompt = string.IsNullOrWhiteSpace(runtimeConfig.SystemPrompt) ? DefaultSystemPrompt : runtimeConfig.SystemPrompt;
         chatHistory.AddSystemMessage(systemPrompt);
         chatHistory.AddSystemMessage($"Available clipboard formats: {await kernel.GetDataFormatsAsync()}");
-        chatHistory.AddUserMessage(prompt);
+
+        var imageBytes = await kernel.GetDataPackageView().GetImageAsPngBytesAsync();
+        if (imageBytes != null)
+        {
+            var collection = new ChatMessageContentItemCollection
+            {
+                new TextContent(prompt),
+                new ImageContent(imageBytes, "image/png"),
+            };
+            chatHistory.AddUserMessage(collection);
+        }
+        else
+        {
+            chatHistory.AddUserMessage(prompt);
+        }
 
         if (ShouldModerateAdvancedAI())
         {
@@ -302,8 +340,16 @@ public abstract class KernelServiceBase(
             new ActionChainItem(PasteFormats.CustomTextTransformation, Arguments: new() { { PromptParameterName, fixedPrompt } }),
             async dataPackageView =>
             {
-                var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
-                var result = await _customActionTransformService.TransformTextAsync(fixedPrompt, input, kernel.GetCancellationToken(), kernel.GetProgress());
+                var imageBytes = await dataPackageView.GetImageAsPngBytesAsync();
+                var input = await dataPackageView.GetTextOrHtmlTextAsync();
+
+                if (string.IsNullOrEmpty(input) && imageBytes == null)
+                {
+                    // If we have no text and no image, try to get text via OCR or throw if nothing exists
+                    input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
+                }
+
+                var result = await _customActionTransformService.TransformAsync(fixedPrompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress());
                 return DataPackageHelpers.CreateFromText(result?.Content ?? string.Empty);
             });
 
@@ -313,15 +359,22 @@ public abstract class KernelServiceBase(
             new ActionChainItem(format, Arguments: new() { { PromptParameterName, prompt } }),
             async dataPackageView =>
             {
-                var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
-                string output = await GetPromptBasedOutput(format, prompt, input, kernel.GetCancellationToken(), kernel.GetProgress());
+                var imageBytes = await dataPackageView.GetImageAsPngBytesAsync();
+                var input = await dataPackageView.GetTextOrHtmlTextAsync();
+
+                if (string.IsNullOrEmpty(input) && imageBytes == null)
+                {
+                    input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
+                }
+
+                string output = await GetPromptBasedOutput(format, prompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress());
                 return DataPackageHelpers.CreateFromText(output);
             });
 
-    private async Task GetPromptBasedOutput(PasteFormats format, string prompt, string input, CancellationToken cancellationToken, IProgress progress) =>
+    private async Task GetPromptBasedOutput(PasteFormats format, string prompt, string input, byte[] imageBytes, CancellationToken cancellationToken, IProgress progress) =>
         format switch
         {
-            PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformTextAsync(prompt, input, cancellationToken, progress))?.Content ?? string.Empty,
+            PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformAsync(prompt, input, imageBytes, cancellationToken, progress))?.Content ?? string.Empty,
             _ => throw new ArgumentException($"Unsupported format {format} for prompt transform", nameof(format)),
         };
 
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs b/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs
index aef9e39bb9..ff64a5ad83 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/Services/PasteFormatExecutor.cs
@@ -37,7 +37,7 @@ public sealed class PasteFormatExecutor(IKernelService kernelService, ICustomAct
             pasteFormat.Format switch
             {
                 PasteFormats.KernelQuery => await _kernelService.TransformClipboardAsync(pasteFormat.Prompt, clipboardData, pasteFormat.IsSavedQuery, cancellationToken, progress),
-                PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformTextAsync(pasteFormat.Prompt, await clipboardData.GetClipboardTextOrThrowAsync(cancellationToken), cancellationToken, progress))?.Content ?? string.Empty),
+                PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformAsync(pasteFormat.Prompt, await clipboardData.GetTextOrHtmlTextAsync(), await clipboardData.GetImageAsPngBytesAsync(), cancellationToken, progress))?.Content ?? string.Empty),
                 _ => await TransformHelpers.TransformAsync(format, clipboardData, cancellationToken, progress),
             });
     }
diff --git a/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs b/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs
index 8edd9b76ad..b055d46457 100644
--- a/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs
+++ b/src/modules/AdvancedPaste/AdvancedPaste/ViewModels/OptionsViewModel.cs
@@ -45,6 +45,7 @@ namespace AdvancedPaste.ViewModels
         private CancellationTokenSource _pasteActionCancellationTokenSource;
 
         private string _currentClipboardHistoryId;
+        private uint _lastClipboardSequenceNumber;
         private DateTimeOffset? _currentClipboardTimestamp;
         private ClipboardFormat _lastClipboardFormats = ClipboardFormat.None;
         private bool _clipboardHistoryUnavailableLogged;
@@ -455,6 +456,7 @@ namespace AdvancedPaste.ViewModels
                 {
                     ResetClipboardPreview();
                     _currentClipboardHistoryId = null;
+                    _lastClipboardSequenceNumber = 0;
                     _currentClipboardTimestamp = null;
                     _lastClipboardFormats = ClipboardFormat.None;
                     return;
@@ -477,6 +479,13 @@ namespace AdvancedPaste.ViewModels
             {
                 bool clipboardChanged = formatsChanged;
 
+                var currentSequenceNumber = NativeMethods.GetClipboardSequenceNumber();
+                if (_lastClipboardSequenceNumber != currentSequenceNumber)
+                {
+                    clipboardChanged = true;
+                    _lastClipboardSequenceNumber = currentSequenceNumber;
+                }
+
                 if (Clipboard.IsHistoryEnabled())
                 {
                     try