Compare commits

...

5 Commits

Author SHA1 Message Date
Shawn Yuan
e423c64b68 Refine KernelFunctionDescription for clarity 2025-12-15 11:29:57 +08:00
Shawn Yuan (from Dev Box)
f57e7cb9cd update comments
Signed-off-by: Shawn Yuan (from Dev Box) <shuaiyuan@microsoft.com>
2025-12-03 10:39:56 +08:00
Shawn Yuan (from Dev Box)
43be9f88ee resolve comments
Signed-off-by: Shawn Yuan (from Dev Box) <shuaiyuan@microsoft.com>
2025-12-03 10:33:52 +08:00
Shawn Yuan (from Dev Box)
f95e6c8f1b Introduced image-input handling in AP.
Signed-off-by: Shawn Yuan (from Dev Box) <shuaiyuan@microsoft.com>
2025-12-02 16:31:45 +08:00
Shawn Yuan
aaa6e03012 init
Signed-off-by: Shawn Yuan <shuaiyuan@microsoft.com>
2025-12-02 14:58:52 +08:00
11 changed files with 139 additions and 33 deletions

View File

@@ -144,7 +144,7 @@ public sealed class AIServiceBatchIntegrationTests
switch (format)
{
case PasteFormats.CustomTextTransformation:
var transformResult = await services.CustomActionTransformService.TransformTextAsync(batchTestInput.Prompt, batchTestInput.Clipboard, CancellationToken.None, progress);
var transformResult = await services.CustomActionTransformService.TransformAsync(batchTestInput.Prompt, batchTestInput.Clipboard, null, CancellationToken.None, progress);
return DataPackageHelpers.CreateFromText(transformResult.Content ?? string.Empty);
case PasteFormats.KernelQuery:

View File

@@ -225,6 +225,24 @@ internal static class DataPackageHelpers
internal static async Task<string> GetHtmlContentAsync(this DataPackageView dataPackageView) =>
dataPackageView.Contains(StandardDataFormats.Html) ? await dataPackageView.GetHtmlFormatAsync() : string.Empty;
internal static async Task<byte[]> GetImageAsPngBytesAsync(this DataPackageView dataPackageView)
{
var bitmap = await dataPackageView.GetImageContentAsync();
if (bitmap == null)
{
return null;
}
using var pngStream = new InMemoryRandomAccessStream();
var encoder = await BitmapEncoder.CreateAsync(BitmapEncoder.PngEncoderId, pngStream);
encoder.SetSoftwareBitmap(bitmap);
await encoder.FlushAsync();
using var memoryStream = new MemoryStream();
await pngStream.AsStreamForRead().CopyToAsync(memoryStream);
return memoryStream.ToArray();
}
internal static async Task<SoftwareBitmap> GetImageContentAsync(this DataPackageView dataPackageView)
{
using var stream = await dataPackageView.GetImageStreamAsync();

View File

@@ -166,5 +166,8 @@ namespace AdvancedPaste.Helpers
[DllImport("Shlwapi.dll", SetLastError = true, CharSet = CharSet.Unicode)]
internal static extern HResult AssocQueryString(AssocF flags, AssocStr str, string pszAssoc, string pszExtra, [Out] StringBuilder pszOut, [In][Out] ref uint pcchOut);
[DllImport("user32.dll", SetLastError = true)]
internal static extern uint GetClipboardSequenceNumber();
}
}

View File

@@ -46,7 +46,7 @@ public enum PasteFormats
CanPreview = true,
SupportedClipboardFormats = ClipboardFormat.Image,
IPCKey = AdvancedPasteAdditionalActions.PropertyNames.ImageToText,
KernelFunctionDescription = "Takes an image in the clipboard and extracts all text from it using OCR.")]
KernelFunctionDescription = "Takes an image from the clipboard and extracts text using OCR. This function is intended only for explicit text extraction or OCR requests.")]
ImageToText,
[PasteFormatMetadata(
@@ -118,8 +118,8 @@ public enum PasteFormats
IconGlyph = "\uE945",
RequiresAIService = true,
CanPreview = true,
SupportedClipboardFormats = ClipboardFormat.Text,
KernelFunctionDescription = "Takes input instructions and transforms clipboard text (not TXT files) with these input instructions, putting the result back on the clipboard. This uses AI to accomplish the task.",
SupportedClipboardFormats = ClipboardFormat.Text | ClipboardFormat.Image,
KernelFunctionDescription = "Takes user instructions and applies them to the current clipboard content (text or image). Use this function for image analysis, description, or transformation tasks beyond simple OCR.",
RequiresPrompt = true)]
CustomTextTransformation,
}

View File

@@ -40,15 +40,15 @@ namespace AdvancedPaste.Services.CustomActions
this.userSettings = userSettings;
}
public async Task<CustomActionTransformResult> TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress<double> progress)
public async Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress<double> progress)
{
var pasteConfig = userSettings?.PasteAIConfiguration;
var providerConfig = BuildProviderConfig(pasteConfig);
return await TransformAsync(prompt, inputText, providerConfig, cancellationToken, progress);
return await TransformAsync(prompt, inputText, imageBytes, providerConfig, cancellationToken, progress);
}
private async Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress<double> progress)
private async Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, byte[] imageBytes, PasteAIConfig providerConfig, CancellationToken cancellationToken, IProgress<double> progress)
{
ArgumentNullException.ThrowIfNull(providerConfig);
@@ -57,9 +57,9 @@ namespace AdvancedPaste.Services.CustomActions
return new CustomActionTransformResult(string.Empty, AIServiceUsage.None);
}
if (string.IsNullOrWhiteSpace(inputText))
if (string.IsNullOrWhiteSpace(inputText) && imageBytes is null)
{
Logger.LogWarning("Clipboard has no usable text data");
Logger.LogWarning("Clipboard has no usable data");
return new CustomActionTransformResult(string.Empty, AIServiceUsage.None);
}
@@ -80,6 +80,8 @@ namespace AdvancedPaste.Services.CustomActions
{
Prompt = prompt,
InputText = inputText,
ImageBytes = imageBytes,
ImageMimeType = imageBytes != null ? "image/png" : null,
SystemPrompt = systemPrompt,
};

View File

@@ -12,6 +12,6 @@ namespace AdvancedPaste.Services.CustomActions
{
public interface ICustomActionTransformService
{
Task<CustomActionTransformResult> TransformTextAsync(string prompt, string inputText, CancellationToken cancellationToken, IProgress<double> progress);
Task<CustomActionTransformResult> TransformAsync(string prompt, string inputText, byte[] imageBytes, CancellationToken cancellationToken, IProgress<double> progress);
}
}

View File

@@ -12,6 +12,10 @@ namespace AdvancedPaste.Services.CustomActions
public string InputText { get; init; }
public byte[] ImageBytes { get; init; }
public string ImageMimeType { get; init; }
public string SystemPrompt { get; init; }
public AIServiceUsage Usage { get; set; } = AIServiceUsage.None;

View File

@@ -64,21 +64,13 @@ namespace AdvancedPaste.Services.CustomActions
var prompt = request.Prompt;
var inputText = request.InputText;
if (string.IsNullOrWhiteSpace(prompt) || string.IsNullOrWhiteSpace(inputText))
var imageBytes = request.ImageBytes;
if (string.IsNullOrWhiteSpace(prompt) || (string.IsNullOrWhiteSpace(inputText) && imageBytes is null))
{
throw new ArgumentException("Prompt and input text must be provided", nameof(request));
throw new ArgumentException("Prompt and input content must be provided", nameof(request));
}
var userMessageContent = $"""
User instructions:
{prompt}
Clipboard Content:
{inputText}
Output:
""";
var executionSettings = CreateExecutionSettings();
var kernel = CreateKernel();
var modelId = _config.Model;
@@ -102,7 +94,32 @@ namespace AdvancedPaste.Services.CustomActions
var chatHistory = new ChatHistory();
chatHistory.AddSystemMessage(systemPrompt);
chatHistory.AddUserMessage(userMessageContent);
if (imageBytes != null)
{
var collection = new ChatMessageContentItemCollection();
if (!string.IsNullOrWhiteSpace(inputText))
{
collection.Add(new TextContent($"Clipboard Content:\n{inputText}"));
}
collection.Add(new ImageContent(imageBytes, request.ImageMimeType ?? "image/png"));
collection.Add(new TextContent($"User instructions:\n{prompt}\n\nOutput:"));
chatHistory.AddUserMessage(collection);
}
else
{
var userMessageContent = $"""
User instructions:
{prompt}
Clipboard Content:
{inputText}
Output:
""";
chatHistory.AddUserMessage(userMessageContent);
}
var response = await chatService.GetChatMessageContentAsync(chatHistory, executionSettings, kernel, cancellationToken);
chatHistory.Add(response);

View File

@@ -67,12 +67,36 @@ public abstract class KernelServiceBase(
LogResult(cacheUsed, isSavedQuery, kernel.GetOrAddActionChain(), usage);
var outputPackage = kernel.GetDataPackage();
var hasUsableData = await outputPackage.GetView().HasUsableDataAsync();
if (kernel.GetLastError() is Exception ex)
{
throw ex;
// If we have an error, but the AI provided a final text response, we can ignore the error (likely a tool failure that the AI handled).
// However, if we have usable data (e.g. from a successful tool call before the error?), we might want to keep it?
// In the case of ImageToText failure, outputPackage is empty (new DataPackage), hasUsableData is false.
// So we check if there is a valid response in the chat history.
var lastMessage = chatHistory.LastOrDefault();
bool hasAssistantResponse = lastMessage != null && lastMessage.Role == AuthorRole.Assistant && !string.IsNullOrEmpty(lastMessage.Content);
if (!hasAssistantResponse && !hasUsableData)
{
throw ex;
}
// If we have a response or data, we log the error but proceed.
Logger.LogWarning($"Kernel operation encountered an error but proceeded with available response/data: {ex.Message}");
}
var outputPackage = kernel.GetDataPackage();
if (!hasUsableData)
{
var lastMessage = chatHistory.LastOrDefault();
if (lastMessage != null && lastMessage.Role == AuthorRole.Assistant && !string.IsNullOrEmpty(lastMessage.Content))
{
outputPackage = DataPackageHelpers.CreateFromText(lastMessage.Content);
kernel.SetDataPackage(outputPackage);
}
}
if (!(await outputPackage.GetView().HasUsableDataAsync()))
{
@@ -148,7 +172,21 @@ public abstract class KernelServiceBase(
var systemPrompt = string.IsNullOrWhiteSpace(runtimeConfig.SystemPrompt) ? DefaultSystemPrompt : runtimeConfig.SystemPrompt;
chatHistory.AddSystemMessage(systemPrompt);
chatHistory.AddSystemMessage($"Available clipboard formats: {await kernel.GetDataFormatsAsync()}");
chatHistory.AddUserMessage(prompt);
var imageBytes = await kernel.GetDataPackageView().GetImageAsPngBytesAsync();
if (imageBytes != null)
{
var collection = new ChatMessageContentItemCollection
{
new TextContent(prompt),
new ImageContent(imageBytes, "image/png"),
};
chatHistory.AddUserMessage(collection);
}
else
{
chatHistory.AddUserMessage(prompt);
}
if (ShouldModerateAdvancedAI())
{
@@ -302,8 +340,16 @@ public abstract class KernelServiceBase(
new ActionChainItem(PasteFormats.CustomTextTransformation, Arguments: new() { { PromptParameterName, fixedPrompt } }),
async dataPackageView =>
{
var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
var result = await _customActionTransformService.TransformTextAsync(fixedPrompt, input, kernel.GetCancellationToken(), kernel.GetProgress());
var imageBytes = await dataPackageView.GetImageAsPngBytesAsync();
var input = await dataPackageView.GetTextOrHtmlTextAsync();
if (string.IsNullOrEmpty(input) && imageBytes == null)
{
// If we have no text and no image, try to get text via OCR or throw if nothing exists
input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
}
var result = await _customActionTransformService.TransformAsync(fixedPrompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress());
return DataPackageHelpers.CreateFromText(result?.Content ?? string.Empty);
});
@@ -313,15 +359,22 @@ public abstract class KernelServiceBase(
new ActionChainItem(format, Arguments: new() { { PromptParameterName, prompt } }),
async dataPackageView =>
{
var input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
string output = await GetPromptBasedOutput(format, prompt, input, kernel.GetCancellationToken(), kernel.GetProgress());
var imageBytes = await dataPackageView.GetImageAsPngBytesAsync();
var input = await dataPackageView.GetTextOrHtmlTextAsync();
if (string.IsNullOrEmpty(input) && imageBytes == null)
{
input = await dataPackageView.GetClipboardTextOrThrowAsync(kernel.GetCancellationToken());
}
string output = await GetPromptBasedOutput(format, prompt, input, imageBytes, kernel.GetCancellationToken(), kernel.GetProgress());
return DataPackageHelpers.CreateFromText(output);
});
private async Task<string> GetPromptBasedOutput(PasteFormats format, string prompt, string input, CancellationToken cancellationToken, IProgress<double> progress) =>
private async Task<string> GetPromptBasedOutput(PasteFormats format, string prompt, string input, byte[] imageBytes, CancellationToken cancellationToken, IProgress<double> progress) =>
format switch
{
PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformTextAsync(prompt, input, cancellationToken, progress))?.Content ?? string.Empty,
PasteFormats.CustomTextTransformation => (await _customActionTransformService.TransformAsync(prompt, input, imageBytes, cancellationToken, progress))?.Content ?? string.Empty,
_ => throw new ArgumentException($"Unsupported format {format} for prompt transform", nameof(format)),
};

View File

@@ -37,7 +37,7 @@ public sealed class PasteFormatExecutor(IKernelService kernelService, ICustomAct
pasteFormat.Format switch
{
PasteFormats.KernelQuery => await _kernelService.TransformClipboardAsync(pasteFormat.Prompt, clipboardData, pasteFormat.IsSavedQuery, cancellationToken, progress),
PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformTextAsync(pasteFormat.Prompt, await clipboardData.GetClipboardTextOrThrowAsync(cancellationToken), cancellationToken, progress))?.Content ?? string.Empty),
PasteFormats.CustomTextTransformation => DataPackageHelpers.CreateFromText((await _customActionTransformService.TransformAsync(pasteFormat.Prompt, await clipboardData.GetTextOrHtmlTextAsync(), await clipboardData.GetImageAsPngBytesAsync(), cancellationToken, progress))?.Content ?? string.Empty),
_ => await TransformHelpers.TransformAsync(format, clipboardData, cancellationToken, progress),
});
}

View File

@@ -45,6 +45,7 @@ namespace AdvancedPaste.ViewModels
private CancellationTokenSource _pasteActionCancellationTokenSource;
private string _currentClipboardHistoryId;
private uint _lastClipboardSequenceNumber;
private DateTimeOffset? _currentClipboardTimestamp;
private ClipboardFormat _lastClipboardFormats = ClipboardFormat.None;
private bool _clipboardHistoryUnavailableLogged;
@@ -455,6 +456,7 @@ namespace AdvancedPaste.ViewModels
{
ResetClipboardPreview();
_currentClipboardHistoryId = null;
_lastClipboardSequenceNumber = 0;
_currentClipboardTimestamp = null;
_lastClipboardFormats = ClipboardFormat.None;
return;
@@ -477,6 +479,13 @@ namespace AdvancedPaste.ViewModels
{
bool clipboardChanged = formatsChanged;
var currentSequenceNumber = NativeMethods.GetClipboardSequenceNumber();
if (_lastClipboardSequenceNumber != currentSequenceNumber)
{
clipboardChanged = true;
_lastClipboardSequenceNumber = currentSequenceNumber;
}
if (Clipboard.IsHistoryEnabled())
{
try