mirror of
https://github.com/microsoft/PowerToys.git
synced 2026-04-09 20:57:22 +02:00
[TextExtractor] Add space between CJK words and non-CJK (#20926)
* [TextExtractor] add space between CJK words and non-CJK
This commit is contained in:
@@ -10,6 +10,7 @@ using System.Globalization;
|
|||||||
using System.IO;
|
using System.IO;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
using System.Windows;
|
using System.Windows;
|
||||||
using System.Windows.Input;
|
using System.Windows.Input;
|
||||||
@@ -146,11 +147,25 @@ internal class ImageMethods
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}");
|
||||||
|
|
||||||
foreach (OcrLine ocrLine in ocrResult.Lines)
|
foreach (OcrLine ocrLine in ocrResult.Lines)
|
||||||
{
|
{
|
||||||
|
bool isBeginning = true;
|
||||||
|
bool isCJKPrev = false;
|
||||||
foreach (OcrWord ocrWord in ocrLine.Words)
|
foreach (OcrWord ocrWord in ocrLine.Words)
|
||||||
{
|
{
|
||||||
|
bool isCJK = cjkRegex.IsMatch(ocrWord.Text);
|
||||||
|
|
||||||
|
// Separate words with a space, except between two consecutive CJK words.
|
||||||
|
if (!isBeginning && (!isCJK || !isCJKPrev))
|
||||||
|
{
|
||||||
|
_ = text.Append(' ');
|
||||||
|
}
|
||||||
|
|
||||||
_ = text.Append(ocrWord.Text);
|
_ = text.Append(ocrWord.Text);
|
||||||
|
isCJKPrev = isCJK;
|
||||||
|
isBeginning = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
text.Append(Environment.NewLine);
|
text.Append(Environment.NewLine);
|
||||||
|
|||||||
Reference in New Issue
Block a user