mirror of
https://github.com/microsoft/PowerToys.git
synced 2026-04-03 17:56:44 +02:00
[TextExtractor]Fix error blanks in Japanese OCR (#22443)
* Fix erroneous blanks in Japanese OCR: Kanji, Hiragana, Katakana, and Hankaku-Katakana do not need blanks between characters (not only the characters in the CJKUnifiedIdeographs range). There may be more symbols that don't require spaces, such as \u3001 and \u3002, but leaving those to the OCR engine to improve may be a better choice. * Update ImageMethods.cs: fix spelling * Update expect.txt: add Hankaku * Update ImageMethods.cs
This commit is contained in:
@@ -147,7 +147,10 @@ internal class ImageMethods
|
||||
}
|
||||
else
|
||||
{
|
||||
var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}");
|
||||
// Kanji, Hiragana, Katakana, and Hankaku-Katakana do not need blanks between characters (not only the symbols in CJKUnifiedIdeographs).
|
||||
// There may be more symbols that don't require spaces, such as \u3001 and \u3002.
|
||||
// var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}|\p{IsHiragana}|\p{IsKatakana}|[\uFF61-\uFF9F]|[\u3000-\u3003]");
|
||||
var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}|\p{IsHiragana}|\p{IsKatakana}|[\uFF61-\uFF9F]");
|
||||
|
||||
foreach (OcrLine ocrLine in ocrResult.Lines)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user