mirror of
https://github.com/microsoft/PowerToys.git
synced 2025-12-16 11:48:06 +01:00
[TextExtractor]Fix error blanks in Japanese OCR (#22443)
* Fix erroneous blanks in Japanese OCR. Kanji, Hiragana, Katakana, and Hankaku-Katakana do not need blanks between characters (not only the range of CJKUnifiedIdeographs). Maybe there are more symbols that don't require spaces, like \u3001 and \u3002, but leaving that to the OCR engine to improve may be a better choice. * Update ImageMethods.cs, fixing spelling * Update expect.txt, adding Hankaku * Update ImageMethods.cs
This commit is contained in:
1
.github/actions/spell-check/expect.txt
vendored
1
.github/actions/spell-check/expect.txt
vendored
@@ -618,6 +618,7 @@ HACCEL
|
|||||||
handlekeyboardhookevent
|
handlekeyboardhookevent
|
||||||
handlerroutine
|
handlerroutine
|
||||||
hangeul
|
hangeul
|
||||||
|
Hankaku
|
||||||
hanselman
|
hanselman
|
||||||
Hanzi
|
Hanzi
|
||||||
Hardlines
|
Hardlines
|
||||||
|
|||||||
@@ -147,7 +147,10 @@ internal class ImageMethods
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}");
|
// Kanji, Hiragana, Katakana, and Hankaku-Katakana do not need blanks between characters (not only the symbols in CJKUnifiedIdeographs).
|
||||||
|
// Maybe there are more symbols that don't require spaces like \u3001 \u3002.
|
||||||
|
// var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}|\p{IsHiragana}|\p{IsKatakana}|[\uFF61-\uFF9F]|[\u3000-\u3003]");
|
||||||
|
var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}|\p{IsHiragana}|\p{IsKatakana}|[\uFF61-\uFF9F]");
|
||||||
|
|
||||||
foreach (OcrLine ocrLine in ocrResult.Lines)
|
foreach (OcrLine ocrLine in ocrResult.Lines)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user