[AdvPaste]Fix CSV parser supporting escape delimiter by enclosing in double quotes (#33874)

## Summary of the Pull Request
This PR fixes the CSV parser so that a delimiter can be escaped by
enclosing the field that contains it in double quotes.

## Detailed Description of the Pull Request / Additional comments
- This PR fixes the handling of a delimiter that appears inside a field
value: the delimiter is treated as literal text when the field is enclosed in `"`.
This commit is contained in:
Vaibhav Sharma
2024-07-23 18:59:33 +05:30
committed by GitHub
parent 07c4972c2c
commit 3652e3627a
2 changed files with 13 additions and 8 deletions

View File

@@ -1064,6 +1064,7 @@ numberbox
nwc nwc
Objbase Objbase
objidl objidl
occurrence
ocr ocr
Ocrsettings Ocrsettings
odbccp odbccp

View File

@@ -24,6 +24,9 @@ namespace AdvancedPaste.Helpers
private static readonly char[] CsvDelimArry = [',', ';', '\t']; private static readonly char[] CsvDelimArry = [',', ';', '\t'];
private static readonly Regex CsvSepIdentifierRegex = new Regex(@"^sep=(.)$", RegexOptions.IgnoreCase); private static readonly Regex CsvSepIdentifierRegex = new Regex(@"^sep=(.)$", RegexOptions.IgnoreCase);
// Split on every occurrence of the delimiter except if it is enclosed by " and ignore two " as escaped "
private static readonly string CsvDelimSepRegexStr = @"(?=(?:[^""]*""[^""]*"")*(?![^""]*""))";
internal static string ToJsonFromXmlOrCsv(DataPackageView clipboardData) internal static string ToJsonFromXmlOrCsv(DataPackageView clipboardData)
{ {
Logger.LogTrace(); Logger.LogTrace();
@@ -146,10 +149,11 @@ namespace AdvancedPaste.Helpers
continue; continue;
} }
// A CSV line is valid, if the delimiter occurs more or equal times in every line compared to the first data line. (More because sometimes the delimiter occurs in a data string.) // A CSV line is valid, if the delimiter occurs equal times in every line compared to the first data line
if (line.Count(x => x == delim) >= delimCount) // and if every line contains no or an even count of quotation marks.
if (Regex.Count(line, delim + CsvDelimSepRegexStr) == delimCount && int.IsEvenInteger(line.Count(x => x == '"')))
{ {
csv.Add(line.Split(delim)); csv.Add(Regex.Split(line, delim + CsvDelimSepRegexStr, RegexOptions.IgnoreCase));
} }
else else
{ {
@@ -205,7 +209,7 @@ namespace AdvancedPaste.Helpers
// We get the count from the second line, as the first one only contains the character definition and not a CSV data line. // We get the count from the second line, as the first one only contains the character definition and not a CSV data line.
char delimChar = matchChar.Groups[1].Value.Trim()[0]; char delimChar = matchChar.Groups[1].Value.Trim()[0];
delimiter = delimChar; delimiter = delimChar;
delimiterCount = csvLines[1].Count(x => x == delimChar); delimiterCount = Regex.Count(csvLines[1], delimChar + CsvDelimSepRegexStr, RegexOptions.IgnoreCase);
} }
} }
@@ -214,19 +218,19 @@ namespace AdvancedPaste.Helpers
// Try to select the correct delimiter based on the first two CSV lines from a list of predefined delimiters. // Try to select the correct delimiter based on the first two CSV lines from a list of predefined delimiters.
foreach (char c in CsvDelimArry) foreach (char c in CsvDelimArry)
{ {
int cntFirstLine = csvLines[0].Count(x => x == c); int cntFirstLine = Regex.Count(csvLines[0], c + CsvDelimSepRegexStr, RegexOptions.IgnoreCase);
int cntNextLine = 0; // Default to 0 that the 'second line' check is always true. int cntNextLine = 0; // Default to 0 that the 'second line' check is always true.
// Additional count if we have more than one line // Additional count if we have more than one line
if (csvLines.Length >= 2) if (csvLines.Length >= 2)
{ {
cntNextLine = csvLines[1].Count(x => x == c); cntNextLine = Regex.Count(csvLines[1], c + CsvDelimSepRegexStr, RegexOptions.IgnoreCase);
} }
// The delimiter is found if the count is bigger as from the last selected delimiter // The delimiter is found if the count is bigger as from the last selected delimiter
// and if the next csv line does not exist or has the same number or more occurrences of the delimiter. // and if the next csv line does not exist or has the same number of occurrences of the delimiter.
// (We check the next line to prevent false positives.) // (We check the next line to prevent false positives.)
if (cntFirstLine > delimiterCount && (cntNextLine == 0 || cntNextLine >= cntFirstLine)) if (cntFirstLine > delimiterCount && (cntNextLine == 0 || cntNextLine == cntFirstLine))
{ {
delimiter = c; delimiter = c;
delimiterCount = cntFirstLine; delimiterCount = cntFirstLine;