/// <summary>
/// Creates a parser that works on a clone of <paramref name="punctuationContainer"/> and
/// pre-builds the text, sentence and expression regular expressions from its separators.
/// </summary>
/// <param name="punctuationContainer">Supplies the sentence separators and the syntactic-construction separators.</param>
/// <exception cref="ArgumentNullException"><paramref name="punctuationContainer"/> is null.</exception>
/// <exception cref="ArgumentException">Cloning the container did not yield an IPunctuationContainer.</exception>
/// <exception cref="FormatException">A separator is empty or contains a space.</exception>
public SimpleTextParser(IPunctuationContainer punctuationContainer)
{
    if (punctuationContainer == null)
    {
        throw new ArgumentNullException(nameof(punctuationContainer));
    }

    // Work on a clone so the regexes are built from the parser's own container instance.
    var punctConCopy = punctuationContainer.Clone() as IPunctuationContainer;
    if (punctConCopy == null)
    {
        // Previously this case fell through and failed later with a NullReferenceException.
        throw new ArgumentException(
            "Clone() must return an IPunctuationContainer instance",
            nameof(punctuationContainer));
    }

    if (IsPunctuationInvalid(punctConCopy))
    {
        throw new FormatException("Punctuation must not contain spaces or be empty");
    }

    _punctuationContainer = punctConCopy;

    // Longest separators first, so that in an alternation a longer separator is tried
    // before any shorter one. Materialized once instead of re-sorting on every use.
    string[] sentenceSeparators = PunctuationContainer.SentencesSeparators
        .OrderByDescending(x => x.Length)
        .ToArray();
    string[] constructionSeparators = PunctuationContainer.SyntacticConstructionsSeparators
        .OrderByDescending(x => x.Length)
        .ToArray();

    // "a|b|c" forms are used as alternations; the concatenated forms are used inside "[...]".
    string sentenceAlternation = String.Join("|", sentenceSeparators.Select(Regex.Escape).ToArray());
    string sentenceClass = String.Join("", sentenceSeparators.Select(EscapeForCharacterClass).ToArray());
    string constructionAlternation = String.Join("|", constructionSeparators.Select(Regex.Escape).ToArray());
    string constructionClass = String.Join("", constructionSeparators.Select(EscapeForCharacterClass).ToArray());

    // NOTE(review): empty separator lists are not rejected by the validation above; they would
    // produce an empty alternation and an empty "[]" class in the patterns below - confirm
    // whether that case can occur.

    // set _textParserRegex: lazily consume text up to a sentence separator that is
    // followed by whitespace or the end of the input.
    string textPattern = $"(.+?({sentenceAlternation}))(?=(\\s+|$))";
    _textParserRegex = new Regex(textPattern);

    // set _sentenceParserRegex / _expressionParserRegex: both use the same token pattern -
    // a run of non-separator, non-whitespace characters (optionally with sentence-separator
    // characters in the middle), a whitespace run, or one of the separators.
    string tokenPattern =
        $"(([^{constructionClass}{sentenceClass}\\s]*[{sentenceClass}]*[^{constructionClass}{sentenceClass}\\s]+)|\\s+|{constructionAlternation}|{sentenceAlternation})";
    _sentenceParserRegex = new Regex(tokenPattern);
    _expressionParserRegex = new Regex(tokenPattern);
}

/// <summary>
/// Escapes <paramref name="separator"/> so it can be embedded inside a regex character class.
/// </summary>
/// <param name="separator">Raw separator text.</param>
/// <returns>The separator with regex metacharacters, ']' and '-' escaped.</returns>
private static string EscapeForCharacterClass(string separator)
{
    // Regex.Escape leaves ']' and '-' untouched, but inside "[...]" they terminate the
    // class and create ranges respectively, so they need an explicit backslash here.
    return Regex.Escape(separator).Replace("]", "\\]").Replace("-", "\\-");
}
/// <summary>
/// Checks whether any sentence separator or syntactic-construction separator in
/// <paramref name="punctuation"/> is unusable.
/// </summary>
/// <param name="punctuation">Container whose separators are inspected.</param>
/// <returns>
/// <c>true</c> if at least one separator is null, empty, or contains a space character;
/// otherwise <c>false</c>.
/// </returns>
private static bool IsPunctuationInvalid(IPunctuationContainer punctuation)
{
    // A null entry is treated as invalid instead of failing with a NullReferenceException
    // on the Contains call.
    Func<string, bool> isInvalidSeparator = sep => string.IsNullOrEmpty(sep) || sep.Contains(" ");

    return punctuation.SentencesSeparators.Any(isInvalidSeparator)
        || punctuation.SyntacticConstructionsSeparators.Any(isInvalidSeparator);
}