/// <summary>
/// Finds the first occurrence of <paramref name="punctuationMark"/> in <paramref name="source"/>
/// whose surrounding whitespace matches the mark's <c>InnerOption</c>.
/// </summary>
/// <remarks>
/// Occurrences are examined left to right and do not overlap. The start and the end of
/// <paramref name="source"/> count as whitespace. An occurrence that begins exactly where the
/// previously rejected occurrence ended is also treated as having whitespace on its left,
/// because the left context is judged against the text that remains after that occurrence.
/// </remarks>
/// <param name="source">Text to search.</param>
/// <param name="punctuationMark">
/// Mark to look for: its <c>StringValue</c> is the text that is searched and its
/// <c>InnerOption</c> selects the spacing an occurrence must have.
/// </param>
/// <returns>
/// Zero-based index of the matching occurrence inside <paramref name="source"/>, or -1 when no
/// occurrence has the required spacing or the mark text is null or empty.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="source"/> is null.</exception>
public static int IndexOfPunctuationMark(this string source, PunctuationMark punctuationMark)
{
    if (source == null)
    {
        throw new System.ArgumentNullException("source");
    }

    var markText = punctuationMark.StringValue;

    // An empty mark "matches" at every position without consuming any text, so the scan
    // below could never advance. Report it as not found instead of looping forever.
    if (string.IsNullOrEmpty(markText))
    {
        return -1;
    }

    var searchStart = 0;
    while (searchStart < source.Length)
    {
        // Ordinal search: punctuation has to match character for character, regardless of
        // the current culture's collation rules.
        var markIndex = source.IndexOf(markText, searchStart, System.StringComparison.Ordinal);
        if (markIndex == -1)
        {
            break;
        }

        var afterMark = markIndex + markText.Length;

        // "Left of the mark" is judged within the text that remains after the previous
        // occurrence, so an occurrence sitting right at searchStart has an empty left side.
        var hasLeftSpace = markIndex == searchStart || char.IsWhiteSpace(source[markIndex - 1]);
        var hasRightSpace = afterMark == source.Length || char.IsWhiteSpace(source[afterMark]);

        bool spacingMatches;
        switch (punctuationMark.InnerOption)
        {
            case SingleTextElementInnerOption.None:
                spacingMatches = !hasLeftSpace && !hasRightSpace;
                break;
            case SingleTextElementInnerOption.LeftSpace:
                spacingMatches = hasLeftSpace && !hasRightSpace;
                break;
            case SingleTextElementInnerOption.RightSpace:
                spacingMatches = !hasLeftSpace && hasRightSpace;
                break;
            case SingleTextElementInnerOption.BothSpace:
                spacingMatches = hasLeftSpace && hasRightSpace;
                break;
            default:
                spacingMatches = false;
                break;
        }

        if (spacingMatches)
        {
            return markIndex;
        }

        // Continue behind this occurrence; occurrences are never allowed to overlap.
        searchStart = afterMark;
    }

    return -1;
}
/// <summary>
/// Scans <paramref name="source"/> for runs of consecutive punctuation characters and yields,
/// for each run, the matching entry of <paramref name="punctuationMarks"/>.
/// </summary>
/// <remarks>
/// A run is a maximal sequence of characters for which <c>char.IsPunctuation</c> is true.
/// For every run the spacing is classified from the character directly before and directly
/// after it (the start and the end of <paramref name="source"/> count as whitespace). The run
/// is reported only when some mark has exactly the same <c>StringValue</c> and the same
/// <c>InnerOption</c>; runs without such a mark are skipped. Execution is deferred until the
/// result is enumerated.
/// </remarks>
/// <param name="source">Text to scan.</param>
/// <param name="punctuationMarks">Known marks a run may be resolved to.</param>
/// <returns>The known marks found in <paramref name="source"/>, in order of appearance.</returns>
public static IEnumerable<PunctuationMark> GetPunctuationMarks(this string source, IEnumerable<PunctuationMark> punctuationMarks)
{
    // Longest marks first, as before. The query object is built up front (so a null
    // collection still fails on first enumeration) but is sorted only once, the first time
    // a run actually has to be resolved, instead of on every lookup.
    var orderedQuery = punctuationMarks.OrderByDescending(mark => mark.StringValue.Length);
    List<PunctuationMark> orderedMarks = null;

    var runStart = -1;          // index of the first character of the current run, -1 when outside a run
    var hasLeftSpace = false;   // spacing on the left of the current run

    // index == source.Length is a virtual "end of text" position that closes a trailing run.
    for (var index = 0; index <= source.Length; index++)
    {
        var atEnd = index == source.Length;

        if (!atEnd && char.IsPunctuation(source[index]))
        {
            if (runStart < 0)
            {
                runStart = index;
                hasLeftSpace = index == 0 || char.IsWhiteSpace(source[index - 1]);
            }

            continue;
        }

        if (runStart < 0)
        {
            continue;
        }

        // The run ended just before the current position, so the character at index is
        // the first one on its right. (The previous offset arithmetic was only correct for
        // one-character runs and could read past the end of the string for longer ones.)
        var runText = source.Substring(runStart, index - runStart);
        var hasRightSpace = atEnd || char.IsWhiteSpace(source[index]);
        runStart = -1;

        if (orderedMarks == null)
        {
            orderedMarks = orderedQuery.ToList();
        }

        PunctuationMark knownMark;
        if (TryFindKnownMark(orderedMarks, runText, ClassifySpacing(hasLeftSpace, hasRightSpace), out knownMark))
        {
            yield return knownMark;
        }
    }
}

// Maps the whitespace found on each side of a punctuation run to the corresponding inner option.
private static SingleTextElementInnerOption ClassifySpacing(bool hasLeftSpace, bool hasRightSpace)
{
    if (hasLeftSpace)
    {
        return hasRightSpace ? SingleTextElementInnerOption.BothSpace : SingleTextElementInnerOption.LeftSpace;
    }

    return hasRightSpace ? SingleTextElementInnerOption.RightSpace : SingleTextElementInnerOption.None;
}

// Looks up the first candidate with the given text and inner option; returns false when there is none.
private static bool TryFindKnownMark(IEnumerable<PunctuationMark> candidates, string text, SingleTextElementInnerOption innerOption, out PunctuationMark found)
{
    foreach (var candidate in candidates)
    {
        if (candidate.StringValue == text && candidate.InnerOption == innerOption)
        {
            found = candidate;
            return true;
        }
    }

    found = default(PunctuationMark);
    return false;
}
/// <summary>
/// Tells whether <paramref name="source"/> contains <paramref name="punctuationMark"/>,
/// i.e. whether <c>IndexOfPunctuationMark</c> reports a position other than -1 for it.
/// </summary>
/// <param name="source">Text to search.</param>
/// <param name="punctuationMark">Mark to look for.</param>
/// <returns><c>true</c> when the mark is found; otherwise <c>false</c>.</returns>
public static bool ContainsPunctuationMark(this string source, PunctuationMark punctuationMark)
{
    var foundAt = source.IndexOfPunctuationMark(punctuationMark);
    var isFound = foundAt != -1;
    return isFound;
}
/// <summary>
/// Builds a <see cref="Sentence"/> from the text of a single sentence.
/// </summary>
/// <remarks>
/// The text is split on single space characters. A part without any known internal
/// punctuation mark becomes one <see cref="Word"/> with a right space. A part that contains
/// marks from <c>DefaultPunctuationMarks.InternalPunctuationMarks</c> is broken up: a mark at
/// the very start or very end of the remaining part text becomes a sentence element of its
/// own, while a mark in the middle is collected together with the neighbouring words into one
/// <see cref="CompositeWord"/>. The last element produced for a part gets a right space.
/// When <paramref name="endMark"/> has a value, a right space on the last element of the
/// sentence is cleared and the end mark is appended.
/// </remarks>
/// <param name="source">Sentence text without its terminal mark.</param>
/// <param name="endMark">Terminal mark to append; ignored when its <c>HasValue</c> is false.</param>
/// <returns>The parsed sentence.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="source"/> is null.</exception>
public static Sentence ParseSentenceString(string source, PunctuationMark endMark)
{
    if (source == null)
    {
        throw new System.ArgumentNullException("source");
    }

    var sentence = new Sentence();

    // Consecutive spaces yield empty parts; they are kept and become empty words.
    foreach (var part in source.Split(new char[] { ' ' }))
    {
        // GetPunctuationMarks is a lazy iterator: take one snapshot instead of scanning the
        // part once for the emptiness check and a second time for the loop below.
        var innerPunctuationMarks = part.GetPunctuationMarks(DefaultPunctuationMarks.InternalPunctuationMarks).ToList();

        if (innerPunctuationMarks.Count == 0)
        {
            var plainWord = new Word() { StringValue = part, InnerOption = SingleTextElementInnerOption.RightSpace };
            sentence.Add(plainWord);
            continue;
        }

        var sentenceParts = new List<ISingleTextElement>();
        var compositeWord = new CompositeWord();
        var remaining = part;   // text of the part that has not been consumed yet

        foreach (var mark in innerPunctuationMarks)
        {
            var markLength = mark.StringValue.Length;
            var index = remaining.IndexOfPunctuationMark(mark);

            // The mark was detected in the whole part but may not be locatable in what is
            // left of it. Skip it instead of failing in Substring; its text stays in
            // "remaining" and ends up in a following word.
            if (index < 0)
            {
                continue;
            }

            var leftPart = remaining.Substring(0, index);
            if (!string.IsNullOrEmpty(leftPart))
            {
                var word = new Word() { StringValue = leftPart };
                compositeWord.Add(word);
                AddPartOnce(sentenceParts, compositeWord);
            }

            // A mark with text on both sides belongs to the composite word; a mark at either
            // edge of the remaining text stands on its own.
            var isInsideWord = index != 0 && index + markLength < remaining.Length;
            if (isInsideWord)
            {
                compositeWord.Add(mark);
                AddPartOnce(sentenceParts, compositeWord);
            }
            else
            {
                sentenceParts.Add(mark);
            }

            remaining = remaining.Substring(index + markLength);
        }

        if (!string.IsNullOrEmpty(remaining))
        {
            var word = new Word() { StringValue = remaining };
            compositeWord.Add(word);
            AddPartOnce(sentenceParts, compositeWord);
        }

        // NOTE(review): the marks in sentenceParts are the instances yielded from
        // DefaultPunctuationMarks.InternalPunctuationMarks. If PunctuationMark is a reference
        // type, assigning InnerOption here (and below for the end mark) alters the shared
        // default entry - confirm whether a copy should be added to the sentence instead.
        var lastPart = sentenceParts.Count > 0 ? sentenceParts[sentenceParts.Count - 1] : null;
        foreach (var sentencePart in sentenceParts)
        {
            if (sentencePart == lastPart)
            {
                sentencePart.InnerOption = SingleTextElementInnerOption.RightSpace;
            }

            sentence.Add(sentencePart);
        }
    }

    if (endMark.HasValue)
    {
        // The terminal mark attaches directly to the preceding element, so drop its right space.
        var lastElement = sentence.Last();
        if (lastElement.InnerOption == SingleTextElementInnerOption.RightSpace)
        {
            lastElement.InnerOption = SingleTextElementInnerOption.None;
        }

        sentence.Add(endMark);
    }

    return sentence;
}

// Appends the element to the list unless the list already contains it.
private static void AddPartOnce(List<ISingleTextElement> parts, ISingleTextElement element)
{
    if (!parts.Contains(element))
    {
        parts.Add(element);
    }
}
/// <summary>
/// Fills the default mark definitions: the space mark, the terminal punctuation marks and
/// the internal punctuation marks. Each mark is registered with its text, its type and the
/// spacing (inner option) it is expected to appear with; the same text may be registered
/// more than once with different spacing.
/// </summary>
static DefaultPunctuationMarks()
{
    _spaceMark = new PunctuationMark(" ", PunctuationMarkType.SpaceMark, SingleTextElementInnerOption.None);

    // Terminal marks: all of them are registered with a right space.
    var terminalMarks = new Collection<PunctuationMark>();
    terminalMarks.Add(new PunctuationMark(".", PunctuationMarkType.EndMark, SingleTextElementInnerOption.RightSpace));
    terminalMarks.Add(new PunctuationMark("?", PunctuationMarkType.QuestionMark, SingleTextElementInnerOption.RightSpace));
    terminalMarks.Add(new PunctuationMark("!", PunctuationMarkType.ExclamationMark, SingleTextElementInnerOption.RightSpace));
    terminalMarks.Add(new PunctuationMark("...", PunctuationMarkType.UnfinishedMark, SingleTextElementInnerOption.RightSpace));
    terminalMarks.Add(new PunctuationMark("?!", PunctuationMarkType.AccentMark, SingleTextElementInnerOption.RightSpace));
    terminalMarks.Add(new PunctuationMark("!?", PunctuationMarkType.AccentMark, SingleTextElementInnerOption.RightSpace));
    _terminalPunctuationMarks = terminalMarks;

    // Internal marks: ".", "-" and the double quote are listed once per spacing variant.
    var internalMarks = new Collection<PunctuationMark>();
    internalMarks.Add(new PunctuationMark(",", PunctuationMarkType.CommaMark, SingleTextElementInnerOption.RightSpace));
    internalMarks.Add(new PunctuationMark(".", PunctuationMarkType.AbreviationMark, SingleTextElementInnerOption.None));
    internalMarks.Add(new PunctuationMark(".", PunctuationMarkType.AbreviationMark, SingleTextElementInnerOption.RightSpace));
    internalMarks.Add(new PunctuationMark(";", PunctuationMarkType.EnumerationMark, SingleTextElementInnerOption.RightSpace));
    internalMarks.Add(new PunctuationMark(":", PunctuationMarkType.GeneralizationMark, SingleTextElementInnerOption.RightSpace));
    internalMarks.Add(new PunctuationMark("-", PunctuationMarkType.ConnectionMark, SingleTextElementInnerOption.None));
    internalMarks.Add(new PunctuationMark("-", PunctuationMarkType.IllustrationMark, SingleTextElementInnerOption.BothSpace));
    internalMarks.Add(new PunctuationMark("(", PunctuationMarkType.AdditionalMark, SingleTextElementInnerOption.LeftSpace));
    internalMarks.Add(new PunctuationMark(")", PunctuationMarkType.AdditionalMark, SingleTextElementInnerOption.RightSpace));
    internalMarks.Add(new PunctuationMark("\"", PunctuationMarkType.CitationMark, SingleTextElementInnerOption.LeftSpace));
    internalMarks.Add(new PunctuationMark("\"", PunctuationMarkType.CitationMark, SingleTextElementInnerOption.RightSpace));
    _internalPunctuationMarks = internalMarks;
}