/// <summary>
/// Tokenizes the content into sentences, each of which is a list of words.
/// </summary>
/// <remarks>
/// Characters are consumed one at a time:
/// an end-of-sentence character (as decided by <c>AtEndOfSentence</c>) closes the current sentence,
/// whitespace closes the current word, a special character is emitted as its own word,
/// a single quote starts a new word, and any other character is appended to the current word.
/// A sentence is kept only when its word count passes the minimum/maximum checks described on the parameters.
/// </remarks>
/// <param name="Content">The text to tokenize. A null or empty string yields an empty list.</param>
/// <param name="MaximumNumberOfSentences">The maximum number of sentences that should be returned.</param>
/// <param name="MaximumWordCount">Exclusive upper bound: a sentence is kept only if its word count is strictly less than this value. If set to null there is no maximum.</param>
/// <param name="MinimumWordCount">Exclusive lower bound: a sentence is kept only if its word count is strictly greater than this value. If set to null any sentence with a word count above zero is kept.</param>
/// <param name="Mode">The mode of traversing the content that should be used. By default the content is searched in sequential order.</param>
/// <param name="IsCaseSensitive">If set to false every character is converted to lower case (invariant culture) before it is tokenized.</param>
/// <returns>The accepted sentences, at most <paramref name="MaximumNumberOfSentences"/> of them.</returns>
public static List<Sentence> Tokenize(string Content, int MaximumNumberOfSentences, int? MaximumWordCount = 9, int? MinimumWordCount = 4, TextSplitterModes Mode = TextSplitterModes.InOrder, bool IsCaseSensitive = false)
{
    List<Sentence> Sentences = new List<Sentence>();

    if (string.IsNullOrEmpty(Content))
    {
        return Sentences;
    }

    // Characters that are always emitted as their own single-character word.
    const string SpecialCharacters = ";()$@#%+-/\",=|";

    int ContentLength = Content.Length;
    int CurrentPosition = 0;
    Sentence CurrentSentence = new Sentence();
    Word CurrentWord = new Word();

    // InOrder / EquallySplit stop at the end of the content; every mode stops once enough sentences were collected.
    // NOTE(review): in Random mode the position is not bounded here, so the loop ends only when
    // MaximumNumberOfSentences sentences have been accepted, and Content[CurrentPosition] relies on
    // GetNextTextSplitterIndex returning a valid index. Content without enough qualifying sentences
    // presumably never terminates in that mode - confirm against GetNextTextSplitterIndex.
    while ((Mode == TextSplitterModes.Random
            || ((Mode == TextSplitterModes.EquallySplit || Mode == TextSplitterModes.InOrder) && CurrentPosition < ContentLength))
           && Sentences.Count < MaximumNumberOfSentences)
    {
        char CurrentCharacter = Content[CurrentPosition];
        if (!IsCaseSensitive)
        {
            // Invariant lower-casing keeps the tokens independent of the machine's current culture.
            CurrentCharacter = char.ToLowerInvariant(CurrentCharacter);
        }

        #region End Of Sentence
        if (AtEndOfSentence(CurrentCharacter))
        {
            // Only flush a word that actually has text, so consecutive terminators ("...") do not add empty words.
            if (!string.IsNullOrEmpty(CurrentWord.Text))
            {
                CurrentSentence.Words.Add(CurrentWord);
            }
            CurrentSentence.Words.Add(new Word(CurrentCharacter.ToString()));

            // Keep the sentence only if it fits the word-count criteria; otherwise it is discarded.
            if (HasAcceptableWordCount(CurrentSentence, MinimumWordCount, MaximumWordCount))
            {
                Sentences.Add(CurrentSentence);
                if (Sentences.Count >= MaximumNumberOfSentences)
                {
                    break;
                }
                CurrentPosition = GetNextTextSplitterIndex(CurrentPosition, ContentLength, Mode, MaximumNumberOfSentences);
            }

            CurrentSentence = new Sentence();
            CurrentWord = new Word();
        }
        #endregion End Of Sentence

        #region Space
        // A space, tab or line break ends the current word.
        else if (CurrentCharacter == ' ' || CurrentCharacter == '\n' || CurrentCharacter == '\r' || CurrentCharacter == '\t')
        {
            if (!string.IsNullOrEmpty(CurrentWord.Text))
            {
                CurrentSentence.Words.Add(CurrentWord);
                CurrentWord = new Word();
            }
        }
        #endregion Space

        #region Special Characters
        // A special character ends the current word and is always added as a separate word,
        // even when no word precedes it (e.g. an opening parenthesis or a leading '$').
        else if (SpecialCharacters.IndexOf(CurrentCharacter) >= 0)
        {
            if (!string.IsNullOrEmpty(CurrentWord.Text))
            {
                CurrentSentence.Words.Add(CurrentWord);
            }
            CurrentSentence.Words.Add(new Word(CurrentCharacter.ToString()));
            CurrentWord = new Word();
        }
        #endregion Special Characters

        #region Single Quote
        // A single quote ends the current word and becomes the first character of the next one.
        else if (CurrentCharacter == '\'')
        {
            if (!string.IsNullOrEmpty(CurrentWord.Text))
            {
                CurrentSentence.Words.Add(CurrentWord);
            }
            CurrentWord = new Word(CurrentCharacter.ToString());
        }
        #endregion Single Quote

        #region All Other Characters
        else
        {
            CurrentWord.Text += CurrentCharacter;
        }
        #endregion All Other Characters

        CurrentPosition++;

        #region If At End Of Content
        if (CurrentPosition >= ContentLength)
        {
            // Flush the word that was still being built so the last word of unterminated text is not lost,
            // and start from a clean word in case Random mode continues at another position.
            if (!string.IsNullOrEmpty(CurrentWord.Text))
            {
                CurrentSentence.Words.Add(CurrentWord);
            }
            CurrentWord = new Word();

            // The trailing sentence has to satisfy the same criteria as a terminated one.
            if (HasAcceptableWordCount(CurrentSentence, MinimumWordCount, MaximumWordCount))
            {
                Sentences.Add(CurrentSentence);
                if (Sentences.Count >= MaximumNumberOfSentences)
                {
                    break;
                }
            }

            // Sequential modes are finished once the content is exhausted.
            if (Mode == TextSplitterModes.EquallySplit || Mode == TextSplitterModes.InOrder)
            {
                break;
            }
            // Random mode keeps going from a newly chosen position.
            else if (Mode == TextSplitterModes.Random)
            {
                CurrentSentence = new Sentence();
                CurrentPosition = GetNextTextSplitterIndex(CurrentPosition, ContentLength, Mode, MaximumNumberOfSentences);
            }
        }
        #endregion If At End Of Content
    }

    return Sentences;
}

/// <summary>
/// Decides whether a sentence's word count lies strictly between the optional minimum and maximum.
/// </summary>
/// <param name="Candidate">The sentence to check.</param>
/// <param name="MinimumWordCount">Exclusive lower bound; when null the word count only has to be above zero.</param>
/// <param name="MaximumWordCount">Exclusive upper bound; when null there is no upper limit.</param>
/// <returns>True when the sentence should be kept.</returns>
private static bool HasAcceptableWordCount(Sentence Candidate, int? MinimumWordCount, int? MaximumWordCount)
{
    bool MeetsMinimum = (MinimumWordCount == null && Candidate.WordCount > 0) || Candidate.WordCount > MinimumWordCount;
    bool MeetsMaximum = MaximumWordCount == null || Candidate.WordCount < MaximumWordCount;
    return MeetsMinimum && MeetsMaximum;
}