示例#1
0
        /// <summary>
        /// Returns the next index at which the search for a sentence should continue. The result depends on the
        /// current index and the chosen mode. This method is used solely by the Tokenize method.
        /// </summary>
        /// <param name="CurrentPosition">Index that was being examined when the jump was requested.</param>
        /// <param name="ContentLength">Length of the content, in characters.</param>
        /// <param name="Mode">How the content should be traversed.</param>
        /// <param name="MaximumNumberOfSentences">The maximum number of sentences; in EquallySplit mode it determines the slice size.</param>
        /// <returns>
        /// EquallySplit: the start index of the slice that follows the slice containing CurrentPosition (this can be
        /// at or beyond ContentLength), or CurrentPosition unchanged when no slice size can be derived.
        /// InOrder: CurrentPosition unchanged.
        /// Random: a random index that lies at least 10 characters before the end of the content, or 0 when the
        /// content is too short for that.
        /// Any other mode: 0.
        /// </returns>
        private static int GetNextTextSplitterIndex(int CurrentPosition, int ContentLength, TextSplitterModes Mode, int MaximumNumberOfSentences)
        {
            switch (Mode)
            {
            case TextSplitterModes.EquallySplit:

                // A non-positive sentence limit cannot define a slice size (and a limit of zero
                // would divide by zero), so stay where we are.
                if (MaximumNumberOfSentences <= 0)
                {
                    return(CurrentPosition);
                }

                // Size of each slice of the content; integer division, so it is 0 when the content
                // is shorter than the number of requested sentences.
                int Split = ContentLength / MaximumNumberOfSentences;
                if (Split <= 0)
                {
                    return(CurrentPosition);
                }

                // Index of the slice the current position falls into; jump to the start of the next one.
                int CurrentJump = CurrentPosition / Split;
                return((CurrentJump + 1) * Split);

            case TextSplitterModes.InOrder:

                return(CurrentPosition);

            case TextSplitterModes.Random:

                // The start index is kept 10 characters away from the end of the content.
                // Random.Next throws when the upper bound is below the lower bound, which used to
                // happen for content shorter than 10 characters; fall back to the beginning instead.
                int UpperBound = ContentLength - 10;
                if (UpperBound <= 0)
                {
                    return(0);
                }

                Random r = new Random(Guid.NewGuid().GetHashCode());
                return(r.Next(0, UpperBound));
            }

            // Unknown mode: restart from the beginning.
            return(0);
        }
示例#2
0
        /// <summary>
        /// Tokenizes the content into words and sentences.
        /// </summary>
        /// <remarks>
        /// The content is scanned one character at a time:
        /// a character accepted by AtEndOfSentence closes the current sentence and is stored as its last word;
        /// space, tab, carriage return and line feed close the current word;
        /// the characters ; ( ) $ @ # % + - / " , = | close the current word and are stored as a word of their own;
        /// a single quote closes the current word and becomes the first character of the next word;
        /// every other character is appended to the current word.
        /// A closed sentence is kept only when its WordCount is strictly greater than MinimumWordCount and strictly
        /// less than MaximumWordCount. Text left over when the end of the content is reached is checked against the
        /// minimum only.
        /// NOTE(review): in Random mode the loop ends only once MaximumNumberOfSentences sentences have been
        /// collected, so content that cannot produce that many qualifying sentences keeps the loop running.
        /// Consider adding an attempt limit.
        /// </remarks>
        /// <param name="Content">The text to tokenize. Null or empty content produces an empty list.</param>
        /// <param name="MaximumNumberOfSentences">The maximum number of sentences that should be returned</param>
        /// <param name="MaximumWordCount">Exclusive upper bound on the word count of a sentence. If set to null there is no maximum.</param>
        /// <param name="MinimumWordCount">Exclusive lower bound on the word count of a sentence. If set to null any sentence with at least one word qualifies.</param>
        /// <param name="Mode">The mode of traversing the content that should be used. By default the content is searched in sequential order.</param>
        /// <param name="IsCaseSensitive">If set to false all characters are converted to lower case before being stored.</param>
        /// <returns>The sentences that met the word count criteria, in the order in which they were found.</returns>
        public static List <Sentence> Tokenize(string Content, int MaximumNumberOfSentences,
                                               int?MaximumWordCount = 9, int?MinimumWordCount = 4, TextSplitterModes Mode = TextSplitterModes.InOrder,
                                               bool IsCaseSensitive = false)
        {
            List <Sentence> Sentences = new List <Sentence>();

            if (string.IsNullOrEmpty(Content))
            {
                return(Sentences);
            }
            int ContentLength = Content.Length;

            int      CurrentPosition = 0;
            Sentence CurrentSentence = new Sentence();
            Word     CurrentWord     = new Word();

            char CurrentCharacter;

            // Keep scanning while more sentences are wanted. Random mode wraps around at the end of the
            // content; the other two modes stop once the end of the content is reached.
            while (
                ((Mode == TextSplitterModes.Random) || ((Mode == TextSplitterModes.EquallySplit || Mode == TextSplitterModes.InOrder) && CurrentPosition < ContentLength)) &&
                Sentences.Count < MaximumNumberOfSentences
                )
            {
                // Grab the character and convert it to lower case if needed.
                CurrentCharacter = Content[CurrentPosition];
                if (!IsCaseSensitive)
                {
                    CurrentCharacter = char.ToLower(CurrentCharacter);
                }

                // The current character denotes the end of the sentence.
                if (AtEndOfSentence(CurrentCharacter))
                {
                    // Only store the pending word when it has text; an empty word (terminator preceded by
                    // whitespace or by another terminator) must not end up in the sentence.
                    if (CurrentWord != null && CurrentWord.Text.Length > 0)
                    {
                        CurrentSentence.Words.Add(CurrentWord);
                    }
                    CurrentSentence.Words.Add(new Word(CurrentCharacter.ToString()));

                    // If the current sentence fits the criteria for sentences add it to the list of sentences.
                    if (((MinimumWordCount == null && CurrentSentence.WordCount > 0) || CurrentSentence.WordCount > MinimumWordCount) &&
                        (MaximumWordCount == null || CurrentSentence.WordCount < MaximumWordCount))
                    {
                        Sentences.Add(CurrentSentence);
                        if (Sentences.Count >= MaximumNumberOfSentences)
                        {
                            // Limit reached; no need to look for another starting point.
                            break;
                        }
                        CurrentPosition = GetNextTextSplitterIndex(CurrentPosition, ContentLength, Mode, MaximumNumberOfSentences);
                    }

                    // Whether it was kept or not, start over with a fresh sentence.
                    CurrentSentence = new Sentence();
                    CurrentWord     = new Word();
                }
                // Tab, space or newline: the current word is complete.
                else if (CurrentCharacter == ' ' || CurrentCharacter == '\n' || CurrentCharacter == '\r' || CurrentCharacter == '\t')
                {
                    if (CurrentWord != null && CurrentWord.Text.Length > 0)
                    {
                        CurrentSentence.Words.Add(CurrentWord);
                        CurrentWord = new Word();
                    }
                }
                // Special character: complete the current word and add the character as a separate word.
                else if (CurrentCharacter == ';' ||
                         CurrentCharacter == '(' ||
                         CurrentCharacter == ')' ||
                         CurrentCharacter == '$' ||
                         CurrentCharacter == '@' ||
                         CurrentCharacter == '#' ||
                         CurrentCharacter == '%' ||
                         CurrentCharacter == '+' ||
                         CurrentCharacter == '-' ||
                         CurrentCharacter == '/' ||
                         CurrentCharacter == '"' ||
                         CurrentCharacter == ',' ||
                         CurrentCharacter == '=' ||
                         CurrentCharacter == '|')
                {
                    if (CurrentWord != null && CurrentWord.Text.Length > 0)
                    {
                        CurrentSentence.Words.Add(CurrentWord);
                        CurrentWord = new Word();
                    }

                    // The character is stored even when no word was pending (e.g. an opening
                    // parenthesis after a space); previously it was dropped in that case.
                    CurrentSentence.Words.Add(new Word(CurrentCharacter.ToString()));
                }
                // Single quote: complete the current word and let the quote start the next one.
                else if (CurrentCharacter == '\'')
                {
                    if (CurrentWord != null && CurrentWord.Text.Length > 0)
                    {
                        CurrentSentence.Words.Add(CurrentWord);
                    }
                    CurrentWord = new Word(CurrentCharacter.ToString());
                }
                // Any other character belongs to the current word.
                else
                {
                    CurrentWord.Text += CurrentCharacter;
                }

                CurrentPosition++;

                // The end of the content has been reached; what happens next depends on the mode.
                if (CurrentPosition >= ContentLength)
                {
                    // Store the word that was still being built so that content which does not end with a
                    // terminator or whitespace keeps its last word.
                    if (CurrentWord != null && CurrentWord.Text.Length > 0)
                    {
                        CurrentSentence.Words.Add(CurrentWord);
                    }
                    CurrentWord = new Word();

                    // If the remaining text has the minimum amount of words, add it as a new sentence.
                    if ((MinimumWordCount == null && CurrentSentence.WordCount > 0) || CurrentSentence.WordCount > MinimumWordCount)
                    {
                        Sentences.Add(CurrentSentence);
                        if (Sentences.Count >= MaximumNumberOfSentences)
                        {
                            break;
                        }
                    }

                    // Equally Split and In Order are done once the end has been reached.
                    if (Mode == TextSplitterModes.EquallySplit || Mode == TextSplitterModes.InOrder)
                    {
                        break;
                    }
                    // Random keeps going from a new starting point with a clean sentence.
                    else if (Mode == TextSplitterModes.Random)
                    {
                        CurrentSentence = new Sentence();
                        CurrentPosition = GetNextTextSplitterIndex(CurrentPosition, ContentLength, Mode, MaximumNumberOfSentences);
                    }
                }
            }


            return(Sentences);
        }