/// <summary>
        /// Splits a string into terms at the positions of the configured delimiters.
        /// </summary>
        /// <remarks>
        /// The delimiter set and the stop-word handler are members declared outside this method.
        /// A term is dropped when the stop-word handler reports it as a word. Zero-length terms
        /// (produced by adjacent delimiters or a trailing whitespace character) are never emitted.
        /// </remarks>
        /// <param name="word">word to tokenize; may be null</param>
        /// <returns>
        /// the terms found, in order of appearance; an empty collection when
        /// <paramref name="word"/> is null or contains no terms
        /// </returns>
        public Collection<string> Tokenize(string word)
        {
            Collection<string> tokens = new Collection<string>();
            if (word == null)
            {
                return tokens;
            }

            int curPos = 0;
            while (curPos < word.Length)
            {
                // A single whitespace character at the scan position is stepped over
                // before looking for the next term.
                if (Char.IsWhiteSpace(word[curPos]))
                {
                    curPos++;
                    if (curPos >= word.Length)
                    {
                        // Only trailing whitespace was left; there is no further term.
                        break;
                    }
                }

                // Find the closest delimiter at or after the scan position.
                int nextGapPos = word.Length;
                for (int i = 0; i < delimiters.Length; i++)
                {
                    int testPos = word.IndexOf(delimiters[i], curPos);
                    if (testPos != -1 && testPos < nextGapPos)
                    {
                        nextGapPos = testPos;
                    }
                }

                if (nextGapPos == curPos)
                {
                    // A delimiter sits exactly at the scan position, so the term would be empty.
                    // Whitespace is consumed by the skip at the top of the next iteration; any
                    // other delimiter must be stepped over here, otherwise the scan position
                    // would never advance and the loop would not terminate.
                    if (!Char.IsWhiteSpace(word[curPos]))
                    {
                        curPos++;
                    }
                    continue;
                }

                string term = word.Substring(curPos, nextGapPos - curPos);
                if (!stopWordHandler.IsWord(term))
                {
                    tokens.Add(term);
                }

                // Resume scanning at the delimiter that ended this term.
                curPos = nextGapPos;
            }

            return tokens;
        }
예제 #2
0
        /// <summary>
        /// Builds the fixed-length character tokens (n-grams) of a word, optionally padding
        /// the word at both ends and optionally adding "skip" tokens.
        /// </summary>
        /// <remarks>
        /// The supplied word is stored in <c>SuppliedWord</c> before any tokens are produced.
        /// Tokens that the stop-word handler reports as words are left out. Skip tokens are
        /// only added when they are not already present in the result.
        /// </remarks>
        /// <param name="word">word to tokenise</param>
        /// <param name="extended">
        /// when true, the word is padded with <c>tokenLength - 1</c> copies of the start pad
        /// in front and of the end pad behind, so that every character appears in every
        /// position of some token
        /// </param>
        /// <param name="tokenLength">length of each token</param>
        /// <param name="characterCombinationIndexValue">
        /// zero produces only the plain tokens; any non-zero value additionally produces skip
        /// tokens (only zero versus non-zero is examined here)
        /// </param>
        /// <returns>
        /// collection of tokens, or null when <paramref name="word"/> is null or empty
        /// </returns>
        public Collection<string> Tokenize(string word, bool extended, int tokenLength, int characterCombinationIndexValue)
        {
            if (String.IsNullOrEmpty(word))
            {
                return null;
            }

            SuppliedWord = word;
            Collection<string> tokens = new Collection<string>();

            // Number of pad copies on each side; never negative.
            int padCount = tokenLength > 0 ? tokenLength - 1 : 0;

            // Assemble the (possibly padded) text that the token windows slide over.
            StringBuilder buffer = new StringBuilder(word.Length + (2 * padCount));
            if (extended)
            {
                buffer.Insert(0, defaultStartPadCharacter, padCount);
            }
            buffer.Append(word);
            if (extended)
            {
                buffer.Insert(buffer.Length, defaultEndPadCharacter, padCount);
            }
            string padded = buffer.ToString();

            // Number of window start positions for the plain tokens.
            int windowCount = extended
                ? word.Length + padCount
                : word.Length - tokenLength + 1;

            // Plain tokens: every window of tokenLength characters.
            for (int start = 0; start < windowCount; start++)
            {
                string gram = padded.Substring(start, tokenLength);
                if (stopWordHandler.IsWord(gram))
                {
                    continue;
                }
                tokens.Add(gram);
            }

            if (characterCombinationIndexValue == 0)
            {
                return tokens;
            }

            // Skip tokens: the first padCount characters of a window followed by the character
            // one position past the window's end. One fewer start position is available
            // because each skip token reaches one character further to the right.
            int skipWindowCount = windowCount - 1;
            for (int start = 0; start < skipWindowCount; start++)
            {
                string skipGram = padded.Substring(start, padCount) + padded.Substring(start + tokenLength, 1);
                if (!stopWordHandler.IsWord(skipGram) && !tokens.Contains(skipGram))
                {
                    tokens.Add(skipGram);
                }
            }

            return tokens;
        }