/// <summary>
/// Splits a string into tokens at the configured delimiters.
/// </summary>
/// <remarks>
/// Scanning proceeds left to right. At the start of each step a single leading
/// separator is skipped, then everything up to the next delimiter (or the end of
/// the string) becomes one term. Terms for which the stop word handler's
/// <c>IsWord</c> returns true are left out of the result. Consecutive separators
/// and a trailing separator yield empty-string terms.
/// </remarks>
/// <param name="word">word to tokenize; may be null</param>
/// <returns>
/// the terms found, in order of appearance; an empty collection when
/// <paramref name="word"/> is null or empty
/// </returns>
public Collection<string> Tokenize(string word)
{
    Collection<string> returnVect = new Collection<string>();
    if (word == null)
    {
        return returnVect;
    }

    int nextGapPos;
    for (int curPos = 0; curPos < word.Length; curPos = nextGapPos)
    {
        // Step over the whitespace separator that ended the previous term.
        int tokenStart = curPos;
        if (Char.IsWhiteSpace(word[tokenStart]))
        {
            tokenStart++;
        }

        nextGapPos = FindNextDelimiter(word, tokenStart);
        if (nextGapPos == curPos)
        {
            // A delimiter that is not whitespace sits exactly at curPos. Without
            // stepping over it the loop would restart at the same position
            // forever, so treat it like the whitespace case above.
            tokenStart = curPos + 1;
            nextGapPos = FindNextDelimiter(word, tokenStart);
        }

        string term = word.Substring(tokenStart, nextGapPos - tokenStart);
        if (!stopWordHandler.IsWord(term))
        {
            returnVect.Add(term);
        }
    }

    return returnVect;
}

/// <summary>
/// Finds the position of the nearest delimiter at or after a given index.
/// </summary>
/// <param name="word">string to search</param>
/// <param name="startIndex">index to start searching from; may equal the string length</param>
/// <returns>index of the nearest delimiter, or the length of the string when none is found</returns>
private int FindNextDelimiter(string word, int startIndex)
{
    int nearest = word.Length;
    for (int i = 0; i < delimiters.Length; i++)
    {
        int testPos = word.IndexOf(delimiters[i], startIndex);
        if (testPos != -1 && testPos < nearest)
        {
            nearest = testPos;
        }
    }

    return nearest;
}
/// <summary>
/// Breaks a word into fixed-length character n-grams, optionally padding the
/// word first and optionally adding "skip" n-grams that leave one character out.
/// </summary>
/// <remarks>
/// The supplied word is recorded in <c>SuppliedWord</c> before tokenizing.
/// Any token for which the stop word handler's <c>IsWord</c> returns true is
/// omitted. Skip n-grams are only added when they are not already present in
/// the result; the regular n-grams are added without a duplicate check.
/// </remarks>
/// <param name="word">word to tokenise</param>
/// <param name="extended">
/// when true, the word is padded with (tokenLength - 1) start pad and end pad
/// entries before the n-grams are taken
/// </param>
/// <param name="tokenLength">length of each token</param>
/// <param name="characterCombinationIndexValue">
/// error level for skip tokens; any non-zero value enables the skip n-grams
/// </param>
/// <returns>
/// collection of tokens, or null when <paramref name="word"/> is null or empty
/// </returns>
public Collection<string> Tokenize(string word, bool extended, int tokenLength, int characterCombinationIndexValue)
{
    if (String.IsNullOrEmpty(word))
    {
        return null;
    }

    SuppliedWord = word;

    Collection<string> tokens = new Collection<string>();
    int sourceLength = word.Length;
    int padCount = tokenLength > 0 ? tokenLength - 1 : 0;

    // Build the (optionally padded) text that the n-grams are cut from.
    StringBuilder buffer = new StringBuilder(sourceLength + (2 * padCount));
    buffer.Append(word);
    if (extended)
    {
        buffer.Insert(0, defaultStartPadCharacter, padCount);
        buffer.Insert(buffer.Length, defaultEndPadCharacter, padCount);
    }

    string padded = buffer.ToString();

    // Number of start positions for the regular n-grams.
    int gramCount = extended
        ? sourceLength + padCount
        : sourceLength - tokenLength + 1;

    // Regular n-grams: tokenLength consecutive characters from each start position.
    for (int start = 0; start < gramCount; start++)
    {
        string gram = padded.Substring(start, tokenLength);
        if (!stopWordHandler.IsWord(gram))
        {
            tokens.Add(gram);
        }
    }

    if (characterCombinationIndexValue != 0)
    {
        // Skip n-grams: the first (tokenLength - 1) characters followed by the
        // character one place beyond the regular n-gram. One fewer start
        // position is available because each skip n-gram reaches one character further.
        int skipGramCount = gramCount - 1;
        for (int start = 0; start < skipGramCount; start++)
        {
            string skipGram = padded.Substring(start, padCount) + padded.Substring(start + tokenLength, 1);
            if (!stopWordHandler.IsWord(skipGram) && !tokens.Contains(skipGram))
            {
                tokens.Add(skipGram);
            }
        }
    }

    return tokens;
}