/// <summary>
/// Splits <paramref name="text"/> into word tokens using the analyzer's
/// per-character lookup tables.
/// </summary>
/// <remarks>
/// Each character is classified as follows:
/// <list type="bullet">
/// <item><description><c>_NGramChar[c] == Int16.MaxValue</c>: the character is emitted
/// on its own as a one-character token.</description></item>
/// <item><description><c>_CharSetTable[c] == Int16.MaxValue</c>: the character belongs
/// to the current word and is kept unchanged.</description></item>
/// <item><description><c>_CharSetTable[c] == 0</c>: the character is a separator and
/// terminates the current word, if any.</description></item>
/// <item><description>Any other table value: the character belongs to the current word
/// and marks that word for lower-casing.</description></item>
/// </list>
/// Every token gets <c>Rank = 1</c> and <c>Position</c> set to the index of its first
/// character in <paramref name="text"/>. The result list is also stored in
/// <c>_Tokenes</c>.
/// </remarks>
/// <param name="text">The text to tokenize.</param>
/// <returns>The tokens in the order they occur in <paramref name="text"/>.</returns>
public IEnumerable<Hubble.Core.Entity.WordInfo> Tokenize(string text)
{
    // One-time initialization, guarded by the init lock.
    lock (_InitLockObj)
    {
        if (!_Inited)
        {
            Init();
            _Inited = true;
        }
    }

    List<Hubble.Core.Entity.WordInfo> tokens = new List<Hubble.Core.Entity.WordInfo>();
    _Tokenes = tokens;

    int start = -1;            // index of the first char of the pending word, or -1 when none
    bool needToLower = false;  // true when the pending word contains a char that requires lower-casing

    for (int i = 0; i < text.Length; i++)
    {
        char c = text[i];

        if (_NGramChar[c] == Int16.MaxValue)
        {
            // Flush the word collected so far before emitting the single-character
            // token; previously the pending word was discarded here.
            if (start >= 0)
            {
                tokens.Add(CreateWordToken(text, start, i, needToLower));
                start = -1;
                needToLower = false;
            }

            Hubble.Core.Entity.WordInfo single = new Hubble.Core.Entity.WordInfo();
            single.Word = c.ToString();
            single.Rank = 1;
            single.Position = i;
            tokens.Add(single);
        }
        else if (_CharSetTable[c] == Int16.MaxValue)
        {
            if (start < 0)
            {
                start = i;
            }
        }
        else if (_CharSetTable[c] == 0)
        {
            // Separator: close the pending word, if there is one.
            if (start >= 0)
            {
                tokens.Add(CreateWordToken(text, start, i, needToLower));
                start = -1;
                needToLower = false;
            }
        }
        else
        {
            if (start < 0)
            {
                start = i;
            }

            needToLower = true;
        }
    }

    // A word that runs to the end of the text has no trailing separator.
    if (start >= 0)
    {
        tokens.Add(CreateWordToken(text, start, text.Length, needToLower));
    }

    return tokens;
}

/// <summary>
/// Builds a rank-1 token for the characters of <paramref name="text"/> in
/// [<paramref name="start"/>, <paramref name="end"/>), lower-casing the word
/// when <paramref name="toLower"/> is set.
/// </summary>
private static Hubble.Core.Entity.WordInfo CreateWordToken(string text, int start, int end, bool toLower)
{
    string word = text.Substring(start, end - start);

    if (toLower)
    {
        word = word.ToLower();
    }

    Hubble.Core.Entity.WordInfo token = new Hubble.Core.Entity.WordInfo();
    token.Word = word;
    token.Rank = 1;
    token.Position = start;
    return token;
}
/// <summary>
/// Lazily tokenizes <paramref name="text"/>, yielding up to three variants per
/// segmented word: the word as segmented, its lower-case form, and its stem.
/// </summary>
/// <remarks>
/// Segments come from <c>GetInitSegment</c> and are stored in <c>_Tokenes</c>.
/// Null segments, segments with a null word, and segments whose type is
/// <c>WordType.None</c> or <c>WordType.Space</c> are skipped. The variants are
/// ranked <c>OriginalRank</c>, <c>LowerRank</c> and <c>StemRank</c> respectively
/// and all share the segment's position. Because this is an iterator, nothing
/// (including initialization) runs until the result is enumerated.
/// </remarks>
/// <param name="text">The text to tokenize.</param>
/// <returns>A lazily evaluated sequence of tokens.</returns>
public IEnumerable<Hubble.Core.Entity.WordInfo> Tokenize(string text)
{
    // One-time initialization, guarded by the init lock.
    lock (_InitLockObj)
    {
        if (!_Inited)
        {
            Init();
            _Inited = true;
        }
    }

    _Tokenes = GetInitSegment(text);

    foreach (Framework.WordInfo segment in _Tokenes)
    {
        // Skip missing segments and those that carry no word content.
        if (segment == null
            || segment.Word == null
            || segment.WordType == Hubble.Core.Analysis.Framework.WordType.None
            || segment.WordType == Hubble.Core.Analysis.Framework.WordType.Space)
        {
            continue;
        }

        // 1. The word exactly as segmented.
        Hubble.Core.Entity.WordInfo original = new Hubble.Core.Entity.WordInfo();
        original.Word = segment.Word;
        original.Position = segment.Position;
        original.Rank = OriginalRank;
        yield return original;

        // 2. The lower-case form, only when it differs from the original.
        string lowered = original.Word.ToLower();
        if (lowered != original.Word)
        {
            // NOTE(review): this relies on the assignment producing an independent
            // copy (i.e. WordInfo being a value type) - confirm; if it is a class,
            // the token yielded above is mutated here.
            Hubble.Core.Entity.WordInfo lowerToken = original;
            lowerToken.Word = lowered;
            lowerToken.Rank = LowerRank;
            yield return lowerToken;
        }

        // 3. The stem of the lower-case form, only when non-empty and different from it.
        string stemmed = GetStem(lowered);
        if (!string.IsNullOrEmpty(stemmed) && lowered != stemmed)
        {
            Hubble.Core.Entity.WordInfo stemToken = original;
            stemToken.Word = stemmed;
            stemToken.Rank = StemRank;
            yield return stemToken;
        }
    }
}