Exemplo n.º 1
0
        /// <summary>
        /// Splits <paramref name="text"/> into tokens in a single left-to-right pass.
        /// </summary>
        /// <remarks>
        /// Every character is classified through two lookup tables:
        /// <list type="bullet">
        /// <item><c>_NGramChar[c] == Int16.MaxValue</c>: the character is emitted on its own
        /// as a one-character token.</item>
        /// <item><c>_CharSetTable[c] == Int16.MaxValue</c>: the character is part of a word and
        /// does not force lower-casing.</item>
        /// <item><c>_CharSetTable[c] == 0</c>: the character terminates the word in progress.</item>
        /// <item>any other value: the character is part of a word, and that word is
        /// lower-cased when it is emitted.</item>
        /// </list>
        /// Every token gets <c>Rank</c> 1 and a <c>Position</c> equal to the index of its first
        /// character in <paramref name="text"/>. The result list is also stored in
        /// <c>_Tokenes</c>.
        /// </remarks>
        /// <param name="text">Text to tokenize. <c>null</c> yields an empty result.</param>
        /// <returns>The tokens in ascending position order.</returns>
        public IEnumerable <Hubble.Core.Entity.WordInfo> Tokenize(string text)
        {
            // One-time initialization, guarded so that Init() runs at most once.
            lock (_InitLockObj)
            {
                if (!_Inited)
                {
                    Init();
                    _Inited = true;
                }
            }

            // Work on a local list so the loop never reads the shared field; the field is
            // still assigned so any code that inspects _Tokenes sees this call's result.
            List <Hubble.Core.Entity.WordInfo> tokens = new List <Hubble.Core.Entity.WordInfo>();
            _Tokenes = tokens;

            if (text == null)
            {
                return(tokens);
            }

            int  start       = -1;      // index of the first char of the word in progress, -1 if none
            bool needToLower = false;   // true once the word in progress contains a char that requires lower-casing

            for (int i = 0; i < text.Length; i++)
            {
                char c = text[i];

                if (_NGramChar[c] == Int16.MaxValue)
                {
                    // A single-character token also terminates the word in progress.
                    // Emit that word first; otherwise it would be lost when start is reset.
                    if (start >= 0)
                    {
                        tokens.Add(CreateToken(text.Substring(start, i - start), start, needToLower));
                    }

                    start       = -1;
                    needToLower = false;

                    tokens.Add(CreateToken(c.ToString(), i, false));
                }
                else if (_CharSetTable[c] == Int16.MaxValue)
                {
                    if (start < 0)
                    {
                        start = i;
                    }
                }
                else if (_CharSetTable[c] == 0)
                {
                    if (start >= 0)
                    {
                        tokens.Add(CreateToken(text.Substring(start, i - start), start, needToLower));
                        start       = -1;
                        needToLower = false;
                    }
                }
                else
                {
                    if (start < 0)
                    {
                        start = i;
                    }

                    needToLower = true;
                }
            }

            // Flush the word that runs up to the end of the text.
            if (start >= 0)
            {
                tokens.Add(CreateToken(text.Substring(start, text.Length - start), start, needToLower));
            }

            return(tokens);
        }

        /// <summary>
        /// Builds a rank-1 token for <paramref name="word"/> located at <paramref name="position"/>.
        /// </summary>
        /// <param name="word">Token text.</param>
        /// <param name="position">Index of the token's first character in the source text.</param>
        /// <param name="toLower">When true the token text is lower-cased with <c>ToLower()</c>.</param>
        private static Hubble.Core.Entity.WordInfo CreateToken(string word, int position, bool toLower)
        {
            Hubble.Core.Entity.WordInfo token = new Hubble.Core.Entity.WordInfo();

            // NOTE(review): ToLower() uses the current culture, as the original code did.
            token.Word     = toLower ? word.ToLower() : word;
            token.Rank     = 1;
            token.Position = position;
            return(token);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> lazily. The text is segmented with
        /// <c>GetInitSegment</c>; for each usable segment up to three tokens are yielded,
        /// all sharing the segment's position:
        /// the original word (rank <c>OriginalRank</c>), its lower-cased form when that differs
        /// (rank <c>LowerRank</c>), and its stem when <c>GetStem</c> returns a non-empty value
        /// different from the lower-cased form (rank <c>StemRank</c>).
        /// </summary>
        /// <remarks>
        /// This is an iterator method: initialization and segmentation only run once the
        /// caller starts enumerating the result. Each yielded token is a separately
        /// constructed value, so later tokens never overwrite one that was already handed out.
        /// </remarks>
        /// <param name="text">Text to tokenize; passed unchanged to <c>GetInitSegment</c>.</param>
        /// <returns>A lazily evaluated sequence of tokens.</returns>
        public IEnumerable <Hubble.Core.Entity.WordInfo> Tokenize(string text)
        {
            // One-time initialization, guarded so that Init() runs at most once.
            lock (_InitLockObj)
            {
                if (!_Inited)
                {
                    Init();
                    _Inited = true;
                }
            }

            _Tokenes = GetInitSegment(text);

            foreach (Framework.WordInfo wi in _Tokenes)
            {
                // Skip missing segments and segments without text.
                if (wi == null || wi.Word == null)
                {
                    continue;
                }

                // Skip segments typed as None or Space.
                if (wi.WordType == Hubble.Core.Analysis.Framework.WordType.None ||
                    wi.WordType == Hubble.Core.Analysis.Framework.WordType.Space)
                {
                    continue;
                }

                Hubble.Core.Entity.WordInfo wordinfo = new Hubble.Core.Entity.WordInfo();
                wordinfo.Word     = wi.Word;
                wordinfo.Position = wi.Position;
                wordinfo.Rank     = OriginalRank;

                // Capture the text before yielding so the comparisons below do not depend
                // on what the consumer does with the yielded token.
                string original = wordinfo.Word;
                yield return(wordinfo);

                // NOTE(review): ToLower() uses the current culture, as the original code did.
                string lower = original.ToLower();

                if (lower != original)
                {
                    // Build a fresh token instead of re-using wordinfo, so the result is
                    // correct whether WordInfo is a value type or a reference type.
                    Hubble.Core.Entity.WordInfo lowerWord = new Hubble.Core.Entity.WordInfo();
                    lowerWord.Word     = lower;
                    lowerWord.Position = wi.Position;
                    lowerWord.Rank     = LowerRank;
                    yield return(lowerWord);
                }

                string stem = GetStem(lower);

                if (!string.IsNullOrEmpty(stem) && lower != stem)
                {
                    Hubble.Core.Entity.WordInfo stemWord = new Hubble.Core.Entity.WordInfo();
                    stemWord.Word     = stem;
                    stemWord.Position = wi.Position;
                    stemWord.Rank     = StemRank;
                    yield return(stemWord);
                }
            }
        }