예제 #1
0
        /// <summary>
        /// Smoke test that runs SpellUtils.GetCnSegment, SpellUtils.GetSpellSegment,
        /// PanGu's Segment.DoSegment, Searcher.SegmentKeyWord and PanGuTokenizer
        /// against fixed sample strings. It makes no assertions: results are only
        /// stored in locals (useful when stepping through in a debugger), so the
        /// test passes as long as none of the calls throws.
        /// </summary>
        public void SpellTest()
        {
            LuceneEngine engine = new LuceneEngine();

            engine.Init();

            // The results below are intentionally unused; the calls themselves are what is exercised.
            var firsts     = SpellUtils.GetCnSegment("NBA常规赛-快船vs凯尔特人");
            var spells     = SpellUtils.GetSpellSegment("战重警和");
            var segment    = new Segment();
            var collection = segment.DoSegment("国际足球100509K联赛釜山-大田");
            var list       = new List <string>();

            foreach (WordInfo word in collection)
            {
                // DoSegment results may contain null entries; skip them.
                if (word == null)
                {
                    continue;
                }
                list.Add(word.Word);
            }

            string sss  = "san国yan义";
            string ssss = Synacast.LuceneNetSearcher.Searcher.Searcher.SegmentKeyWord(sss);
            string text = "重庆";

            using (TextReader tr = new StringReader(text))
            {
                PanGuTokenizer ct  = new PanGuTokenizer(tr);
                int            end = 0;
                string         ss  = string.Empty;

                while (end < text.Length)
                {
                    Lucene.Net.Analysis.Token t = ct.Next();

                    // Next() returns null once the token stream is exhausted; without this
                    // guard the loop would throw NullReferenceException whenever the last
                    // token's end offset is smaller than text.Length.
                    if (t == null)
                    {
                        break;
                    }

                    end = t.EndOffset();
                    ss  = ss + t.TermText() + "/ ";
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Creates the tokenizer, reads the whole of <paramref name="input"/> up front and
        /// pre-computes the segments into <c>_segmentList</c> / <c>_wordList</c>.
        /// The mode is chosen from the result of <c>AnalyzInput()</c> (stored in <c>_isFlag</c>)
        /// and from the suffix of the input text:
        /// PanGu segmentation, <c>_indexFlag</c>-suffixed split mode,
        /// <c>_indexCnName</c>-suffixed first-letter mode, or '|' flag combination mode.
        /// </summary>
        /// <param name="input">Reader supplying the text to tokenize; it is fully consumed here.</param>
        public PanGuTokenizer(TextReader input)
            : base(input)
        {
            _inputText = base.input.ReadToEnd();

            if (string.IsNullOrEmpty(_inputText))
            {
                // Fallback: try chunked reads in case ReadToEnd() produced nothing.
                char[]        readBuf  = new char[1024];
                int           relCount = base.input.Read(readBuf, 0, readBuf.Length);
                StringBuilder inputStr = new StringBuilder(readBuf.Length);

                while (relCount > 0)
                {
                    inputStr.Append(readBuf, 0, relCount);

                    relCount = input.Read(readBuf, 0, readBuf.Length);
                }

                if (inputStr.Length > 0)
                {
                    _inputText = inputStr.ToString();
                }
            }

            _isFlag = AnalyzInput();
            if (!_isFlag)
            {
                // PanGu segmentation.
                global::PanGu.Segment segment = new Segment();
                var wordInfos = segment.DoSegment(_inputText);
                foreach (var wi in wordInfos)
                {
                    // Segmentation results may contain null entries; dereferencing
                    // wi.Word on one of them would throw NullReferenceException.
                    if (wi == null)
                    {
                        continue;
                    }

                    var list = SpellUtils.GetSpellSegment(wi.Word);
                    if (list != null)
                    {
                        _segmentList.AddRange(list);
                    }
                }
                _wordList = new List <WordInfo>(wordInfos);
            }
            else if (_inputText.EndsWith(_indexFlag, StringComparison.Ordinal))
            {
                // Comma/space segmentation. The marker is a literal token, so it is matched
                // ordinally (consistent with the ordinal Replace that strips it below).
                string[] sources = _inputText.Replace(_indexFlag, "").Split(_splitFlag, StringSplitOptions.RemoveEmptyEntries);
                foreach (string source in sources)
                {
                    _segmentList.Add(source);
                    var spells = SpellUtils.GetSpellSegment(source);
                    if (spells != null)
                    {
                        _segmentList.AddRange(spells);
                    }
                }
            }
            else if (_inputText.EndsWith(_indexCnName, StringComparison.Ordinal))
            {
                // Pinyin first-letter segmentation.
                string source = _inputText.Replace(_indexCnName, "");
                _wordList = SpellUtils.GetCnSegment(source);
            }
            else
            {
                // Flag segmentation: strip the '|' separators, then for every character
                // emit the character itself followed by every chain
                //   source[i]|source[start]|source[start+1]|... (each prefix of the chain)
                // for each start position after i.
                // Example: "abc" yields a, a|b, a|b|c, a|c, b, b|c, c.
                string source = _inputText.Replace("|", "");
                for (int i = 0; i < source.Length; i++)
                {
                    string head = source[i].ToString();
                    _segmentList.Add(head);

                    for (int start = i + 1; start < source.Length; start++)
                    {
                        string chain = head;
                        for (int j = start; j < source.Length; j++)
                        {
                            chain = chain + "|" + source[j];
                            _segmentList.Add(chain);
                        }
                    }
                }
            }
        }