コード例 #1
0
ファイル: Segment.cs プロジェクト: gaoshoufenmu/HanLP.csharp
        /// <summary>
        /// Splits the text into sentences, then segments each sentence into its own term list.
        /// </summary>
        /// <param name="text">The text to split and segment.</param>
        /// <returns>One term list per sentence, in the order the sentences were produced.</returns>
        public List<List<Term>> Seg2Sentence(string text)
        {
            var perSentenceTerms = new List<List<Term>>();
            var sentences = SentenceUtil.ToSentenceList(text);

            foreach (var current in sentences)
            {
                var chars = current.ToCharArray();
                var terms = SegSentence(chars);
                perSentenceTerms.Add(terms);
            }

            return perSentenceTerms;
        }
コード例 #2
0
ファイル: Segment.cs プロジェクト: gaoshoufenmu/HanLP.csharp
        /// <summary>
        /// Segments the text into terms. Long inputs may be segmented by several threads,
        /// one range of sentences per thread; everything else is segmented on the calling thread.
        /// </summary>
        /// <param name="text">The text to segment.</param>
        /// <returns>
        /// The terms of the whole text. An empty list is returned if waiting for the worker
        /// threads is interrupted.
        /// </returns>
        public List<Term> Seg(string text)
        {
            var charArr = text.ToCharArray();

            if (Config.NormalizeChar)
            {
                CharTable.Normallize(charArr);
            }

            // Multi-threading is only used for long text.
            if (config.threadNum > 1 && charArr.Length > 10000)
            {
                var sentences   = SentenceUtil.ToSentenceList(charArr);
                var sentenceArr = sentences.ToArray();

                var termListArr = new List<Term>[sentenceArr.Length];
                // Minimum number of sentences each thread has to process.
                var per         = sentenceArr.Length / config.threadNum;

                var threads       = new Thread[config.threadNum];
                var lastThreadIdx = config.threadNum - 1;
                for (int i = 0; i <= lastThreadIdx; i++)
                {
                    int begin = i * per;

                    // Each worker gets its own state object. Sharing a single instance and
                    // rewriting its from/to fields for the next worker would race with
                    // workers that have already been started with the same reference.
                    var sac = new SegAuxClass(sentenceArr, termListArr);
                    sac.from = begin;
                    // The last worker also takes the remainder left over by the integer division.
                    sac.to   = (i == lastThreadIdx) ? sentenceArr.Length : begin + per;

                    threads[i] = new Thread(ThreadSeg);
                    threads[i].Start(sac);
                }

                try
                {
                    foreach (var thread in threads)
                    {
                        thread.Join();
                    }
                }
                catch (ThreadInterruptedException)
                {
                    // log warning "thread sync exception"
                    return new List<Term>();
                }

                var termList = new List<Term>();
                if (config.offset || config.indexMode)
                {
                    // Shift every term's offset by the total length of the sentences before it.
                    int sentenceOffset = 0;
                    for (int i = 0; i < sentenceArr.Length; i++)
                    {
                        foreach (var term in termListArr[i])
                        {
                            term.offset += sentenceOffset;
                            termList.Add(term);
                        }
                        sentenceOffset += sentenceArr[i].Length;
                    }
                }
                else
                {
                    foreach (var list in termListArr)
                    {
                        termList.AddRange(list);
                    }
                }
                return termList;
            }

            // Single-threaded segmentation.
            return SegSentence(charArr);
        }