예제 #1
0
        /**
         * Splits the text into sentences and segments each sentence separately,
         * keeping the result grouped sentence by sentence.
         *
         * @param text the text to segment
         * @return a list of sentences, where each sentence is the list of terms produced for it
         */
        public List <List <Term> > seg2sentence(String text)
        {
            List <List <Term> > termsBySentence = new List <List <Term> >();

            foreach (String sentence in SentencesUtil.toSentenceList(text))
            {
                // Segment one sentence at a time so each inner list maps to exactly one sentence.
                char[]      sentenceChars = sentence.ToCharArray();
                List <Term> sentenceTerms = segSentence(sentenceChars);
                termsBySentence.Add(sentenceTerms);
            }

            return termsBySentence;
        }
예제 #2
0
        /**
         * Segments the text into a flat list of terms.<br>
         * The original author documents this method as thread-safe; the parallel path
         * below relies on that by calling segSentence concurrently from several threads.
         *
         * When config.threadNumber is greater than 1 and the text is longer than 10000
         * characters, the text is split into sentences which are segmented on
         * config.threadNumber worker threads. Otherwise the whole text is segmented by a
         * single segSentence call.
         *
         * @param text the text to segment
         * @return the list of terms
         * @throws InvalidOperationException if a worker thread fails while segmenting a
         *         sentence (the original exception is attached as the inner exception)
         */
        public List <Term> seg(String text)
        {
            char[] charArray = text.ToCharArray();
            if (HanLP.Config.Normalization)
            {
                CharTable.normalization(charArray);
            }

            // Multi-threading is pointless for small texts; it only makes them slower.
            if (config.threadNumber > 1 && charArray.Length > 10000)
            {
                List <String> sentenceList  = SentencesUtil.toSentenceList(charArray);
                String[]      sentenceArray = sentenceList.ToArray();
                List <Term>[] termListArray = new List <Term> [sentenceArray.Length];
                int           workerCount   = config.threadNumber;
                int           per           = sentenceArray.Length / workerCount;

                System.Threading.Thread[] workers  = new System.Threading.Thread[workerCount];
                Exception[]               failures = new Exception[workerCount];

                for (int i = 0; i < workerCount; ++i)
                {
                    // Per-iteration copies so each closure captures its own values.
                    int workerIndex = i;
                    int from        = i * per;
                    // The last worker also takes the remainder left over by the integer division.
                    int to          = (i == workerCount - 1) ? sentenceArray.Length : from + per;

                    workers[i] = new System.Threading.Thread(() =>
                    {
                        try
                        {
                            // Each worker writes only to its own [from, to) slots of termListArray.
                            for (int j = from; j < to; ++j)
                            {
                                termListArray[j] = segSentence(sentenceArray[j].ToCharArray());
                            }
                        }
                        catch (Exception e)
                        {
                            // An unhandled exception on a worker thread would terminate the
                            // process; record it and report it from the calling thread instead.
                            failures[workerIndex] = e;
                        }
                    });
                    workers[i].Start();
                }

                // Wait for every worker before reading termListArray or failures.
                foreach (System.Threading.Thread worker in workers)
                {
                    worker.Join();
                }

                foreach (Exception failure in failures)
                {
                    if (failure != null)
                    {
                        throw new InvalidOperationException("Segmentation failed on a worker thread.", failure);
                    }
                }

                List <Term> termList = new List <Term>();
                if (config.offset || config.indexMode)
                {
                    // Each sentence was segmented on its own, so term offsets are relative to
                    // the sentence; shift them by the length of all preceding sentences.
                    int sentenceOffset = 0;
                    for (int i = 0; i < sentenceArray.Length; ++i)
                    {
                        foreach (Term term in termListArray[i])
                        {
                            term.offset += sentenceOffset;
                            termList.Add(term);
                        }
                        sentenceOffset += sentenceArray[i].Length;
                    }
                }
                else
                {
                    foreach (List <Term> list in termListArray)
                    {
                        termList.AddRange(list);
                    }
                }

                return termList;
            }

            return segSentence(charArray);
        }