/// <summary>
/// Splits the text into sentences, then segments each sentence into its own list of terms.
/// </summary>
/// <param name="text">The text to split into sentences and segment.</param>
/// <returns>
/// One term list per sentence, in the order the sentences are yielded by
/// <c>SentenceUtil.ToSentenceList</c>.
/// </returns>
public List<List<Term>> Seg2Sentence(string text)
{
    var sentences = SentenceUtil.ToSentenceList(text);
    var perSentenceTerms = new List<List<Term>>();

    foreach (var current in sentences)
    {
        // Each sentence is segmented independently from its character array.
        var chars = current.ToCharArray();
        var terms = SegSentence(chars);
        perSentenceTerms.Add(terms);
    }

    return perSentenceTerms;
}
/// <summary>
/// Segments the text into terms; long text may be split into sentences and segmented on several threads.
/// </summary>
/// <remarks>
/// The multi-threaded path is taken only when <c>config.threadNum</c> is greater than 1 and the text is
/// longer than 10000 characters. Otherwise the whole text is passed to <c>SegSentence</c> on the
/// calling thread.
/// </remarks>
/// <param name="text">The text to segment.</param>
/// <returns>
/// The terms of the text in sentence order. An empty list is returned if the calling thread is
/// interrupted while waiting for the worker threads to finish.
/// </returns>
public List<Term> Seg(string text)
{
    var charArr = text.ToCharArray();
    if (Config.NormalizeChar)
    {
        CharTable.Normallize(charArr);
    }

    // Multi-threading is only used for long text.
    if (config.threadNum > 1 && charArr.Length > 10000)
    {
        var sentences = SentenceUtil.ToSentenceList(charArr);
        var sentenceArr = sentences.ToArray();
        var termListArr = new List<Term>[sentenceArr.Length];
        var per = sentenceArr.Length / config.threadNum; // minimum number of sentences each thread handles
        var threads = new Thread[config.threadNum];
        var lastThreadIdx = config.threadNum - 1;

        for (int i = 0; i < threads.Length; i++)
        {
            // Every worker gets its OWN argument object. Sharing a single instance and
            // overwriting from/to after Start() lets a worker observe another worker's range.
            var sac = new SegAuxClass(sentenceArr, termListArr);
            int start = i * per;
            sac.from = start;
            // The last thread also takes the remainder left over by the integer division.
            sac.to = (i == lastThreadIdx) ? sentenceArr.Length : start + per;
            threads[i] = new Thread(ThreadSeg);
            threads[i].Start(sac);
        }

        try
        {
            foreach (var thread in threads)
            {
                thread.Join();
            }
        }
        catch (ThreadInterruptedException)
        {
            // log warning "thread sync exception"
            return new List<Term>();
        }

        var termList = new List<Term>();
        if (config.offset || config.indexMode)
        {
            // Shift each term's offset by the total length of the preceding sentences
            // (presumably so offsets refer to positions in the full text rather than the sentence).
            int sentenceOffset = 0;
            for (int i = 0; i < sentenceArr.Length; i++)
            {
                foreach (var term in termListArr[i])
                {
                    term.offset += sentenceOffset;
                    termList.Add(term);
                }
                sentenceOffset += sentenceArr[i].Length;
            }
        }
        else
        {
            foreach (var list in termListArr)
            {
                termList.AddRange(list);
            }
        }

        return termList;
    }

    // Single-threaded segmentation.
    return SegSentence(charArr);
}