/// <summary>
/// Queues the characters <c>segmentBuff[begin..end]</c> (both inclusive) as a token.
/// </summary>
/// <param name="begin">Index in <c>segmentBuff</c> of the first character of the term.</param>
/// <param name="end">Index in <c>segmentBuff</c> of the last character of the term.</param>
/// <param name="directOutput">
/// When <c>false</c> the range is trimmed first: a leading noise word is dropped if the
/// range is not a dictionary match, and a trailing number sign or connector is dropped.
/// When <c>true</c> the range is queued exactly as given.
/// </param>
private void pushTerm(int begin, int end, bool directOutput) {
    if (begin > end) {
        return;
    }

    if (!directOutput) {
        Hit h = dict.search(segmentBuff, begin, end);
        if (!h.isMatch() && begin != end && dict.isNoiseWord(Convert.ToString(segmentBuff[begin]))) {
            begin++;
        }
        if (dict.isNbSign(segmentBuff[end]) || dict.isConnector(segmentBuff[end])) {
            end--;
        }
        // Trimming can consume the whole range (e.g. a single connector character),
        // leaving end == begin - 1. Re-apply the entry guard so that no zero-length
        // token is queued.
        if (begin > end) {
            return;
        }
    }

    TokenDelegate td = new TokenDelegate(offset, begin, end);
    if (mircoSupported) {
        // The term text is lower-cased and filtered here; otherwise the raw range is
        // queued and left for later processing.
        String w = (new String(segmentBuff, begin, (end - begin) + 1)).ToLower();
        if (!dict.isUselessWord(w)) {
            td.Term = w;
            tokens.Add(td);
        }
    } else {
        tokens.Add(td);
    }
}
/// <summary>
/// Orders token delegates by ascending <c>begin</c>; for equal <c>begin</c> the one with
/// the larger <c>end</c> sorts first.
/// </summary>
/// <param name="o">The <c>TokenDelegate</c> to compare with; may be <c>null</c>.</param>
/// <returns>
/// A negative value if this instance sorts before <paramref name="o"/>, zero if both have
/// the same <c>begin</c> and <c>end</c>, and a positive value otherwise. A <c>null</c>
/// argument always yields a positive value.
/// </returns>
/// <exception cref="System.InvalidCastException">
/// <paramref name="o"/> is non-null and not a <c>TokenDelegate</c>.
/// </exception>
public virtual int CompareTo(System.Object o) {
    // IComparable contract: every instance compares greater than null.
    if (o == null) {
        return(1);
    }

    TokenDelegate ntd = (TokenDelegate)o;
    if (begin != ntd.begin) {
        return(begin < ntd.begin ? -1 : 1);
    }
    if (end == ntd.end) {
        return(0);
    }
    // Same start: the span that reaches further comes first.
    return(end > ntd.end ? -1 : 1);
}
/// <summary>
/// Two token delegates are equal when they cover the same <c>Begin</c>/<c>End</c> range.
/// </summary>
/// <param name="o">Object to compare with.</param>
/// <returns>
/// <c>true</c> if <paramref name="o"/> is a <c>TokenDelegate</c> with the same begin and
/// end as this instance; otherwise <c>false</c>.
/// </returns>
public override bool Equals(System.Object o) {
    TokenDelegate other = o as TokenDelegate;
    if (object.ReferenceEquals(other, null)) {
        return(false);
    }
    return(begin == other.Begin && end == other.End);
}
/// <summary>
/// Records <c>segmentBuff[begin..end]</c> as a number candidate in <c>numberSet</c>,
/// unless its last character is a number sign or connector. In that case the range
/// without the last character is passed to the two-argument <c>pushTerm</c> overload
/// instead.
/// </summary>
/// <param name="begin">Index in <c>segmentBuff</c> of the first character.</param>
/// <param name="end">Index in <c>segmentBuff</c> of the last character.</param>
private void pushNumber(int begin, int end) {
    bool endsWithSign = dict.isNbSign(segmentBuff[end]) || dict.isConnector(segmentBuff[end]);
    if (!endsWithSign) {
        numberSet.Add(new TokenDelegate(offset, begin, end));
        return;
    }
    pushTerm(begin, end - 1);
}
/// <summary>
/// Queues an already-built token delegate. When <c>mircoSupported</c> is set, the term
/// text is taken from <c>segmentBuff</c> and the delegate is queued only if the
/// dictionary does not flag that text as useless; otherwise the delegate is queued as is.
/// </summary>
/// <param name="td">Delegate whose <c>Begin</c>/<c>End</c> index into <c>segmentBuff</c>.</param>
private void pushTerm(TokenDelegate td) {
    if (!mircoSupported) {
        tokens.Add(td);
        return;
    }

    int length = (td.End - td.Begin) + 1;
    String text = new String(segmentBuff, td.Begin, length);
    if (dict.isUselessWord(text)) {
        return;
    }
    td.Term = text;
    tokens.Add(td);
}
/// <summary>
/// Returns the next token. Queued tokens are handed out first; when the queue is empty
/// one more segment is read from <c>input</c> and tokenized via <c>getTokens</c>.
/// </summary>
/// <returns>
/// The first queued token, or <c>null</c> when no characters are available or the
/// freshly tokenized segment produced no tokens.
/// </returns>
public override Token Next() {
    if (tokens.Count == 0) {
        int segLength;
        if (breakAndRead) {
            // The previous segment was cut at bAr_offset and its unprocessed tail was
            // moved to the front of segmentBuff; refill the space behind that tail.
            int carried = 2048 - bAr_offset;
            segLength = input.Read(segmentBuff, carried, bAr_offset);
            segLength += carried;
        } else {
            segLength = input.Read(segmentBuff, 0, segmentBuff.Length);
        }

        if (segLength <= 0) {
            return(null);
        }

        getTokens(segLength);
        offset += bAr_offset;

        // NOTE(review): a segment that yields no tokens also ends up here, so null is
        // returned even though input may still hold data -- confirm this is intended.
        if (tokens.Count == 0) {
            return(null);
        }
    }

    TokenDelegate head = tokens.GetFirst();
    tokens.Remove(head);
    return(head.Token);
}
/// <summary>
/// Tokenizes the first <paramref name="segLength"/> characters of <c>segmentBuff</c>
/// into <c>tokens</c>.
/// </summary>
/// <remarks>
/// Each character is normalized with <c>toDBC</c>, classified by the dictionary and fed
/// to the state handlers selected by the bits of <c>contextStatus</c>. For a full
/// 2048-character segment the scan may stop early near the end of the buffer
/// (<c>breakAndRead</c>); the unprocessed tail is then moved to the front of
/// <c>segmentBuff</c>.
/// </remarks>
/// <param name="segLength">Number of valid characters in <c>segmentBuff</c>.</param>
private void getTokens(int segLength) {
    initContext();

    for (int ci = 0; ci < segLength; ci++) {
        nextContextStatus = 0;
        nextNumberStatus = 0;

        // Full-width to half-width conversion, done in place before classification.
        segmentBuff[ci] = toDBC(segmentBuff[ci]);
        inputStatus = dict.identify(segmentBuff[ci]);

        if (contextStatus == 0) {
            procInitState(ci);
        } else {
            // Several state bits can be set at once; each handler may move ci.
            if ((contextStatus & 1) != 0) {
                ci = procLetterState(ci);
            }
            if ((contextStatus & 0x10) != 0) {
                ci = procNumberState(ci);
            }
            if ((contextStatus & 0x100) != 0) {
                ci = procCJKState(ci);
            }
        }

        contextStatus = nextContextStatus;
        numberStatus = nextNumberStatus;

        // Cut the segment here only when no state carries over, the buffer was filled
        // completely, and between 2 and 49 characters (counted from ci) remain.
        int remaining = segLength - ci;
        bool cutHere = nextContextStatus == 0
                       && remaining > 1
                       && remaining < 50
                       && segLength == 2048;
        if (cutHere) {
            bAr_offset = ci + 1;
            breakAndRead = true;
            break;
        }
    }

    if (!breakAndRead) {
        // Whole segment consumed: flush whatever is still pending.
        int lastIndex = segLength - 1;
        if (numberBeginIndex >= 0) {
            pushNumber(numberBeginIndex, lastIndex);
        }
        if (lastMatchEnd != lastIndex) {
            if (unmatchFlag) {
                procSplitSeg(unmatchBegin, lastIndex);
            } else if (beginIndex < segLength) {
                procSplitSeg(beginIndex, lastIndex);
            }
        }
    }

    // Move every collected number candidate into the token queue.
    IEnumerator numbers = numberSet.GetEnumerator();
    while (numbers.MoveNext()) {
        pushTerm((TokenDelegate)numbers.Current);
    }
    numberSet.Clear();

    if (!mircoSupported) {
        // Rebuild the queue as MTokenDelegate instances with lower-cased terms,
        // dropping the ones the dictionary flags as useless.
        ExtendOrderedSet<TokenDelegate> rebuilt = new ExtendOrderedSet<TokenDelegate>();
        System.Collections.IEnumerator queued = tokens.GetEnumerator();
        while (queued.MoveNext()) {
            TokenDelegate source = (TokenDelegate)queued.Current;
            MTokenDelegate replacement = new MTokenDelegate(source.Offset, source.Begin, source.End);
            int length = (source.End - source.Begin) + 1;
            String term = (new String(segmentBuff, source.Begin, length)).ToLower();
            if (!dict.isUselessWord(term)) {
                replacement.Term = term;
                rebuilt.Add(replacement);
            }
        }
        tokens.Clear();
        tokens = rebuilt;
    }

    if (breakAndRead) {
        // Shift the unprocessed tail to the start of the buffer.
        Array.Copy(segmentBuff, bAr_offset, segmentBuff, 0, segLength - bAr_offset);
    }
}
/// <summary>
/// Looks up <c>segmentBuff[begin..end]</c> in the dictionary and reacts to the kind of
/// hit: pushes terms, maintains the pending unmatched run (<c>unmatchFlag</c> /
/// <c>unmatchBegin</c>), <c>LastMatchEnd</c>, <c>beginIndex</c> and <c>lastHitState</c>.
/// </summary>
/// <param name="begin">Index in <c>segmentBuff</c> of the first character looked up.</param>
/// <param name="end">Index in <c>segmentBuff</c> of the last character looked up.</param>
/// <returns>
/// <c>0</c>, or <c>beginIndex - end</c> after <c>beginIndex</c> has been advanced by one.
/// NOTE(review): presumably an index adjustment applied by the caller -- confirm against
/// the call site.
/// </returns>
private int procCJK(int begin, int end) {
    Hit hit = dict.search(segmentBuff, begin, end);
    // Case 1 (lastHitState = 1): hit.MatchAndContinue is set.
    if (hit.MatchAndContinue) {
        if (unmatchFlag) {
            // An unmatched run is pending: emit it before the matched word.
            if (hit.WordType.Suffix) {
                pushTerm(unmatchBegin, end);
            } else if (unmatchBegin == begin - 1 && !dict.isNoiseWord(Convert.ToString(segmentBuff[unmatchBegin]))) {
                // The run is the single character just before begin and is not a noise
                // word: emit it alone and joined with the character at begin.
                pushTerm(unmatchBegin, begin - 1, true);
                pushTerm(unmatchBegin, begin, true);
            } else {
                cjkCut(unmatchBegin, begin - 1);
                pushTerm(unmatchBegin, begin - 1);
            }
            LastMatchEnd = end;
            pushTerm(begin, end);
            unmatchFlag = false;
        } else if (hit.WordType.NormWord) {
            LastMatchEnd = end;
            pushTerm(begin, end);
        }
        if (!(numberSet.Count == 0)) {
            // Queue every pending number candidate.
            for (IEnumerator it = numberSet.GetEnumerator(); it.MoveNext(); pushTerm((TokenDelegate)it.Current)) { ; }
            if (hit.WordType.Count) {
                // WordType.Count is set: also emit the span from the start of the last
                // number through end. numberSet is not cleared on this path.
                TokenDelegate number = numberSet.GetLast();
                pushTerm(number.Begin, end, true);
                LastMatchEnd = end;
            } else {
                numberSet.Clear();
            }
        }
        lastHitState = 1;
        return(0);
    }
    // Case 2 (lastHitState = 2): hit.isMatch() without MatchAndContinue.
    if (hit.isMatch()) {
        if (unmatchFlag) {
            // Same flushing of the pending unmatched run as in case 1.
            if (hit.WordType.Suffix) {
                pushTerm(unmatchBegin, end);
            } else if (unmatchBegin == begin - 1 && !dict.isNoiseWord(Convert.ToString(segmentBuff[unmatchBegin]))) {
                pushTerm(unmatchBegin, begin - 1, true);
                pushTerm(unmatchBegin, begin, true);
            } else {
                cjkCut(unmatchBegin, begin - 1);
                pushTerm(unmatchBegin, begin - 1);
            }
            LastMatchEnd = end;
            pushTerm(begin, end);
            unmatchFlag = false;
        } else if (hit.WordType.NormWord) {
            LastMatchEnd = end;
            pushTerm(begin, end);
        } else {
            // Matched, but NormWord is not set: start an unmatched run at begin.
            unmatchFlag = true;
            unmatchBegin = begin;
        }
        if (!(numberSet.Count == 0)) {
            for (IEnumerator it = numberSet.GetEnumerator(); it.MoveNext(); pushTerm((TokenDelegate)it.Current)) { ; }
            if (hit.WordType.Count) {
                TokenDelegate number = numberSet.GetLast();
                pushTerm(number.Begin, end, true);
                LastMatchEnd = end;
                unmatchFlag = false;
            }
            // Unlike case 1, the number candidates are always cleared here.
            numberSet.Clear();
        }
        beginIndex++;
        lastHitState = 2;
        return(beginIndex - end);
    }
    // Case 3 (lastHitState = 3): prefix match; only the state is recorded.
    if (hit.isPrefixMatch()) {
        lastHitState = 3;
        return(0);
    }
    // Case 4 (lastHitState = 4): no match.
    if (hit.isUnmatch()) {
        if (lastHitState == 3 && unmatchFlag) {
            // The previous lookup was a prefix match and an unmatched run is pending:
            // emit that run. unmatchFlag is left unchanged by this block.
            if (unmatchBegin == begin - 1 &&
                !dict.isNoiseWord(Convert.ToString(segmentBuff[unmatchBegin]))) {
                pushTerm(unmatchBegin, begin - 1, true);
                pushTerm(unmatchBegin, begin, true);
            } else {
                cjkCut(unmatchBegin, begin - 1);
                pushTerm(unmatchBegin, begin - 1);
            }
        }
        if (begin > lastMatchEnd) {
            // begin lies past the last matched position: make sure a run is open.
            if (!unmatchFlag) {
                unmatchFlag = true;
                unmatchBegin = begin;
            }
            // Look up the single character at end.
            Hit h = dict.search(segmentBuff, end, end);
            if (dict.isNoiseWord(Convert.ToString(segmentBuff[end])) && end - begin == 1) {
                // Two-character window whose second character is a noise word:
                // close the run at begin.
                cjkCut(unmatchBegin, begin);
                pushTerm(unmatchBegin, begin);
                unmatchFlag = false;
                LastMatchEnd = begin;
            } else if (h.isMatch() && h.WordType.Suffix) {
                if (unmatchBegin < end) {
                    // The character at end matches with Suffix set: emit the run with
                    // and without it, then continue from end (early return,
                    // lastHitState is not updated on this path).
                    pushTerm(unmatchBegin, end - 1);
                    pushTerm(unmatchBegin, end);
                    cjkCut(unmatchBegin, end);
                    unmatchFlag = false;
                    LastMatchEnd = end;
                    beginIndex = end;
                    return(0);
                }
            } else {
                // Otherwise test the single character at begin for a Suffix match.
                h = dict.search(segmentBuff, begin, begin);
                if (h.isMatch() && h.WordType.Suffix && unmatchBegin != begin) {
                    pushTerm(unmatchBegin, begin);
                    unmatchFlag = false;
                    LastMatchEnd = begin;
                }
            }
        }
        beginIndex++;
        lastHitState = 4;
        return(beginIndex - end);
    } else {
        return(0);
    }
}