/// <summary>
/// Creates a tokenizer that reads from <paramref name="in_Renamed"/>.
/// Loads the dictionary via <c>Dictionary.load()</c>, allocates a 2048-character
/// segment buffer and two empty token sets, then resets the scanning state
/// through <c>initContext()</c>.
/// </summary>
/// <param name="in_Renamed">Source reader; handed unchanged to the base constructor.</param>
/// <param name="mircoSupported">
/// Flag stored on the instance for later use by the tokenizer
/// (NOTE(review): the name looks like a misspelling of "microSupported" — confirm before renaming).
/// </param>
internal IKTokenizer(TextReader in_Renamed, bool mircoSupported)
    : base(in_Renamed)
{
    // Caller-supplied configuration.
    this.mircoSupported = mircoSupported;

    // Shared resources and working storage.
    dict = Dictionary.load();
    segmentBuff = new char[2048];
    tokens = new ExtendOrderedSet<TokenDelegate>();
    numberSet = new ExtendOrderedSet<TokenDelegate>();

    // Initial scan state; initContext() runs last so it sees the fields set above.
    lastHitState = 0;
    initContext();
}
/// <summary>
/// Scans the first <paramref name="segLength"/> characters of <c>segmentBuff</c>,
/// dispatching each character to the letter / number / CJK state handlers, and
/// leaves the resulting tokens in <c>tokens</c>.
/// </summary>
/// <remarks>
/// When the segment is exactly 2048 characters long and the scan reaches a point
/// with no pending context inside the last 50 characters, scanning stops early:
/// <c>breakAndRead</c> is set, <c>bAr_offset</c> records where the unprocessed tail
/// starts, and that tail is copied to the front of <c>segmentBuff</c> before returning
/// (presumably so the caller can append more input after it — confirm against the caller).
/// When <c>mircoSupported</c> is false, tokens are additionally lower-cased and those
/// rejected by <c>dict.isUselessWord</c> are dropped.
/// </remarks>
/// <param name="segLength">Number of characters of <c>segmentBuff</c> to process.</param>
private void getTokens(int segLength)
{
    // Literal thresholds of the early-exit test below.
    const int FullSegmentLength = 2048;
    const int TailWindow = 50;

    initContext();

    for (int ci = 0; ci < segLength; ci++)
    {
        nextContextStatus = 0;
        nextNumberStatus = 0;

        // Convert full-width characters to half-width, in place.
        char current = toDBC(segmentBuff[ci]);
        segmentBuff[ci] = current;
        inputStatus = dict.identify(current);

        if (contextStatus == 0)
        {
            procInitState(ci);
        }
        else
        {
            // Several status bits may be set at once, so every matching handler runs.
            // Each handler returns the (possibly moved) scan position.
            if ((contextStatus & 0x1) != 0)
            {
                ci = procLetterState(ci);
            }
            if ((contextStatus & 0x10) != 0)
            {
                ci = procNumberState(ci);
            }
            if ((contextStatus & 0x100) != 0)
            {
                ci = procCJKState(ci);
            }
        }

        contextStatus = nextContextStatus;
        numberStatus = nextNumberStatus;

        // Early exit: full-length segment, no pending context, and more than one
        // but fewer than TailWindow characters left after the handlers ran.
        int remaining = segLength - ci;
        if (nextContextStatus == 0
            && remaining > 1
            && remaining < TailWindow
            && segLength == FullSegmentLength)
        {
            bAr_offset = ci + 1;
            breakAndRead = true;
            break;
        }
    }

    if (!breakAndRead)
    {
        // The whole segment was consumed: flush whatever is still open at its end.
        int lastIndex = segLength - 1;

        if (numberBeginIndex >= 0)
        {
            pushNumber(numberBeginIndex, lastIndex);
        }

        if (lastMatchEnd != lastIndex)
        {
            if (unmatchFlag)
            {
                procSplitSeg(unmatchBegin, lastIndex);
            }
            else if (beginIndex < segLength)
            {
                procSplitSeg(beginIndex, lastIndex);
            }
        }
    }

    // Move the collected number tokens into the main token set.
    foreach (TokenDelegate numberToken in numberSet)
    {
        pushTerm(numberToken);
    }
    numberSet.Clear();

    if (!mircoSupported)
    {
        ExtendOrderedSet<TokenDelegate> filtered = new ExtendOrderedSet<TokenDelegate>();

        foreach (TokenDelegate td in tokens)
        {
            // Invariant lower-casing keeps terms and the useless-word lookup independent
            // of the machine culture (e.g. "I" must not become a dotless "ı" under tr-TR).
            string word = new string(segmentBuff, td.Begin, td.End - td.Begin + 1).ToLowerInvariant();
            if (dict.isUselessWord(word))
            {
                continue;
            }

            filtered.Add(new MTokenDelegate(td.Offset, td.Begin, td.End) { Term = word });
        }

        // Empty the old set before swapping in the filtered one, as the original did.
        tokens.Clear();
        tokens = filtered;
    }

    if (breakAndRead)
    {
        // Shift the unprocessed tail to the start of the buffer.
        Array.Copy(segmentBuff, bAr_offset, segmentBuff, 0, segLength - bAr_offset);
    }
}