protected override void Decompose()
{
    // get the hyphenation points
    Hyphenation.Hyphenation hyphens = hyphenator.Hyphenate(m_termAtt.Buffer, 0, m_termAtt.Length, 1, 1);
    // No hyphen points found -> exit
    if (hyphens is null)
    {
        return;
    }

    int[] hyp = hyphens.HyphenationPoints;

    for (int i = 0; i < hyp.Length; ++i)
    {
        int remaining = hyp.Length - i;
        int start = hyp[i];
        CompoundToken longestMatchToken = null;
        for (int j = 1; j < remaining; j++)
        {
            int partLength = hyp[i + j] - start;

            // if the part is longer than maxSubwordSize we
            // are done with this round
            if (partLength > this.m_maxSubwordSize)
            {
                break;
            }

            // we only put subwords to the token stream
            // that are longer than minPartSize
            if (partLength < this.m_minSubwordSize)
            {
                // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
                // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
                continue;
            }

            // check the dictionary
            if (m_dictionary is null || m_dictionary.Contains(m_termAtt.Buffer, start, partLength))
            {
                if (this.m_onlyLongestMatch)
                {
                    if (longestMatchToken != null)
                    {
                        if (longestMatchToken.Text.Length < partLength)
                        {
                            longestMatchToken = new CompoundToken(this, start, partLength);
                        }
                    }
                    else
                    {
                        longestMatchToken = new CompoundToken(this, start, partLength);
                    }
                }
                else
                {
                    m_tokens.Enqueue(new CompoundToken(this, start, partLength));
                }
            }
            else if (m_dictionary.Contains(m_termAtt.Buffer, start, partLength - 1))
            {
                // check the dictionary again with a word that is one character
                // shorter to avoid problems with genitive 's characters and
                // other binding characters
                if (this.m_onlyLongestMatch)
                {
                    if (longestMatchToken != null)
                    {
                        if (longestMatchToken.Text.Length < partLength - 1)
                        {
                            longestMatchToken = new CompoundToken(this, start, partLength - 1);
                        }
                    }
                    else
                    {
                        longestMatchToken = new CompoundToken(this, start, partLength - 1);
                    }
                }
                else
                {
                    m_tokens.Enqueue(new CompoundToken(this, start, partLength - 1));
                }
            }
        }
        if (this.m_onlyLongestMatch && longestMatchToken != null)
        {
            m_tokens.Enqueue(longestMatchToken);
        }
    }
}
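// --------------------------------------------------------------------------
// Minimal usage sketch (not part of this class): one way Decompose() above
// gets exercised end to end, assuming Lucene.NET 4.8. The pattern file
// "de_DR.xml" and the dictionary entries are placeholders; substitute your
// own hyphenation grammar and subword dictionary.
// --------------------------------------------------------------------------
using System;
using System.IO;
using Lucene.Net.Analysis.Compound;
using Lucene.Net.Analysis.Compound.Hyphenation;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

public static class HyphenationDecompounderDemo
{
    public static void Main()
    {
        // Load an OFFO/TeX-style hyphenation pattern file (placeholder path).
        HyphenationTree hyphenator =
            HyphenationCompoundWordTokenFilter.GetHyphenationTree("de_DR.xml");

        // Subword dictionary: Decompose() only emits parts found in here
        // (or, with a null dictionary, every hyphenation-bounded part).
        var dictionary = new CharArraySet(LuceneVersion.LUCENE_48,
            new[] { "fuß", "ball", "pumpe" }, true);

        var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48,
            new StringReader("fußballpumpe"));
        var filter = new HyphenationCompoundWordTokenFilter(
            LuceneVersion.LUCENE_48, tokenizer, hyphenator, dictionary);

        ICharTermAttribute termAtt = filter.AddAttribute<ICharTermAttribute>();
        filter.Reset();
        while (filter.IncrementToken())
        {
            // Prints the original token followed by whatever subword tokens
            // Decompose() queued, e.g. "fußballpumpe", "fuß", "ball", "pumpe"
            // (the exact parts depend on the hyphenation patterns used).
            Console.WriteLine(termAtt.ToString());
        }
        filter.End();
        filter.Dispose();
    }
}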