private Token newTok(Token orig, int start, int end) { int startOff = orig.startOffset(); int endOff = orig.endOffset(); // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. if (orig.termLength() == endOff - startOff) { endOff = startOff + end; startOff += start; } return((Token)orig.clone(orig.termBuffer(), start, (end - start), startOff, endOff)); }
#pragma warning restore 672 // index "a","b","c" as pos0="a", pos1="b", pos2="c", pos2="abc" private void addCombos(List /*<Token>*/ lst, int start, int end, bool generateSubwords, bool catenateSubwords, int posOffset) { if (end - start == 1) { // always generate a word alone, even if generateSubwords=0 because // the catenation of all the subwords *is* the subword. queue.add(lst.get(start)); return; } StringBuilder sb = null; if (catenateSubwords) { sb = new StringBuilder(); } Token firstTok = null; Token tok = null; for (int i = start; i < end; i++) { tok = (Token)lst.get(i); if (catenateSubwords) { if (i == start) { firstTok = tok; } sb.append(tok.termBuffer(), 0, tok.termLength()); } if (generateSubwords) { queue.add(tok); } } if (catenateSubwords) { Token concatTok = new Token(sb.toString(), firstTok.startOffset(), tok.endOffset(), firstTok.type()); // if we indexed some other tokens, then overlap concatTok with the last. // Otherwise, use the value passed in as the position offset. concatTok.setPositionIncrement(generateSubwords == true ? 0 : posOffset); queue.add(concatTok); } }
// use the type of the first char as the type // of the token. private int tokType(Token t) { return(charType(t.termBuffer()[0])); }