/// <summary>
/// Builds the <see cref="TokenPositionAnalysis"/> for one token, pairing the predicted
/// whitespace/newline and alignment/indent categories with the categories computed
/// from the original document for the same token index.
/// </summary>
/// <param name="features">Feature vector handed to the whitespace classifier's prediction analysis.</param>
/// <param name="featuresForAlign">Feature vector handed to the horizontal-position classifier's prediction analysis.</param>
/// <param name="tokenIndexInStream">Index of the token in the token stream.</param>
/// <param name="injectNL_WS">Predicted whitespace/newline category.</param>
/// <param name="alignOrIndent">Predicted alignment/indent category.</param>
/// <param name="collectAnalysis">
/// When true, the textual classifier analyses are generated (this can be slow);
/// when false, both analysis strings are left empty.
/// </param>
/// <returns>
/// The analysis object, with <c>actualWS</c> and <c>actualAlign</c> set to the
/// categories computed from the original document.
/// </returns>
public virtual TokenPositionAnalysis getTokenAnalysis(int[] features, int[] featuresForAlign, int tokenIndexInStream, int injectNL_WS, int alignOrIndent, bool collectAnalysis)
{
    CommonToken curToken = (CommonToken)originalDoc.tokens.Get(tokenIndexInStream);
    TerminalNode nodeWithOriginalToken = originalTokenToNodeMap[curToken];

    // Placeholder shown whenever a category has no display string (right-aligned to width 8).
    string noneLabel = string.Format("{0,8}", "none");

    // Categories computed from the original document. Each is computed once and reused
    // both for the display strings and for the fields of the returned analysis.
    int actualWS = Trainer.getInjectWSCategory(originalTokens, tokenIndexInStream);
    int actualAlignCategory = Trainer.getAlignmentCategory(originalDoc, nodeWithOriginalToken, indentSize);

    string actualWSNL = getWSCategoryStr(actualWS) ?? noneLabel;
    string wsDisplay = getWSCategoryStr(injectNL_WS) ?? noneLabel;
    string alignDisplay = getHPosCategoryStr(alignOrIndent) ?? noneLabel;
    string actualAlignDisplay = getHPosCategoryStr(actualAlignCategory) ?? noneLabel;

    string newlinePredictionString = string.Format("### line {0:D}: predicted {1} actual {2}", curToken.Line, wsDisplay, actualWSNL);
    string alignPredictionString = string.Format("### line {0:D}: predicted {1} actual {2}", curToken.Line, alignDisplay, actualAlignDisplay);

    string newlineAnalysis = "";
    string alignAnalysis = "";
    if (collectAnalysis)
    {
        // this can be slow
        newlineAnalysis = newlinePredictionString + "\n" +
            wsClassifier.getPredictionAnalysis(testDoc, k, features, corpus.injectWhitespace, Trainer.MAX_WS_CONTEXT_DIFF_THRESHOLD);

        // The alignment analysis is only generated when a newline was predicted.
        if ((injectNL_WS & 0xFF) == Trainer.CAT_INJECT_NL)
        {
            alignAnalysis = alignPredictionString + "\n" +
                hposClassifier.getPredictionAnalysis(testDoc, k, featuresForAlign, corpus.hpos, Trainer.MAX_ALIGN_CONTEXT_DIFF_THRESHOLD);
        }
    }

    TokenPositionAnalysis a = new TokenPositionAnalysis(curToken, injectNL_WS, newlineAnalysis, alignOrIndent, alignAnalysis);
    a.actualWS = actualWS; // reuse the value computed above instead of calling getInjectWSCategory again
    a.actualAlign = actualAlignCategory;
    return a;
}
/// <summary>
/// Handles one token of the test document: classifies the whitespace that should
/// precede it, appends that whitespace (newlines plus alignment/indentation, or
/// spaces) to the output, records the token's new line/column, stores its
/// <see cref="TokenPositionAnalysis"/>, and finally appends the token text.
/// </summary>
/// <param name="indexIntoRealTokens">
/// Index of this token in <c>realTokens</c>; the entry just before it is used for the
/// "cannot join" check.
/// </param>
/// <param name="tokenIndexInStream">Index of this token in the test document's token stream.</param>
/// <param name="collectAnalysis">Forwarded to <c>getTokenAnalysis</c> to control whether textual analysis is built.</param>
public virtual void processToken(int indexIntoRealTokens, int tokenIndexInStream, bool collectAnalysis)
{
    CommonToken token = (CommonToken)testDoc.tokens.Get(tokenIndexInStream);
    string text = token.Text;
    TerminalNode leaf = tokenToNodeMap[token];

    int[] features = getFeatures(testDoc, tokenIndexInStream);
    // Independent copy: the alignment classifier gets a tweaked version of the features below.
    int[] featuresForAlign = (int[])features.Clone();

    int injectNL_WS = emitCommentsToTheLeft(
        tokenIndexInStream,
        wsClassifier.classify(k, features, Trainer.MAX_WS_CONTEXT_DIFF_THRESHOLD));

    // The low byte of the prediction selects the category; the helpers decode the count.
    int wsCategory = injectNL_WS & 0xFF;
    int newlineCount = 0;
    int spaceCount = 0;
    if (wsCategory == Trainer.CAT_INJECT_NL)
    {
        newlineCount = Trainer.unnlcat(injectNL_WS);
    }
    else if (wsCategory == Trainer.CAT_INJECT_WS)
    {
        spaceCount = Trainer.unwscat(injectNL_WS);
    }

    // Failsafe: if nothing was predicted but the previous real token cannot be
    // joined with this one, force a single space between them.
    bool nothingPredicted = newlineCount == 0 && spaceCount == 0;
    if (nothingPredicted && cannotJoin(realTokens[indexIntoRealTokens - 1], token))
    {
        spaceCount = 1;
    }

    int alignOrIndent = Trainer.CAT_ALIGN;
    if (newlineCount <= 0)
    {
        // No newline: emit plain spaces instead.
        output.Append(Tool.spaces(spaceCount));
        charPosInLine += spaceCount;
    }
    else
    {
        output.Append(Tool.newlines(newlineCount));
        line += newlineCount;
        charPosInLine = 0;

        // getFeatures() has no idea which line this token ends up on. Since a newline
        // was predicted, mark the token as first on its line so that the alignment
        // classifier matches exemplars that start a line.
        featuresForAlign[Trainer.INDEX_FIRST_ON_LINE] = 1;
        alignOrIndent = hposClassifier.classify(k, featuresForAlign, Trainer.MAX_ALIGN_CONTEXT_DIFF_THRESHOLD);

        int hposCategory = alignOrIndent & 0xFF;
        if (hposCategory == Trainer.CAT_ALIGN_WITH_ANCESTOR_CHILD)
        {
            align(alignOrIndent, leaf);
        }
        else if (hposCategory == Trainer.CAT_INDENT_FROM_ANCESTOR_CHILD)
        {
            indent(alignOrIndent, leaf);
        }
        else if (hposCategory == Trainer.CAT_ALIGN)
        {
            // Line up with the first token of the previous line, if there is one.
            IList<Token> previousLineTokens = Trainer.getTokensOnPreviousLine(testDoc.tokens, tokenIndexInStream, line);
            if (previousLineTokens.Count > 0)
            {
                int indentCol = previousLineTokens[0].Column;
                charPosInLine = indentCol;
                output.Append(Tool.spaces(indentCol));
            }
        }
        else if (hposCategory == Trainer.CAT_INDENT)
        {
            indent(alignOrIndent, leaf);
        }
    }

    // The token is about to be emitted, so stamp it with its final position.
    token.Line = line;
    token.Column = charPosInLine;

    TokenPositionAnalysis positionAnalysis = getTokenAnalysis(features, featuresForAlign, tokenIndexInStream, injectNL_WS, alignOrIndent, collectAnalysis);
    analysis[tokenIndexInStream] = positionAnalysis;

    // Record the inclusive character range the token text will occupy in the output.
    int textLength = text.Length;
    positionAnalysis.charIndexStart = output.Length;
    positionAnalysis.charIndexStop = positionAnalysis.charIndexStart + textLength - 1;

    // Emit the token itself.
    output.Append(text);
    charPosInLine += textLength;
}