示例#1
0
        /// <summary>
        /// Builds the <c>TokenPositionAnalysis</c> for the token at
        /// <paramref name="tokenIndexInStream"/>: the predicted whitespace and
        /// alignment categories are stored next to the categories that
        /// <c>Trainer</c> computes from the original document.
        /// </summary>
        /// <param name="features">Feature vector passed to <c>wsClassifier.getPredictionAnalysis</c>.</param>
        /// <param name="featuresForAlign">Feature vector passed to <c>hposClassifier.getPredictionAnalysis</c>.</param>
        /// <param name="tokenIndexInStream">Index of the token in <c>originalDoc.tokens</c>.</param>
        /// <param name="injectNL_WS">Predicted newline/whitespace category.</param>
        /// <param name="alignOrIndent">Predicted alignment/indentation category.</param>
        /// <param name="collectAnalysis">
        /// When true, human-readable analysis text is generated (this can be slow);
        /// when false both analysis strings are left empty.
        /// </param>
        /// <returns>
        /// A new analysis object carrying the predicted categories, the analysis
        /// text (possibly empty) and the <c>actualWS</c>/<c>actualAlign</c> categories.
        /// </returns>
        public virtual TokenPositionAnalysis getTokenAnalysis(int[] features, int[] featuresForAlign, int tokenIndexInStream, int injectNL_WS, int alignOrIndent, bool collectAnalysis)
        {
            CommonToken  curToken              = (CommonToken)originalDoc.tokens.Get(tokenIndexInStream);
            TerminalNode nodeWithOriginalToken = originalTokenToNodeMap[curToken];

            // Categories derived from the original document. They are always needed
            // for the result, so compute each exactly once up front.
            int actualWS            = Trainer.getInjectWSCategory(originalTokens, tokenIndexInStream);
            int actualAlignCategory = Trainer.getAlignmentCategory(originalDoc, nodeWithOriginalToken, indentSize);

            string newlineAnalysis = "";
            string alignAnalysis   = "";

            if (collectAnalysis)
            {             // this can be slow
                // Display strings are only consumed by the analysis text, so they are
                // built here rather than for every token.
                // NOTE(review): assumes getWSCategoryStr/getHPosCategoryStr are
                // side-effect free -- confirm against their definitions.
                string none = string.Format("{0,8}", "none");

                string wsDisplay  = getWSCategoryStr(injectNL_WS) ?? none;
                string actualWSNL = getWSCategoryStr(actualWS) ?? none;

                string newlinePredictionString = string.Format("### line {0:D}: predicted {1} actual {2}", curToken.Line, wsDisplay, actualWSNL);

                newlineAnalysis = newlinePredictionString + "\n" + wsClassifier.getPredictionAnalysis(testDoc, k, features, corpus.injectWhitespace, Trainer.MAX_WS_CONTEXT_DIFF_THRESHOLD);

                // Alignment analysis is only produced when a newline was predicted.
                if ((injectNL_WS & 0xFF) == Trainer.CAT_INJECT_NL)
                {
                    string alignDisplay       = getHPosCategoryStr(alignOrIndent) ?? none;
                    string actualAlignDisplay = getHPosCategoryStr(actualAlignCategory) ?? none;

                    string alignPredictionString = string.Format("### line {0:D}: predicted {1} actual {2}", curToken.Line, alignDisplay, actualAlignDisplay);

                    alignAnalysis = alignPredictionString + "\n" + hposClassifier.getPredictionAnalysis(testDoc, k, featuresForAlign, corpus.hpos, Trainer.MAX_ALIGN_CONTEXT_DIFF_THRESHOLD);
                }
            }

            TokenPositionAnalysis a = new TokenPositionAnalysis(curToken, injectNL_WS, newlineAnalysis, alignOrIndent, alignAnalysis);

            // Reuse the category computed above instead of asking Trainer again.
            a.actualWS    = actualWS;
            a.actualAlign = actualAlignCategory;
            return a;
        }
示例#2
0
        /// <summary>
        /// Emits one token of <c>testDoc</c> into <c>output</c>: classifies the
        /// whitespace that should precede it, appends that whitespace (newlines
        /// plus indentation/alignment, or plain spaces), updates the token's
        /// line/column, stores a <c>TokenPositionAnalysis</c> in <c>analysis</c>,
        /// and finally appends the token text.
        /// </summary>
        /// <param name="indexIntoRealTokens">
        /// Index of this token in <c>realTokens</c>; the element just before it is
        /// read, so callers presumably never pass 0 -- TODO confirm.
        /// </param>
        /// <param name="tokenIndexInStream">Index of this token in <c>testDoc.tokens</c>.</param>
        /// <param name="collectAnalysis">Forwarded to <c>getTokenAnalysis</c>.</param>
        public virtual void processToken(int indexIntoRealTokens, int tokenIndexInStream, bool collectAnalysis)
        {
            CommonToken  token     = (CommonToken)testDoc.tokens.Get(tokenIndexInStream);
            string       text      = token.Text;
            TerminalNode tokenNode = tokenToNodeMap[token];

            // The alignment classifier gets its own copy because one entry is
            // overwritten below when a newline is emitted.
            int[] wsFeatures    = getFeatures(testDoc, tokenIndexInStream);
            int[] alignFeatures = (int[])wsFeatures.Clone();

            int wsPrediction = emitCommentsToTheLeft(
                tokenIndexInStream,
                wsClassifier.classify(k, wsFeatures, Trainer.MAX_WS_CONTEXT_DIFF_THRESHOLD));

            // Decode the prediction into a newline count or a space count.
            int wsCategory   = wsPrediction & 0xFF;
            int newlineCount = 0;
            int spaceCount   = 0;

            if (wsCategory == Trainer.CAT_INJECT_NL)
            {
                newlineCount = Trainer.unnlcat(wsPrediction);
            }
            else if (wsCategory == Trainer.CAT_INJECT_WS)
            {
                spaceCount = Trainer.unwscat(wsPrediction);
            }

            // Failsafe: nothing was predicted, yet cannotJoin() says the previous
            // real token and this one must not be emitted back to back.
            bool nothingPredicted = newlineCount == 0 && spaceCount == 0;
            if (nothingPredicted && cannotJoin(realTokens[indexIntoRealTokens - 1], token))
            {
                spaceCount = 1;
            }

            int hposPrediction = Trainer.CAT_ALIGN;

            if (newlineCount <= 0)
            {
                // No newline: inject plain whitespace instead.
                output.Append(Tool.spaces(spaceCount));
                charPosInLine += spaceCount;
            }
            else
            {
                output.Append(Tool.newlines(newlineCount));
                line         += newlineCount;
                charPosInLine = 0;

                // getFeatures() doesn't know which line the token is on. Since a
                // newline is being emitted, mark it as first on its line so the
                // alignment classifier matches exemplars that start a line.
                alignFeatures[Trainer.INDEX_FIRST_ON_LINE] = 1;

                hposPrediction = hposClassifier.classify(k, alignFeatures, Trainer.MAX_ALIGN_CONTEXT_DIFF_THRESHOLD);

                int hposCategory = hposPrediction & 0xFF;

                if (hposCategory == Trainer.CAT_ALIGN_WITH_ANCESTOR_CHILD)
                {
                    align(hposPrediction, tokenNode);
                }
                else if (hposCategory == Trainer.CAT_INDENT_FROM_ANCESTOR_CHILD)
                {
                    indent(hposPrediction, tokenNode);
                }
                else if (hposCategory == Trainer.CAT_ALIGN)
                {
                    // Start at the column of the first token on the previous line,
                    // when that line has any tokens.
                    IList <Token> previousLineTokens = Trainer.getTokensOnPreviousLine(testDoc.tokens, tokenIndexInStream, line);
                    if (previousLineTokens.Count > 0)
                    {
                        int startColumn = previousLineTokens[0].Column;
                        charPosInLine = startColumn;
                        output.Append(Tool.spaces(startColumn));
                    }
                }
                else if (hposCategory == Trainer.CAT_INDENT)
                {
                    indent(hposPrediction, tokenNode);
                }
            }

            // Record the position on the Token object now that it is about to be emitted.
            token.Line   = line;
            token.Column = charPosInLine;

            TokenPositionAnalysis positionAnalysis = getTokenAnalysis(wsFeatures, alignFeatures, tokenIndexInStream, wsPrediction, hposPrediction, collectAnalysis);

            analysis[tokenIndexInStream] = positionAnalysis;

            // Character span the token text will occupy in the output buffer.
            int textLength = text.Length;

            positionAnalysis.charIndexStart = output.Length;
            positionAnalysis.charIndexStop  = positionAnalysis.charIndexStart + textLength - 1;

            // Emit the token itself.
            output.Append(text);
            charPosInLine += textLength;
        }