Exemple #1
0
        /// <summary>
        /// Formats <paramref name="doc"/> and returns the resulting text.
        /// Formatting is carried out on a copy produced by <c>InputDocument.dup</c>;
        /// per the original author's note, <paramref name="doc"/> itself is not altered.
        /// </summary>
        /// <param name="doc">The document to format.</param>
        /// <param name="collectAnalysis">Passed through unchanged to <c>processToken</c> for each processed token.</param>
        /// <returns>The accumulated contents of the output buffer.</returns>
        /// <exception cref="System.ArgumentException">Thrown when <c>testDoc</c> is already set, i.e. this method was called before.</exception>
        public virtual string format(InputDocument doc, bool collectAnalysis)
        {
            // A formatter instance is single-use: testDoc is only assigned below.
            if (testDoc != null)
            {
                throw new System.ArgumentException("can't call format > once");
            }

            // Keep the untouched original (tokens still carry real line:col) plus a
            // token -> tree node index over it; the token analysis uses these for debugging.
            originalDoc = doc;
            originalTokenToNodeMap = Trainer.indexTree(doc.tree);
            originalTokens = doc.tokens;

            // Work on a duplicate so that we get fresh tokens and a fresh tree.
            testDoc = InputDocument.dup(doc);
            output = new StringBuilder();
            realTokens = Trainer.getRealTokens(testDoc.tokens);

            // Remove whitespace tokens and erase position info (all except for the first
            // token) so that ground truth cannot be consulted by accident.
            wipeCharPositionInfoAndWhitespaceTokens(testDoc.tokens);
            wsClassifier = new kNNClassifier(corpus, wsFeatures, corpus.injectWhitespace);
            hposClassifier = new kNNClassifier(corpus, hposFeatures, corpus.hpos);

            // One (initially empty) analysis slot per token of the duplicated stream.
            analysis = new ArrayList <TokenPositionAnalysis>(testDoc.tokens.Size);
            for (int slot = 0; slot < testDoc.tokens.Size; ++slot)
            {
                analysis.Add(null);
            }

            // Index the duplicated tree, whose tokens lack line:col info, unless an
            // index is already present.
            if (tokenToNodeMap == null)
            {
                tokenToNodeMap = Trainer.indexTree(testDoc.tree);
            }

            // Emit everything up to and including the first real token verbatim; this
            // picks up any comments that precede it.
            IToken leadingToken = testDoc.tokens.getNextRealToken(-1);
            string leadingText = originalTokens.GetText(Interval.Of(0, leadingToken.TokenIndex));

            // Resume right after the first token.
            charPosInLine = leadingToken.Column + leadingToken.Text.Length + 1;
            line = Tool.count(leadingText, '\n') + 1;
            output.Append(leadingText);

            // Identify oversize lists with separators before processing tokens.
            IdentifyOversizeLists listFinder = new IdentifyOversizeLists(corpus, testDoc.tokens, tokenToNodeMap);
            ParseTreeWalker.Default.Walk(listFinder, testDoc.tree);
            tokenToListInfo = listFinder.tokenToListInfo;

            // Recompute the real tokens, then process each one from the analysis start
            // index onward (the first token cannot be processed).
            realTokens = Trainer.getRealTokens(testDoc.tokens);
            for (int realIndex = Trainer.ANALYSIS_START_TOKEN_INDEX; realIndex < realTokens.Count; realIndex++)
            {
                processToken(realIndex, realTokens[realIndex].TokenIndex, collectAnalysis);
            }

            releaseMemory();

            return output.ToString();
        }
Exemple #2
0
 /// <summary>
 /// Drops references that are no longer needed once a format() has completed,
 /// to shrink the memory footprint. <c>analysis</c> and <c>testDoc</c> are
 /// deliberately left in place because they are used for results.
 /// </summary>
 public virtual void releaseMemory()
 {
     // Training data and the classifiers built from it.
     corpus = null;
     wsClassifier = null;
     hposClassifier = null;

     // Token lists.
     realTokens = null;
     originalTokens = null;

     // Lookup tables keyed by token.
     tokenToNodeMap = null;
     originalTokenToNodeMap = null;
     tokenToListInfo = null;
 }