/// <summary>
/// Format the document. Does not affect/alter doc.
/// Works on a duplicate of <paramref name="doc"/> whose tokens have had their
/// line/column info wiped, so formatting decisions cannot accidentally consult
/// the ground-truth layout. One-shot: a Formatter instance may only format once.
/// </summary>
/// <param name="doc">Document to format; left unmodified (a dup is made).</param>
/// <param name="collectAnalysis">Passed through to processToken; presumably fills
/// the per-token <c>analysis</c> slots — TODO confirm against processToken.</param>
/// <returns>The formatted text of the document.</returns>
/// <exception cref="System.ArgumentException">If format() was already called on this instance.</exception>
public virtual string format(InputDocument doc, bool collectAnalysis)
{
    // One-shot guard: releaseMemory() at the end nulls most fields, so a second
    // call could not work anyway.
    if (testDoc != null)
    {
        throw new System.ArgumentException("can't call format > once");
    }

    // for debugging we need a map from original token with actual line:col to
    // tree node. used by token analysis
    originalDoc = doc;
    originalTokenToNodeMap = Trainer.indexTree(doc.tree);
    originalTokens = doc.tokens;

    this.testDoc = InputDocument.dup(doc); // make copy of doc, getting new tokens, tree
    output = new StringBuilder();
    this.realTokens = Trainer.getRealTokens(testDoc.tokens);

    // squeeze out ws and kill any line/col info so we can't use ground truth by
    // mistake (all except for first token)
    wipeCharPositionInfoAndWhitespaceTokens(testDoc.tokens);

    // Two k-NN models: one predicts injected whitespace, one horizontal position.
    wsClassifier = new kNNClassifier(corpus, wsFeatures, corpus.injectWhitespace);
    hposClassifier = new kNNClassifier(corpus, hposFeatures, corpus.hpos);

    // Pre-size analysis with one (initially null) slot per token in the stream.
    analysis = new ArrayList<TokenPositionAnalysis>(testDoc.tokens.Size);
    for (int i = 0; i < testDoc.tokens.Size; ++i)
    {
        analysis.Add(null);
    }

    // make an index on the duplicated doc tree with tokens missing line:col info
    if (tokenToNodeMap == null)
    {
        tokenToNodeMap = Trainer.indexTree(testDoc.tree);
    }

    // Copy everything up to and including the first real token verbatim from the
    // ORIGINAL token stream (gets any comments in front + first real token);
    // formatting proper starts after it.
    IToken firstToken = testDoc.tokens.getNextRealToken(-1);
    string prefix = originalTokens.GetText(Interval.Of(0, firstToken.TokenIndex));
    charPosInLine = firstToken.Column + firstToken.Text.Length + 1; // start where first token left off
    line = Tool.count(prefix, '\n') + 1;
    output.Append(prefix);

    // first identify oversize lists with separators
    IdentifyOversizeLists splitter = new IdentifyOversizeLists(corpus, testDoc.tokens, tokenToNodeMap);
    ParseTreeWalker.Default.Walk(splitter, testDoc.tree);
    tokenToListInfo = splitter.tokenToListInfo;

    // NOTE(review): realTokens was already computed above, before the wipe.
    // This second call looks redundant — unless wipeCharPositionInfoAndWhitespaceTokens
    // changes which tokens count as "real" (e.g. by re-channeling whitespace).
    // TODO confirm before removing either call. (Upstream CodeBuff has the same
    // double call.)
    realTokens = Trainer.getRealTokens(testDoc.tokens);

    for (int i = Trainer.ANALYSIS_START_TOKEN_INDEX; i < realTokens.Count; i++)
    {
        // can't process first token
        int tokenIndexInStream = realTokens[i].TokenIndex;
        processToken(i, tokenIndexInStream, collectAnalysis);
    }

    releaseMemory();
    return(output.ToString());
}
/// <summary>
/// Free anything we can to reduce memory footprint after a format().
/// Deliberately leaves <c>analysis</c> and <c>testDoc</c> alone, since callers
/// read those for results.
/// </summary>
public virtual void releaseMemory()
{
    // Training-side data.
    corpus = null;
    wsClassifier = null;
    hposClassifier = null;

    // Per-document token/tree indexes.
    originalTokens = null;
    originalTokenToNodeMap = null;
    realTokens = null;
    tokenToNodeMap = null;
    tokenToListInfo = null;
}