/// <summary>
/// Record one training exemplar. The corpus keeps four parallel lists that
/// stay index-aligned: entry i in each list describes the same exemplar
/// (its source document, feature vector, whitespace label, and hpos label).
/// </summary>
/// <param name="doc">document the exemplar was extracted from</param>
/// <param name="features">context feature vector for the token</param>
/// <param name="ws">whitespace-injection category label</param>
/// <param name="hpos">horizontal-position (alignment/indent) category label</param>
public virtual void addExemplar(InputDocument doc, int[] features, int ws, int hpos)
{
	documentsPerExemplar.Add(doc);
	featureVectors.Add(features);
	injectWhitespace.Add(ws);
	this.hpos.Add(hpos); // 'this.' disambiguates the field from the parameter
}
/// <summary>
/// Shuffle all four parallel exemplar lists in place, applying the same
/// permutation to each so they remain index-aligned.
///
/// Feature vectors are appended per document as the corpus is read, so
/// without shuffling, kNN tends to pull all k neighbors from a single
/// corpus document instead of sampling across the corpus (for k=11 we
/// might only ever see exemplars from one file).
///
/// Uses the Fisher-Yates / Knuth shuffle with a fixed seed so training
/// is reproducible:
/// https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
/// </summary>
public virtual void randomShuffleInPlace()
{
	Random rnd = new Random(FEATURE_VECTOR_RANDOM_SEED); // fixed seed => deterministic order
	int count = featureVectors.Count;
	// "for i from n-1 downto 1": swap element i with a random element in [0, i]
	for (int last = count - 1; last >= 1; last--)
	{
		int pick = rnd.Next(last + 1); // random j such that 0 <= j <= last

		// swap feature vectors
		int[] vec = featureVectors[last];
		featureVectors[last] = featureVectors[pick];
		featureVectors[pick] = vec;

		// swap whitespace prediction labels
		int ws = injectWhitespace[last];
		injectWhitespace[last] = injectWhitespace[pick];
		injectWhitespace[pick] = ws;

		// swap hpos prediction labels
		int hp = hpos[last];
		hpos[last] = hpos[pick];
		hpos[pick] = hp;

		// swap source documents
		InputDocument srcDoc = documentsPerExemplar[last];
		documentsPerExemplar[last] = documentsPerExemplar[pick];
		documentsPerExemplar[pick] = srcDoc;
	}
}
/// <summary>
/// Render this neighbor (one corpus exemplar) for prediction analysis output:
/// its feature vector, the predicted category display string, the distance to
/// the unknown vector, and the source line with a middle-dot marking the
/// char position of the token.
/// </summary>
/// <param name="FEATURES">feature metadata used to stringify the vector</param>
/// <param name="Y">category labels parallel to the corpus feature vectors</param>
public virtual string ToString(FeatureMetaData[] FEATURES, IList <int> Y)
{
	int[] X = corpus.featureVectors[corpusVectorIndex];
	InputDocument doc = corpus.documentsPerExemplar[corpusVectorIndex];
	string features = Trainer._toString(FEATURES, doc, X);
	// line/charpos of the exemplar token are carried inside the feature vector
	int line = X[Trainer.INDEX_INFO_LINE];
	string lineText = doc.getLine(line);
	int col = X[Trainer.INDEX_INFO_CHARPOS];
	// insert a dot (U+00B7) right before char position to mark the token start
	if (!string.ReferenceEquals(lineText, null))
	{
		lineText = lineText.Substring(0, col) + '\u00B7' + lineText.Substring(col, lineText.Length - col);
	}
	int cat = Y[corpusVectorIndex];
	int[] elements = Trainer.triple(cat); // NOTE(review): computed but unused here — presumably kept for debugging
	// category encodes either a ws or an hpos decision; only one stringifier returns non-null
	string wsDisplay = Formatter.getWSCategoryStr(cat);
	string alignDisplay = Formatter.getHPosCategoryStr(cat);
	string display = !string.ReferenceEquals(wsDisplay, null) ? wsDisplay : alignDisplay;
	if (string.ReferenceEquals(display, null))
	{
		display = string.Format("{0,8}", "none"); // right-pad to 8 chars to line up columns
	}
	return(string.Format("{0} ({1},d={2,1:F3}): {3}", features, display, distance, lineText));
}
/// <summary>
/// Format the document. Does not affect/alter doc: formatting runs on a
/// duplicate whose whitespace tokens and line/col info are wiped so the
/// formatter cannot accidentally use ground truth. Can only be called once
/// per Formatter instance.
/// </summary>
/// <param name="doc">document to format (left unmodified)</param>
/// <param name="collectAnalysis">if true, per-token prediction analysis is gathered</param>
/// <returns>the formatted text</returns>
/// <exception cref="System.ArgumentException">if format() was already called on this instance</exception>
public virtual string format(InputDocument doc, bool collectAnalysis)
{
	if (testDoc != null)
	{
		throw new System.ArgumentException("can't call format > once");
	}
	// for debugging we need a map from original token with actual line:col to tree node. used by token analysis
	originalDoc = doc;
	originalTokenToNodeMap = Trainer.indexTree(doc.tree);
	originalTokens = doc.tokens;
	this.testDoc = InputDocument.dup(doc); // make copy of doc, getting new tokens, tree
	output = new StringBuilder();
	this.realTokens = Trainer.getRealTokens(testDoc.tokens);
	// squeeze out ws and kill any line/col info so we can't use ground truth by mistake
	wipeCharPositionInfoAndWhitespaceTokens(testDoc.tokens); // all except for first token
	// one classifier per decision: whitespace injection and horizontal position
	wsClassifier = new kNNClassifier(corpus, wsFeatures, corpus.injectWhitespace);
	hposClassifier = new kNNClassifier(corpus, hposFeatures, corpus.hpos);
	// pre-size analysis with null slots, one per token in the stream
	analysis = new ArrayList <TokenPositionAnalysis>(testDoc.tokens.Size);
	for (int i = 0; i < testDoc.tokens.Size; ++i)
	{
		analysis.Add(null);
	}
	// make an index on the duplicated doc tree with tokens missing line:col info
	if (tokenToNodeMap == null)
	{
		tokenToNodeMap = Trainer.indexTree(testDoc.tree);
	}
	IToken firstToken = testDoc.tokens.getNextRealToken(-1);
	string prefix = originalTokens.GetText(Interval.Of(0, firstToken.TokenIndex)); // gets any comments in front + first real token
	// seed output position from the first token; everything after is predicted
	charPosInLine = firstToken.Column + firstToken.Text.Length + 1; // start where first token left off
	line = Tool.count(prefix, '\n') + 1;
	output.Append(prefix);
	// first identify oversize lists with separators
	IdentifyOversizeLists splitter = new IdentifyOversizeLists(corpus, testDoc.tokens, tokenToNodeMap);
	ParseTreeWalker.Default.Walk(splitter, testDoc.tree);
	tokenToListInfo = splitter.tokenToListInfo;
	realTokens = Trainer.getRealTokens(testDoc.tokens);
	for (int i = Trainer.ANALYSIS_START_TOKEN_INDEX; i < realTokens.Count; i++)
	{
		// can't process first token
		int tokenIndexInStream = realTokens[i].TokenIndex;
		processToken(i, tokenIndexInStream, collectAnalysis);
	}
	releaseMemory();
	return(output.ToString());
}
/// <summary>
/// Build a human-readable report of how kNN classified one unknown feature
/// vector: the feature header, the similarity map and predicted category,
/// then the k nearest neighbors one per line. Neighbor lookups are memoized
/// in neighborCache keyed by the feature vector.
/// </summary>
/// <param name="doc">document the unknown vector came from (for display)</param>
/// <param name="k">number of neighbors to consider</param>
/// <param name="unknown">feature vector being classified</param>
/// <param name="Y">category labels parallel to the corpus feature vectors</param>
/// <param name="distanceThreshold">max distance for the strict first-pass match</param>
public virtual string getPredictionAnalysis(InputDocument doc, int k, int[] unknown, IList <int> Y, double distanceThreshold)
{
	FeatureVectorAsObject key = new FeatureVectorAsObject(unknown, FEATURES);
	Neighbor[] kNN = null;
	neighborCache.TryGetValue(key, out kNN);
	nNNCalls++; // NOTE(review): presumably cache-hit-rate counters reported elsewhere — verify
	if (kNN == null)
	{
		kNN = this.kNN(unknown, k, distanceThreshold);
		neighborCache[key] = kNN;
	}
	else
	{
		nNNCacheHits++;
	}
	IDictionary <int, MutableDouble> similarities = getCategoryToSimilarityMap(kNN, k, Y);
	int cat = getCategoryWithMaxValue(similarities);
	if (cat == -1)
	{
		// try with less strict match threshold to get some indication of alignment
		kNN = this.kNN(unknown, k, org.antlr.codebuff.Trainer.MAX_CONTEXT_DIFF_THRESHOLD2);
		similarities = getCategoryToSimilarityMap(kNN, k, Y);
		cat = getCategoryWithMaxValue(similarities);
	}
	// low byte of the category selects which display stringifier applies
	string displayCat;
	int c = cat & 0xFF;
	if (c == org.antlr.codebuff.Trainer.CAT_INJECT_NL || c == org.antlr.codebuff.Trainer.CAT_INJECT_WS)
	{
		displayCat = Formatter.getWSCategoryStr(cat);
	}
	else
	{
		displayCat = Formatter.getHPosCategoryStr(cat);
	}
	displayCat = !string.ReferenceEquals(displayCat, null) ? displayCat : "none";
	StringBuilder buf = new StringBuilder();
	buf.Append(Trainer.featureNameHeader(FEATURES));
	buf.Append(Trainer._toString(FEATURES, doc, unknown) + "->" + similarities + " predicts " + displayCat);
	buf.Append("\n");
	if (kNN.Length > 0)
	{
		// show at most k neighbors even if the lookup returned more
		kNN = kNN.Take(Math.Min(k, kNN.Length)).ToArray();
		foreach (Neighbor n in kNN)
		{
			buf.Append(n.ToString(FEATURES, Y));
			buf.Append("\n");
		}
	}
	return(buf.ToString());
}
/// <summary>
/// Duplicate a document by reparsing it, yielding fresh tokens and a fresh
/// parse tree. When the document has no backing file name, it is reparsed
/// from the in-memory unformatted input instead.
/// NOTE(review): the no-file branch reads Tool.unformatted_input rather than
/// old.content — presumably they are the same for string-input documents;
/// verify against Tool.Main.
/// </summary>
public static InputDocument dup(InputDocument old)
{
	if (string.IsNullOrEmpty(old.fileName))
	{
		// no backing file: reparse from the in-memory input string
		return(Tool.parse(old.fileName, Tool.unformatted_input, old.language));
	}
	// reparse the file to get new tokens, tree
	return(Tool.parse(old.fileName, old.language));
}
/// <summary>
/// Parse source text into an InputDocument using two-stage parsing: first
/// the fast SLL prediction mode with a bail-out error strategy; if that is
/// cancelled, retry with full LL prediction and standard error handling.
/// On syntax errors in the LL pass the document's tree is set to null.
/// NOTE(review): exceptions whose InnerException is not ParseCanceledException
/// are silently swallowed here, so the returned doc may have no tree —
/// confirm callers tolerate doc.Tree == null.
/// </summary>
/// <param name="fileName">name recorded on the input stream and document</param>
/// <param name="content">source text to parse</param>
/// <param name="language">provides lexer/parser classes and the start rule name</param>
public static InputDocument parse(string fileName, string content, LangDescriptor language)
{
	ANTLRInputStream input = new ANTLRInputStream(content);
	Lexer lexer = getLexer(language.lexerClass, input);
	input.name = fileName;
	InputDocument doc = new InputDocument(fileName, content, language);
	doc.tokens = new CodeBuffTokenStream(lexer);
	doc.parser = getParser(language.parserClass, doc.tokens);
	doc.parser.BuildParseTree = true;
	// two-stage parsing. Try with SLL first
	doc.parser.Interpreter.PredictionMode = Antlr4.Runtime.Atn.PredictionMode.SLL;
	doc.parser.ErrorHandler = new BailErrorStrategy(); // bail fast so we can retry with LL
	doc.parser.RemoveErrorListeners();
	// start rule is invoked reflectively by name from the language descriptor
	MethodInfo startRule = language.parserClass.GetMethod(language.startRuleName);
	try
	{
		doc.Tree = (ParserRuleContext)startRule.Invoke(doc.parser, (object[])null);
	}
	catch (Exception ex)
	{
		// reflection wraps the parser's cancellation in InnerException
		if (ex.InnerException is ParseCanceledException)
		{
			doc.parser.Reset();
			doc.tokens.Reset(); // rewind input stream
			// back to standard listeners/handlers
			doc.parser.AddErrorListener(new ANTLRErrorListenerAnonymousInnerClass());
			doc.parser.ErrorHandler = new DefaultErrorStrategy();
			doc.parser.Interpreter.PredictionMode = PredictionMode.LL;
			doc.Tree = (ParserRuleContext)startRule.Invoke(doc.parser, (object[])null);
			if (doc.parser.NumberOfSyntaxErrors > 0)
			{
				doc.Tree = null; // LL pass also failed; mark document unparseable
			}
		}
	}
	return(doc);
}
/// <summary>
/// Compute the context feature vector for the token at tokenIndexInStream,
/// including the "previous token starts its line" and "current token starts
/// a new line" indicator features. Returns null (with a diagnostic on stderr)
/// when the token has no associated parse-tree node.
/// </summary>
public virtual int[] getFeatures(InputDocument doc, int tokenIndexInStream)
{
	Token before = doc.tokens.getPreviousRealToken(tokenIndexInStream);
	Token beforeBefore = before != null ? doc.tokens.getPreviousRealToken(before.TokenIndex) : null;
	// previous token begins its line iff it sits on a later line than the token before it
	bool previousStartsLine = before != null && beforeBefore != null && before.Line > beforeBefore.Line;

	TerminalNode node = tokenToNodeMap[doc.tokens.Get(tokenIndexInStream)];
	if (node == null)
	{
		Console.Error.WriteLine("### No node associated with token " + doc.tokens.Get(tokenIndexInStream));
		return(null);
	}
	Token current = node.Symbol;
	// at start of file (no previous token) the current token trivially starts a line;
	// otherwise compare the formatter's current output line against the previous token's line
	bool currentStartsLine = before == null || line > before.Line;

	int[] vector = Trainer.getContextFeatures(corpus, tokenToNodeMap, doc, tokenIndexInStream);
	Trainer.setListInfoFeatures(tokenToListInfo, vector, current);
	vector[Trainer.INDEX_PREV_FIRST_ON_LINE] = previousStartsLine ? 1 : 0;
	vector[Trainer.INDEX_FIRST_ON_LINE] = currentStartsLine ? 1 : 0;
	return(vector);
}
/// <summary>
/// Debug entry point: runs leave-one-out validation on a single test file,
/// reports the normalized Levenshtein distance between original and
/// formatted output (overall and whitespace-only), prints classification
/// analysis, and dumps classifier cache statistics.
/// Usage: Dbg [-leave-one-out] [-java|-java8|-antlr|-sqlite|-tsql] test-file
/// </summary>
public static void Main(string[] args)
{
	if (args.Length < 2)
	{
		Console.Error.WriteLine("Dbg [-leave-one-out] [-java|-java8|-antlr|-sqlite|-tsql] test-file");
		return; // FIX: previously fell through and indexed args[] out of range
	}
	int arg = 0;
	bool collectAnalysis = true;
	string language = args[arg++];
	language = language.Substring(1); // strip leading '-' from e.g. "-java"
	string testFilename = args[arg];
	string output = "???";
	InputDocument testDoc = null;
	IList <TokenPositionAnalysis> analysisPerToken = null;
	LangDescriptor lang = null;
	System.DateTime start, stop; // timing; report currently commented out below
	// resolve the language descriptor by name
	for (int i = 0; i < Tool.languages.Length; i++)
	{
		if (Tool.languages[i].name.Equals(language))
		{
			lang = Tool.languages[i];
			break;
		}
	}
	if (lang != null)
	{
		start = System.DateTime.Now;
		// leave-one-out: train on the corpus minus the test file, then format it
		LeaveOneOutValidator validator = new LeaveOneOutValidator(lang.corpusDir, lang);
		Triple <Formatter, float, float> val = validator.validateOneDocument(testFilename, null, collectAnalysis);
		testDoc = Tool.parse(testFilename, lang);
		stop = System.DateTime.Now;
		Formatter formatter = val.a;
		output = formatter.Output;
		Console.WriteLine("output len = " + output.Length);
		// overall edit distance between ground truth and formatted output
		float editDistance = normalizedLevenshteinDistance(testDoc.content, output);
		Console.WriteLine("normalized Levenshtein distance: " + editDistance);
		analysisPerToken = formatter.AnalysisPerToken;
		// compare only the whitespace tokens of original vs. formatted text
		Regex rex = new Regex("^\\s+$");
		CommonTokenStream original_tokens = Tool.tokenize(testDoc.content, lang.lexerClass);
		IList <Token> wsTokens = BuffUtils.filter(original_tokens.GetTokens(), t => rex.IsMatch(t.Text));
		string originalWS = tokenText(wsTokens);
		Console.WriteLine("origin ws tokens len: " + originalWS.Length);
		CommonTokenStream formatted_tokens = Tool.tokenize(output, lang.lexerClass);
		wsTokens = BuffUtils.filter(formatted_tokens.GetTokens(), t => rex.IsMatch(t.Text));
		string formattedWS = tokenText(wsTokens);
		Console.WriteLine("formatted ws tokens len: " + formattedWS.Length);
		editDistance = levenshteinDistance(originalWS, formattedWS);
		editDistance /= Math.Max(testDoc.content.Length, output.Length);
		Console.WriteLine("Levenshtein distance of ws normalized to output len: " + editDistance);
		ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, analysisPerToken);
		Console.WriteLine(analysis);
		//Console.Write("formatting time {0:D}s\n", (stop - start) / 1000000);
		// classifier cache statistics
		Console.Write("classify calls {0:D}, hits {1:D} rate {2:F}\n", kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits, kNNClassifier.nClassifyCacheHits / (float)kNNClassifier.nClassifyCalls);
		Console.Write("kNN calls {0:D}, hits {1:D} rate {2:F}\n", kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits, kNNClassifier.nNNCacheHits / (float)kNNClassifier.nNNCalls);
	}
}
/// <summary>
/// Programmatic entry point taking pre-resolved arguments (including -lexer
/// and -parser Type objects). Parses the option list, loads and trains on
/// the corpus, formats the target file or in-memory string, writes/echoes
/// the result, and returns the accumulated log message or the formatted
/// output.
/// </summary>
/// <param name="args">option strings plus Type objects for -lexer/-parser</param>
/// <returns>formatted output, or the log contents if setup failed</returns>
public static string Main(object[] args)
{
	Log.Reset();
	try
	{
		if (args.Length < 7)
		{
			Log.WriteLine("org.antlr.codebuff.Tool -g grammar-name -rule start-rule -corpus root-dir-of-samples \\\n" + " [-files file-extension] [-indent num-spaces] \\" + " [-comment line-comment-name] [-o output-file] file-to-format");
			return(Log.Message());
		}
		formatted_output = null;
		string outputFileName = "";
		string grammarName = null;
		string startRule = null;
		string corpusDir = null;
		string indentS = "4";
		string commentS = null;
		string input_file_name = null;
		string fileExtension = null;
		int i = 0;
		Type parserClass = null;
		Type lexerClass = null;
		// consume "-flag value" pairs until the first non-flag argument
		while (i < args.Length && ((string)args[i]).StartsWith("-", StringComparison.Ordinal))
		{
			switch (args[i])
			{
			case "-g": i++; grammarName = (string)args[i++]; break;
			case "-lexer": i++; lexerClass = (Type)args[i++]; break;
			case "-parser": i++; parserClass = (Type)args[i++]; break;
			case "-rule": i++; startRule = (string)args[i++]; break;
			case "-corpus": i++; corpusDir = (string)args[i++]; break;
			case "-files": i++; fileExtension = (string)args[i++]; break;
			case "-indent": i++; indentS = (string)args[i++]; break;
			case "-comment": i++; commentS = (string)args[i++]; break;
			case "-o": i++; outputFileName = (string)args[i++]; break;
			case "-inoutstring": i++; formatted_output = ""; outputFileName = null; break;
			}
		}
		input_file_name = (string)args[i]; // must be last
		Log.WriteLine("gramm: " + grammarName);
		string lexerClassName = grammarName + "Lexer";
		Lexer lexer = null;
		if (lexerClass == null || parserClass == null)
		{
			Log.WriteLine("You must specify a lexer and parser.");
		}
		if (parserClass == null || lexerClass == null) // FIX: was bitwise '|' on bools
		{
			return(Log.Message());
		}
		int indentSize = int.Parse(indentS);
		// map the single-line-comment token name (if given) to its token type
		int singleLineCommentType = -1;
		if (!string.ReferenceEquals(commentS, null))
		{
			try
			{
				lexer = getLexer(lexerClass, null);
			}
			catch (Exception e)
			{
				Log.WriteLine("Can't instantiate lexer " + lexerClassName);
				Log.WriteLine(e.StackTrace);
			}
			if (lexer == null)
			{
				return(Log.Message());
			}
			IDictionary <string, int> tokenTypeMap = lexer.TokenTypeMap;
			if (tokenTypeMap.ContainsKey(commentS))
			{
				singleLineCommentType = tokenTypeMap[commentS];
			}
		}
		// build regex matching any of the ';'-separated file extensions, e.g. ".*\.(cs|java)"
		string fileRegex = null;
		if (!string.ReferenceEquals(fileExtension, null))
		{
			var pattern = "";
			var allowable_suffices = fileExtension.Split(';').ToList <string>();
			foreach (var s in allowable_suffices)
			{
				var no_dot = s.Substring(s.IndexOf('.') + 1);
				pattern = pattern == "" ? ("(" + no_dot) : (pattern + "|" + no_dot);
			}
			pattern = pattern + ")";
			fileRegex = ".*\\." + pattern;
		}
		LangDescriptor language = new LangDescriptor(grammarName, corpusDir, fileRegex, lexerClass, parserClass, startRule, indentSize, singleLineCommentType);

		////////
		// load all corpus files up front
		IList <string> allFiles = getFilenames(language.corpusDir, language.fileRegex);
		IList <InputDocument> documents = load(allFiles, language);

		// Handle formatting of document if it's passed as a string or not.
		if (unformatted_input == null)
		{
			// Don't include file to format in corpus itself.
			string path = System.IO.Path.GetFullPath(input_file_name);
			IList <InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
			// Perform training of formatter.
			Corpus corpus = new Corpus(others, language);
			corpus.train();
			// Parse code contained in file.
			InputDocument unformatted_document = parse(input_file_name, language);
			// Format document.
			Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS);
			formatted_output = formatter.format(unformatted_document, false);
		}
		else
		{
			// Perform training of formatter.
			Corpus corpus = new Corpus(documents, language);
			corpus.train();
			// Parse code that was represented as a string.
			InputDocument unformatted_document = parse(input_file_name, unformatted_input, language);
			// Format document.
			Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS);
			formatted_output = formatter.format(unformatted_document, false);
		}
		///////
		// empty name (set by default, or -inoutstring) => echo to log; non-empty => write file
		if (outputFileName != null && outputFileName == "")
		{
			Log.WriteLine(formatted_output);
		}
		else if (!string.IsNullOrEmpty(outputFileName))
		{
			org.antlr.codebuff.misc.Utils.writeFile(outputFileName, formatted_output);
		}
	}
	catch (Exception)
	{
		throw; // FIX: 'throw e;' reset the stack trace; rethrow preserves it
	}
	return(formatted_output);
}
/// <summary>
/// Command-line entry point. Parses options, reflectively loads the
/// generated lexer/parser classes named after the grammar, trains a
/// formatter on the corpus, formats the target file (or the in-memory
/// string in unformatted_input), and writes or echoes the result.
/// </summary>
/// <param name="args">command-line arguments; see usage string below</param>
public static void Main(string[] args)
{
	if (args.Length < 7)
	{
		Console.Error.WriteLine("org.antlr.codebuff.Tool -g grammar-name -rule start-rule -corpus root-dir-of-samples \\\n" + " [-files file-extension] [-indent num-spaces] \\" + " [-comment line-comment-name] [-o output-file] file-to-format");
		return;
	}
	formatted_output = null;
	string outputFileName = "";
	string grammarName = null;
	string startRule = null;
	string corpusDir = null;
	string indentS = "4";
	string commentS = null;
	string input_file_name = null;
	string fileExtension = null;
	int i = 0;
	// consume "-flag value" pairs until the first non-flag argument
	while (i < args.Length && args[i].StartsWith("-", StringComparison.Ordinal))
	{
		switch (args[i])
		{
		case "-g": i++; grammarName = args[i++]; break;
		case "-rule": i++; startRule = args[i++]; break;
		case "-corpus": i++; corpusDir = args[i++]; break;
		case "-files": i++; fileExtension = args[i++]; break;
		case "-indent": i++; indentS = args[i++]; break;
		case "-comment": i++; commentS = args[i++]; break;
		case "-o": i++; outputFileName = args[i++]; break;
		case "-inoutstring": i++; formatted_output = ""; outputFileName = null; break;
		}
	}
	input_file_name = args[i]; // must be last
	Console.WriteLine("gramm: " + grammarName);
	// generated classes follow the ANTLR naming convention <Grammar>Parser / <Grammar>Lexer
	string parserClassName = grammarName + "Parser";
	string lexerClassName = grammarName + "Lexer";
	Type parserClass = null;
	Type lexerClass = null;
	Lexer lexer = null;
	try
	{
		parserClass = (Type)Type.GetType(parserClassName);
		lexerClass = (Type)Type.GetType(lexerClassName);
	}
	catch (Exception e)
	{
		Console.Error.WriteLine("Can't load " + parserClassName + " or maybe " + lexerClassName);
		Console.Error.WriteLine("Make sure they are generated by ANTLR, compiled, and in CLASSPATH");
		Console.Error.WriteLine(e.StackTrace); // FIX: stack trace now goes to stderr with the other diagnostics
	}
	if (parserClass == null || lexerClass == null) // FIX: was bitwise '|' on bools
	{
		return; // don't return from catch!
	}
	int indentSize = int.Parse(indentS);
	// map the single-line-comment token name (if given) to its token type
	int singleLineCommentType = -1;
	if (!string.ReferenceEquals(commentS, null))
	{
		try
		{
			lexer = getLexer(lexerClass, null);
		}
		catch (Exception e)
		{
			Console.Error.WriteLine("Can't instantiate lexer " + lexerClassName);
			Console.Error.WriteLine(e.StackTrace); // FIX: stderr for consistency
		}
		if (lexer == null)
		{
			return;
		}
		IDictionary <string, int> tokenTypeMap = lexer.TokenTypeMap;
		if (tokenTypeMap.ContainsKey(commentS))
		{
			singleLineCommentType = tokenTypeMap[commentS];
		}
	}
	string fileRegex = null;
	if (!string.ReferenceEquals(fileExtension, null))
	{
		fileRegex = ".*\\." + fileExtension;
	}
	LangDescriptor language = new LangDescriptor(grammarName, corpusDir, fileRegex, lexerClass, parserClass, startRule, indentSize, singleLineCommentType);

	////////
	// load all corpus files up front
	IList <string> allFiles = getFilenames(language.corpusDir, language.fileRegex);
	IList <InputDocument> documents = load(allFiles, language);

	// Handle formatting of document if it's passed as a string or not.
	if (unformatted_input == null)
	{
		// Don't include file to format in corpus itself.
		string path = System.IO.Path.GetFullPath(input_file_name);
		IList <InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
		// Perform training of formatter.
		Corpus corpus = new Corpus(others, language);
		corpus.train();
		// Parse code contained in file.
		InputDocument unformatted_document = parse(input_file_name, language);
		// Format document.
		Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS);
		formatted_output = formatter.format(unformatted_document, false);
	}
	else
	{
		// Perform training of formatter.
		Corpus corpus = new Corpus(documents, language);
		corpus.train();
		// Parse code that was represented as a string.
		InputDocument unformatted_document = parse(input_file_name, unformatted_input, language);
		// Format document.
		Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS);
		formatted_output = formatter.format(unformatted_document, false);
	}
	///////
	// empty name (set by default, or -inoutstring) => echo to stdout; non-empty => write file
	if (outputFileName != null && outputFileName == "")
	{
		System.Console.WriteLine(formatted_output);
	}
	else if (!string.IsNullOrEmpty(outputFileName))
	{
		org.antlr.codebuff.misc.Utils.writeFile(outputFileName, formatted_output);
	}
}