Code example #1
File: Formatter.cs Project: mhornsby/cs-codebuff
        public static void wipeCharPositionInfoAndWhitespaceTokens(CodeBuffTokenStream tokens)
        {
            tokens.Fill();
            CommonToken dummy = new CommonToken(TokenConstants.InvalidType, "");

            dummy.Channel = TokenConstants.HiddenChannel;
            Token firstRealToken = tokens.getNextRealToken(-1);
            Regex rex            = new Regex("^\\s+$");             // matches whitespace-only token text; hoisted out of the loop

            for (int i = 0; i < tokens.Size; i++)
            {
                if (i == firstRealToken.TokenIndex)
                {
                    continue;                     // don't whack first token
                }
                CommonToken t = (CommonToken)tokens.Get(i);
                if (rex.IsMatch(t.Text))
                {
                    tokens.GetTokens()[i] = dummy;
                    // whack whitespace token so we can't use it during prediction
                }
                else
                {
                    t.Line   = 0;
                    t.Column = -1;
                }
            }
        }
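
A minimal usage sketch (not from the repository; it assumes the Tool.tokenize helper shown in code example #4, and the input path and lexer type are placeholders): tokenize a document, then erase its whitespace and position information before prediction.

            // Hypothetical driver code; "T.java" and Java8Lexer are placeholders.
            string doc = File.ReadAllText("T.java");
            CodeBuffTokenStream tokens = Tool.tokenize(doc, typeof(Java8Lexer));
            Formatter.wipeCharPositionInfoAndWhitespaceTokens(tokens);
            // Every whitespace token is now the shared invalid-type dummy on the hidden channel,
            // and every remaining token except the first real one has Line = 0, Column = -1.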
Code example #2
File: Dbg.cs Project: mhornsby/cs-codebuff
            public static void Main(string[] args)
            {
                ANTLRv4Lexer      lexer  = new ANTLRv4Lexer(new ANTLRFileStream("grammars/org/antlr/codebuff/ANTLRv4Lexer.g4"));
                CommonTokenStream tokens = new CodeBuffTokenStream(lexer);
                ANTLRv4Parser     parser = new ANTLRv4Parser(tokens);

                ANTLRv4Parser.GrammarSpecContext tree = parser.grammarSpec();
                Console.WriteLine(tree.ToStringTree(parser));
            }
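
A small hedged extension of the same Main (not part of the repository code): check the parser's error count before printing, since the LISP-style tree from ToStringTree is only meaningful when the grammar parsed cleanly.

                // ... after parser.grammarSpec() and before printing the tree:
                if (parser.NumberOfSyntaxErrors > 0)
                {
                    Console.Error.WriteLine("grammar did not parse cleanly");
                    return;
                }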
Code example #3
File: Formatter.cs Project: mhornsby/cs-codebuff
        /// <summary>
        /// Format the document. Does not affect/alter doc. </summary>
        public virtual string format(InputDocument doc, bool collectAnalysis)
        {
            if (testDoc != null)
            {
                throw new System.ArgumentException("can't call format() more than once");
            }
            // for debugging we need a map from original token with actual line:col to tree node. used by token analysis
            originalDoc            = doc;
            originalTokenToNodeMap = Trainer.indexTree(doc.tree);
            originalTokens         = doc.tokens;

            this.testDoc    = InputDocument.dup(doc);          // make copy of doc, getting new tokens, tree
            output          = new StringBuilder();
            this.realTokens = Trainer.getRealTokens(testDoc.tokens);
            // squeeze out ws and kill any line/col info so we can't use ground truth by mistake
            wipeCharPositionInfoAndWhitespaceTokens(testDoc.tokens);             // all except for first token
            wsClassifier   = new kNNClassifier(corpus, wsFeatures, corpus.injectWhitespace);
            hposClassifier = new kNNClassifier(corpus, hposFeatures, corpus.hpos);

            analysis = new ArrayList <TokenPositionAnalysis>(testDoc.tokens.Size);
            for (int i = 0; i < testDoc.tokens.Size; ++i)
            {
                analysis.Add(null);
            }

            // make an index on the duplicated doc tree with tokens missing line:col info
            if (tokenToNodeMap == null)
            {
                tokenToNodeMap = Trainer.indexTree(testDoc.tree);
            }

            IToken firstToken = testDoc.tokens.getNextRealToken(-1);

            string prefix = originalTokens.GetText(Interval.Of(0, firstToken.TokenIndex)); // gets any comments in front + first real token

            charPosInLine = firstToken.Column + firstToken.Text.Length + 1;                // start where first token left off
            line          = Tool.count(prefix, '\n') + 1;
            output.Append(prefix);

            // first identify oversize lists with separators
            IdentifyOversizeLists splitter = new IdentifyOversizeLists(corpus, testDoc.tokens, tokenToNodeMap);

            ParseTreeWalker.Default.Walk(splitter, testDoc.tree);
            tokenToListInfo = splitter.tokenToListInfo;

            realTokens = Trainer.getRealTokens(testDoc.tokens);
            for (int i = Trainer.ANALYSIS_START_TOKEN_INDEX; i < realTokens.Count; i++)
            {             // can't process first token
                int tokenIndexInStream = realTokens[i].TokenIndex;
                processToken(i, tokenIndexInStream, collectAnalysis);
            }

            releaseMemory();

            return(output.ToString());
        }
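
A hypothetical calling sketch (the Formatter constructor signature and the corpus/indentSize variables are assumptions based on the upstream Java project; only format() and releaseMemory() appear in these examples):

            Formatter formatter = new Formatter(corpus, indentSize);   // constructor arguments are assumed
            string formatted = formatter.format(testDocument, false);  // false = skip per-token analysis collection
            formatter.releaseMemory();                                  // drop corpus/classifiers; analysis and testDoc remain (see code example #5)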
Code example #4
        public static CodeBuffTokenStream tokenize(string doc, Type lexerClass)
        {
            ANTLRInputStream input = new ANTLRInputStream(doc);
            Lexer            lexer = getLexer(lexerClass, input);

            CodeBuffTokenStream tokens = new CodeBuffTokenStream(lexer);

            tokens.Fill();
            return(tokens);
        }
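
The getLexer helper used above is not shown in these examples; a plausible sketch (an assumption, not the project's actual implementation) is reflective instantiation via the single ICharStream constructor that generated ANTLR lexers expose:

        public static Lexer getLexer(Type lexerClass, ANTLRInputStream input)
        {
            // Hypothetical implementation: rely on the generated lexer's (ICharStream) constructor.
            return (Lexer)Activator.CreateInstance(lexerClass, input);
        }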
Code example #5
File: Formatter.cs Project: mhornsby/cs-codebuff
 /// <summary>
 /// Free anything we can to reduce memory footprint after a format().
 ///  keep analysis, testDoc as they are used for results.
 /// </summary>
 public virtual void releaseMemory()
 {
     corpus                 = null;
     realTokens             = null;
     originalTokens         = null;
     tokenToNodeMap         = null;
     originalTokenToNodeMap = null;
     tokenToListInfo        = null;
     wsClassifier           = null;
     hposClassifier         = null;
 }
Code example #6
File: Dbg.cs Project: mhornsby/cs-codebuff
        /// <summary>
        /// Compute a document difference metric 0-1.0 between two documents that
        ///  are identical other than (likely) the whitespace and comments.
        ///
        ///  1.0 means the docs are maximally different and 0 means docs are identical.
        ///
        ///  The Levenshtein distance between the docs counts only
        ///  whitespace diffs as the non-WS content is identical.
        ///  Levenshtein distance is bounded by 0..max(len(doc1),len(doc2)) so
        ///  we normalize the distance by dividing by max WS count.
        ///
        ///  TODO: can we simplify this to a simple walk with two
        ///  cursors through the original vs formatted counting
        ///  mismatched whitespace? real text are like anchors.
        /// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public static double docDiff(String original, String formatted, Class lexerClass) throws Exception
        public static double docDiff(string original, string formatted, Type lexerClass)
        {
            // Grammar must strip all but real tokens and whitespace (and put that on hidden channel)
            CodeBuffTokenStream original_tokens = Tool.tokenize(original, lexerClass);
            //		String s = original_tokens.getText();
            CodeBuffTokenStream formatted_tokens = Tool.tokenize(formatted, lexerClass);
            //		String t = formatted_tokens.getText();

            // walk token streams and examine whitespace in between tokens
            int i            = 1;              // LT() is 1-based; starting below 1 would end the walk immediately
            int ws_distance  = 0;
            int original_ws  = 0;
            int formatted_ws = 0;

            while (true)
            {
                Token ot = original_tokens.LT(i);                 // TODO: FIX THIS! can't use LT()
                if (ot == null || ot.Type == TokenConstants.EOF)
                {
                    break;
                }
                IList <Token> ows = original_tokens.GetHiddenTokensToLeft(ot.TokenIndex);
                original_ws += tokenText(ows).Length;

                Token ft = formatted_tokens.LT(i);                 // TODO: FIX THIS! can't use LT()
                if (ft == null || ft.Type == TokenConstants.EOF)
                {
                    break;
                }
                IList <Token> fws = formatted_tokens.GetHiddenTokensToLeft(ft.TokenIndex);
                formatted_ws += tokenText(fws).Length;

                ws_distance += whitespaceEditDistance(tokenText(ows), tokenText(fws));
                i++;
            }
            // it's probably ok to ignore ws diffs after last real token

            int    max_ws = Math.Max(original_ws, formatted_ws);
            double normalized_ws_distance = ((float)ws_distance) / max_ws;

            return(normalized_ws_distance);
        }
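
A worked example with made-up numbers: if the original document contains 40 characters of whitespace, the formatted document contains 50, and the per-gap whitespace edit distances sum to 10, then docDiff returns 10 / max(40, 50) = 0.2; identical whitespace in every gap gives 0.0.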
Code example #7
 // reuse object so the maps above fill from multiple files during training
 public virtual void setTokens(CodeBuffTokenStream tokens, ParserRuleContext root, IDictionary <Token, TerminalNode> tokenToNodeMap)
 {
     this.tokens         = tokens;
     this.tokenToNodeMap = tokenToNodeMap;
 }
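
A hypothetical training-loop sketch of the reuse the comment describes (the listener type name and the documents collection are assumptions; indexTree and Walk appear in code example #3): one listener instance is re-pointed at each file's tokens and tree so its internal maps accumulate across the corpus.

     var collector = new CollectSiblingLists();                  // assumed listener type from the upstream project
     foreach (InputDocument d in documents)
     {
         collector.setTokens(d.tokens, d.tree, Trainer.indexTree(d.tree));
         ParseTreeWalker.Default.Walk(collector, d.tree);
     }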
Code example #8
        /// <summary>
        /// Return map for the various tokens related to this list re list membership </summary>
        public static IDictionary <Token, org.antlr.codebuff.misc.Pair <bool, int> > getInfoAboutListTokens <T1>(ParserRuleContext ctx, CodeBuffTokenStream tokens, IDictionary <Token, TerminalNode> tokenToNodeMap, IList <T1> siblings, bool isOversizeList) where T1 : Antlr4.Runtime.ParserRuleContext
        {
            IDictionary <Token, org.antlr.codebuff.misc.Pair <bool, int> > tokenToListInfo = new Dictionary <Token, org.antlr.codebuff.misc.Pair <bool, int> >();

            ParserRuleContext first = siblings[0] as ParserRuleContext;
            ParserRuleContext last  = siblings[siblings.Count - 1] as ParserRuleContext;

            Token prefixToken = tokens.getPreviousRealToken(first.Start.TokenIndex);       // e.g., '(' in an arg list or ':' in grammar def
            Token suffixToken = tokens.getNextRealToken(last.Stop.TokenIndex);             // e.g., LT(1) is last token of list; LT(2) is ')' in an arg list or ';' in grammar def

            // Use TryGetValue so a token with no tree node yields null (Java Map.get semantics) instead of throwing
            TerminalNode prefixNode;
            TerminalNode suffixNode;
            tokenToNodeMap.TryGetValue(prefixToken, out prefixNode);
            tokenToNodeMap.TryGetValue(suffixToken, out suffixNode);
            bool hasSurroundingTokens = prefixNode != null && suffixNode != null && prefixNode.Parent == suffixNode.Parent;

            if (hasSurroundingTokens)
            {
                tokenToListInfo[prefixToken] = new org.antlr.codebuff.misc.Pair <bool, int>(isOversizeList, Trainer.LIST_PREFIX);
                tokenToListInfo[suffixToken] = new org.antlr.codebuff.misc.Pair <bool, int>(isOversizeList, Trainer.LIST_SUFFIX);
            }

            IList <Tree> separators = getSeparators(ctx, siblings);
            Tree         firstSep   = separators[0];

            tokenToListInfo[(Token)firstSep.Payload] = new org.antlr.codebuff.misc.Pair <bool, int>(isOversizeList, Trainer.LIST_FIRST_SEPARATOR);
            foreach (Tree s in separators.Where((e, i) => i > 0 && i < separators.Count))
            {
                tokenToListInfo[(Token)s.Payload] = new org.antlr.codebuff.misc.Pair <bool, int>(isOversizeList, Trainer.LIST_SEPARATOR);
            }

            // handle sibling members
            tokenToListInfo[first.Start] = new org.antlr.codebuff.misc.Pair <bool, int>(isOversizeList, Trainer.LIST_FIRST_ELEMENT);
            foreach (T1 ss in siblings.Where((e, i) => i > 0 && i < siblings.Count))
            {
                var s = ss as ParserRuleContext;
                tokenToListInfo[s.Start] = new org.antlr.codebuff.misc.Pair <bool, int>(isOversizeList, Trainer.LIST_MEMBER);
            }

            return(tokenToListInfo);
        }
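
A concrete reading (illustrative only) for an argument list such as f(a, b, c): the '(' before the first sibling gets LIST_PREFIX and the ')' after the last gets LIST_SUFFIX (they share a parent, so hasSurroundingTokens is true); the first ',' gets LIST_FIRST_SEPARATOR and the second ',' gets LIST_SEPARATOR; a's start token gets LIST_FIRST_ELEMENT while b and c get LIST_MEMBER, each paired with the isOversizeList flag.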
Code example #9
 public IdentifyOversizeLists(Corpus corpus, CodeBuffTokenStream tokens, IDictionary <Token, TerminalNode> tokenToNodeMap)
 {
     this.corpus         = corpus;
     this.tokens         = tokens;
     this.tokenToNodeMap = tokenToNodeMap;
 }