public void TestSingleNumeric()
        {
            // Splits an identifier that ends in a single digit.
            var subtokens = SubtokenSplitter.SplitSubtokens("str0");

            // Expected value first, actual second, so that a failing assertion
            // reports "expected"/"actual" the right way round.
            Assert.AreEqual(2, subtokens.Length);
            Assert.AreEqual("str", subtokens[0]);
            Assert.AreEqual("0", subtokens[1]);
        }
        public void TestWithUnderscores()
        {
            // Upper-case, underscore-separated identifier; the expected subtokens are lower-case.
            var subtokens = SubtokenSplitter.SplitSubtokens("SOME_WEIRD_NAME");

            // Expected value first, actual second, so failure messages read correctly.
            Assert.AreEqual("some", subtokens[0]);
            Assert.AreEqual("weird", subtokens[1]);
            Assert.AreEqual("name", subtokens[2]);
        }
        public void TestWithContigiousCapitalization()
        {
            // Identifier containing a run of capitals ("AST") between ordinary camel-case words.
            var subtokens = SubtokenSplitter.SplitSubtokens("ThisASTIsBlue");

            // Expected value first, actual second, so failure messages read correctly.
            Assert.AreEqual("this", subtokens[0]);
            Assert.AreEqual("ast", subtokens[1]);
            Assert.AreEqual("is", subtokens[2]);
            Assert.AreEqual("blue", subtokens[3]);
        }
        public void TestSimpleSplit()
        {
            // Plain camel-case identifier with a trailing digit.
            var subtokens = SubtokenSplitter.SplitSubtokens("someSimpleTest1");

            // Expected value first, actual second, so failure messages read correctly.
            Assert.AreEqual("some", subtokens[0]);
            Assert.AreEqual("simple", subtokens[1]);
            Assert.AreEqual("test", subtokens[2]);
            Assert.AreEqual("1", subtokens[3]);
        }
Beispiel #5
0
        /// <summary>
        /// Builds the lattice from the collected constraints, keeping only relationship keys
        /// and related nodes whose <c>Type</c> is contained in <paramref name="types"/>.
        /// Node names are split with <c>SubtokenSplitter.SplitSubtokens</c>.
        /// </summary>
        /// <param name="types">Type names used to filter relationship keys and their related nodes.</param>
        /// <param name="collectedConstraints">
        /// Constraints whose <c>AllRelationships</c> populate the lattice.
        /// <c>RemoveSelfLinks()</c> is invoked on this instance before it is read.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="collectedConstraints"/> is null.</exception>
        public ClusteringExtractor(HashSet<string> types, TypeConstraints collectedConstraints)
        {
            if (collectedConstraints == null)
            {
                throw new ArgumentNullException(nameof(collectedConstraints));
            }

            collectedConstraints.RemoveSelfLinks();

            _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 10);
            _lattice = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

            // TODO: Allow external choice of splitting method.
            // Only subtokenSplitting is wired into the lattice below; the remaining strategies
            // are kept as ready-made alternatives until the choice becomes configurable.
            Func<AbstractNode, string[]> subtokenSplitting = n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

            // Identifiers are not linguistic text, so lower-casing is culture-invariant.
            Func<AbstractNode, string[]> charSplitting = n => n.Name.ToLowerInvariant().Select(ch => ch.ToString()).ToArray();

            Func<AbstractNode, string[]> bigramSplitting = n =>
            {
                // Enumerable.Range rejects a negative count, so the empty name is handled up front.
                if (n.Name.Length == 0)
                {
                    return new string[] { "" };
                }

                var name = n.Name.ToLowerInvariant();
                return Enumerable.Range(0, name.Length - 1).Select(i => name.Substring(i, 2)).ToArray();
            };

            Func<AbstractNode, string[]> subtokenBigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Clamp the count so an empty subtoken yields no bigrams instead of
                    // making Enumerable.Range throw on a negative count.
                    return Enumerable.Range(0, Math.Max(0, sub.Length - 1)).Select(i => sub.Substring(i, 2));
                }).ToArray();
            };

            Func<AbstractNode, string[]> subtokenTrigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Subtokens shorter than a trigram are emitted whole.
                    if (sub.Length < 3)
                    {
                        return new string[] { sub };
                    }

                    return Enumerable.Range(0, sub.Length - 2).Select(i => sub.Substring(i, 3));
                }).ToArray();
            };

            Func<AbstractNode, string[]> trigramAndSubtokenSplitting = n => subtokenSplitting(n).Concat(subtokenTrigramSplitting(n)).ToArray();

            // Keep only relationships whose key type is selected, and within each of those
            // only the related nodes whose type is selected as well.
            var selectedRelationships = collectedConstraints.AllRelationships
                .Where(kv => types.Contains(kv.Key.Type))
                .ToDictionary(
                    kv => kv.Key,
                    kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type))));

            _nodeMap = _lattice.Add(selectedRelationships, s => new NodeName(subtokenSplitting(s)));
            NumRelationships = _lattice.NumRelationships;
        }
Beispiel #6
0
        /// <summary>
        /// Builds the lattice from the collected constraints. Node names are split with
        /// <c>SubtokenSplitter.SplitSubtokens</c>.
        /// </summary>
        /// <param name="types">
        /// Type names used to filter relationship keys and their related nodes.
        /// Only read when <paramref name="UDTSpecificAnalysis"/> is false.
        /// </param>
        /// <param name="collectedConstraints">
        /// Constraints whose <c>AllRelationships</c> populate the lattice.
        /// <c>RemoveSelfLinks()</c> is invoked on this instance before it is read.
        /// </param>
        /// <param name="maxNumTypes">Not referenced inside this constructor body.</param>
        /// <param name="UDTSpecificAnalysis">
        /// When true, the relationships are taken from <c>GetTypeSpecificRelations</c> for
        /// <paramref name="t"/> instead of being filtered by <paramref name="types"/>.
        /// </param>
        /// <param name="t">
        /// Type symbol forwarded to <c>GetTypeSpecificRelations</c>; only read when
        /// <paramref name="UDTSpecificAnalysis"/> is true.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="collectedConstraints"/> is null.</exception>
        public SubtypeMiner(HashSet<string> types, TypeConstraints collectedConstraints, int maxNumTypes, bool UDTSpecificAnalysis = false, ITypeSymbol t = null)
        {
            if (collectedConstraints == null)
            {
                throw new ArgumentNullException(nameof(collectedConstraints));
            }

            collectedConstraints.RemoveSelfLinks();

            _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 2);
            _lattice = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

            // TODO: Allow external choice of splitting type.
            // Only subtokenSplitting is wired into the lattice below; the remaining strategies
            // are kept as ready-made alternatives until the choice becomes configurable.
            Func<AbstractNode, string[]> subtokenSplitting = n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

            // Identifiers are not linguistic text, so lower-casing is culture-invariant.
            Func<AbstractNode, string[]> charSplitting = n => n.Name.ToLowerInvariant().Select(ch => ch.ToString()).ToArray();

            Func<AbstractNode, string[]> bigramSplitting = n =>
            {
                // Enumerable.Range rejects a negative count, so the empty name is handled up front.
                if (n.Name.Length == 0)
                {
                    return new string[] { "" };
                }

                var name = n.Name.ToLowerInvariant();
                return Enumerable.Range(0, name.Length - 1).Select(i => name.Substring(i, 2)).ToArray();
            };

            Func<AbstractNode, string[]> subtokenBigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Clamp the count so an empty subtoken yields no bigrams instead of
                    // making Enumerable.Range throw on a negative count.
                    return Enumerable.Range(0, Math.Max(0, sub.Length - 1)).Select(i => sub.Substring(i, 2));
                }).ToArray();
            };

            Func<AbstractNode, string[]> subtokenTrigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Subtokens shorter than a trigram are emitted whole.
                    if (sub.Length < 3)
                    {
                        return new string[] { sub };
                    }

                    return Enumerable.Range(0, sub.Length - 2).Select(i => sub.Substring(i, 3));
                }).ToArray();
            };

            Func<AbstractNode, string[]> trigramAndSubtokenSplitting = n => subtokenSplitting(n).Concat(subtokenTrigramSplitting(n)).ToArray();

            if (UDTSpecificAnalysis)
            {
                // Use the relations selected by GetTypeSpecificRelations for t, unfiltered by `types`.
                var symbols = GetTypeSpecificRelations(collectedConstraints.AllRelationships, t);

                _nodeMap = _lattice.Add(
                    symbols.ToDictionary(
                        kv => kv.Key,
                        kv => new HashSet<AbstractNode>(kv.Value)),
                    s => new NodeName(subtokenSplitting(s)));
            }
            else
            {
                // Keep only relationships whose key type is selected, and within each of those
                // only the related nodes whose type is selected as well.
                var selectedRelationships = collectedConstraints.AllRelationships
                    .Where(kv => types.Contains(kv.Key.Type))
                    .ToDictionary(
                        kv => kv.Key,
                        kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type))));

                _nodeMap = _lattice.Add(selectedRelationships, s => new NodeName(subtokenSplitting(s)));
            }

            NumRelationships = _lattice.NumRelationships;
        }
        public void TestEmpty()
        {
            // The empty identifier is expected to produce no subtokens at all.
            var subtokens = SubtokenSplitter.SplitSubtokens("");

            // Expected value first, actual second, so failure messages read correctly.
            Assert.AreEqual(0, subtokens.Length);
        }