Esempio n. 1
0
        /// <summary>
        /// Builds the modification that splits one type into two subtypes at
        /// <paramref name="node"/>: the transitive children closure of the node
        /// (as computed by <c>GetTransitiveChildrenClosure</c>) is separated
        /// from the remaining nodes of the type.
        /// </summary>
        /// <typeparam name="T">Payload type of the lattice nodes.</typeparam>
        /// <param name="nodesInType">All nodes of the type that is being split.</param>
        /// <param name="typeParents">The parent nodes recorded for the type being split.</param>
        /// <param name="node">The node to split on; it always becomes a parent of the new subtype.</param>
        /// <returns>
        /// The split modification, or <c>null</c> when the transitive parent
        /// closure of the remaining nodes reaches into the split-off nodes
        /// (i.e. the split would cut a cycle).
        /// </returns>
        public static SublatticeModification <T> SplitOn <T>(HashSet <LatticeNode <T> > nodesInType, HashSet <LatticeNode <T> > typeParents, LatticeNode <T> node)
        {
            // Not enforced here: a disabled assertion once required typeParents to be a subset of nodesInType.
            HashSet<LatticeNode<T>> splitOff = AbstractColoredLattice<T>.GetTransitiveChildrenClosure(nodesInType, node);

            // The two node clusters that exist after the split.
            var remainingNodes = new HashSet<LatticeNode<T>>(nodesInType.Except(splitOff));
            var splitOffNodes  = new HashSet<LatticeNode<T>>(splitOff);

            // Parents of the remaining cluster: the old parents that were not split off.
            var remainingParents = new HashSet<LatticeNode<T>>(typeParents.Except(splitOff));

            // Parents of the new cluster: split-off nodes that were already parents
            // or that have at least one parent outside the split-off set, plus the
            // split node itself.
            var splitOffParents = new HashSet<LatticeNode<T>>(
                splitOff.Where(candidate => typeParents.Contains(candidate) ||
                                            !candidate.Parents.All(p => splitOff.Contains(p))));
            splitOffParents.Add(node);

            var modification = new SublatticeModification<T>();
            modification.Type         = ModificationType.Split;
            modification.Data         = node;
            modification.Before       = new List<HashSet<LatticeNode<T>>> { nodesInType };
            modification.BeforeParent = new List<HashSet<LatticeNode<T>>> { typeParents };
            modification.After        = new List<HashSet<LatticeNode<T>>> { remainingNodes, splitOffNodes };
            modification.AfterParents = new List<HashSet<LatticeNode<T>>> { remainingParents, splitOffParents };

            // Sanity checks: the clusters are disjoint and each parent set lies inside its own cluster.
            Debug.Assert(!remainingNodes.Intersect(splitOffNodes).Any());
            Debug.Assert(!remainingParents.Except(remainingNodes).Any());
            Debug.Assert(!splitOffParents.Except(splitOffNodes).Any());

            // Does this split a cycle? It does if any ancestor of the remaining
            // cluster ended up in the split-off cluster.
            var ancestorsOfRemaining = AbstractColoredLattice<T>.GetTransitiveParentClosure(remainingNodes);
            ancestorsOfRemaining.IntersectWith(splitOffNodes);

            return ancestorsOfRemaining.Count == 0 ? modification : null;
        }
Esempio n. 2
0
        /// <summary>
        /// Creates the extractor: calls <c>RemoveSelfLinks()</c> on the given
        /// constraints, builds the colored lattice from the relationships whose
        /// node types are listed in <paramref name="types"/>, and records the
        /// resulting number of relationships.
        /// </summary>
        /// <param name="types">Type names to keep; relationships whose key or related node has another type are dropped.</param>
        /// <param name="collectedConstraints">The collected constraints; <c>RemoveSelfLinks()</c> is invoked on this instance.</param>
        public ClusteringExtractor(HashSet <string> types, TypeConstraints collectedConstraints)
        {
            collectedConstraints.RemoveSelfLinks();

            _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 10);
            _lattice    = new SuperGreedySplitingVIColoredLattice <NodeName>(_ciComputer);

            // TODO: Allow external choice of splitting method
            // Only subtokenSplitting is wired in below; the other splitters are
            // alternative strategies kept ready for that choice.
            Func <AbstractNode, string[]> subtokenSplitting = n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

            // Invariant lower-casing keeps the produced tokens independent of the current culture.
            Func <AbstractNode, string[]> charSplitting = n => n.Name.ToLowerInvariant().Select(ch => ch.ToString()).ToArray();

            Func <AbstractNode, string[]> bigramSplitting = n =>
            {
                if (n.Name.Length == 0)
                {
                    return new string[] { "" };
                }

                var name = n.Name.ToLowerInvariant();
                return Enumerable.Range(0, name.Length - 1).Select(i => name.Substring(i, 2)).ToArray();
            };

            Func <AbstractNode, string[]> subtokenBigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Enumerable.Range rejects a negative count, so clamp for an empty subtoken.
                    return Enumerable.Range(0, Math.Max(0, sub.Length - 1)).Select(i => sub.Substring(i, 2));
                }).ToArray();
            };

            Func <AbstractNode, string[]> subtokenTrigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Subtokens too short for a trigram are emitted unchanged.
                    if (sub.Length < 3)
                    {
                        return new string[] { sub };
                    }

                    return Enumerable.Range(0, sub.Length - 2).Select(i => sub.Substring(i, 3));
                }).ToArray();
            };

            Func <AbstractNode, string[]> trigramAndSubtokenSplitting = n => subtokenSplitting(n).Concat(subtokenTrigramSplitting(n)).ToArray();

            // Keep only relationships whose key type is requested, and within each
            // of them only the related nodes whose type is requested as well.
            var relationships = collectedConstraints.AllRelationships
                                .Where(kv => types.Contains(kv.Key.Type))
                                .ToDictionary(kv => kv.Key,
                                              kv => new HashSet <AbstractNode>(kv.Value.Where(n => types.Contains(n.Type))));

            _nodeMap         = _lattice.Add(relationships, s => new NodeName(subtokenSplitting(s)));
            NumRelationships = _lattice.NumRelationships;
        }
Esempio n. 3
0
        /// <summary>
        /// Infers a coloring of the lattice, prints its score to the console and
        /// converts the result back to <see cref="AbstractNode"/> clusters.
        /// As a side effect, one entry per color (keyed by the color's index) is
        /// added to <c>AncestorMap</c>, holding the indices of that color's
        /// ancestor clusters.
        /// </summary>
        /// <returns>
        /// <c>Clusters</c>: one set of nodes per color, in coloring order.
        /// <c>ClusterParents</c>: for each color, the indices of the colors that
        /// contain a parent of one of its nodes (parents inside the color itself
        /// are ignored).
        /// </returns>
        public (List <HashSet <AbstractNode> > Clusters, List <HashSet <int> > ClusterParents) InferColors()
        {
            var coloring = _lattice.InferColoring(out double score);

            Console.WriteLine($"Color inference completed. Score: {score}");

            // Convert to AbstractNode
            var inverseMap  = _nodeMap.ToDictionary(kv => kv.Value, kv => kv.Key);
            var colorGroups = coloring.Select(g => new HashSet <AbstractNode>(g.Select(n => inverseMap[n]))).ToList();

            // Index every node by the first color that contains it, so a parent's
            // color is a single lookup instead of a scan over all colors.
            var colorIndexOfNode = new Dictionary <LatticeNode <NodeName>, int>();
            for (int i = 0; i < coloring.Count; i++)
            {
                foreach (var member in coloring[i])
                {
                    if (!colorIndexOfNode.ContainsKey(member))
                    {
                        colorIndexOfNode.Add(member, i);
                    }
                }
            }

            var parents = new List <HashSet <int> >();
            int colorID = 0;

            foreach (var color in coloring)
            {
                var nodesInColor = new HashSet <LatticeNode <NodeName> >(color);

                // Parents of this color's nodes that lie outside the color.
                var parentNodes    = new HashSet <LatticeNode <NodeName> >(color.SelectMany(n => n.Parents).Where(n => !nodesInColor.Contains(n)));
                var parentColorIds = new HashSet <int>();
                foreach (var parentNode in parentNodes)
                {
                    // A parent that belongs to no color contributes nothing.
                    if (colorIndexOfNode.TryGetValue(parentNode, out int parentColorId))
                    {
                        parentColorIds.Add(parentColorId);
                    }
                }
                parents.Add(parentColorIds);

                var ancestors   = AbstractColoredLattice <NodeName> .GetClusterAncestors(color, coloring);
                var ancestorIDs = ancestors.Select(i => coloring.IndexOf(i)).ToList();

                // NOTE(review): if AncestorMap is a dictionary that is not reset between
                // calls, a second InferColors() call would fail on duplicate keys - confirm.
                AncestorMap.Add(colorID++, ancestorIDs);
            }
            return(colorGroups, parents);
        }
Esempio n. 4
0
        /// <summary>
        /// Creates the miner: calls <c>RemoveSelfLinks()</c> on the given
        /// constraints, builds the colored lattice from the selected
        /// relationships, and records the resulting number of relationships.
        /// </summary>
        /// <param name="types">
        /// Type names to keep. Only consulted when <paramref name="UDTSpecificAnalysis"/>
        /// is <c>false</c>: relationships whose key or related node has another type are dropped.
        /// </param>
        /// <param name="collectedConstraints">The collected constraints; <c>RemoveSelfLinks()</c> is invoked on this instance.</param>
        /// <param name="maxNumTypes">Not read by this constructor.</param>
        /// <param name="UDTSpecificAnalysis">
        /// When <c>true</c>, the relationships are selected through
        /// <c>GetTypeSpecificRelations</c> for <paramref name="t"/> instead of
        /// being filtered by <paramref name="types"/>.
        /// </param>
        /// <param name="t">
        /// The type symbol handed to <c>GetTypeSpecificRelations</c>; only used when
        /// <paramref name="UDTSpecificAnalysis"/> is <c>true</c>.
        /// </param>
        public SubtypeMiner(HashSet <string> types, TypeConstraints collectedConstraints, int maxNumTypes, bool UDTSpecificAnalysis = false, ITypeSymbol t = null)
        {
            collectedConstraints.RemoveSelfLinks();

            _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 2);
            _lattice    = new SuperGreedySplitingVIColoredLattice <NodeName>(_ciComputer);

            // TODO: Allow external choice of splitting type
            // Only subtokenSplitting is wired in below; the other splitters are
            // alternative strategies kept ready for that choice.
            Func <AbstractNode, string[]> subtokenSplitting = n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

            // Invariant lower-casing keeps the produced tokens independent of the current culture.
            Func <AbstractNode, string[]> charSplitting = n => n.Name.ToLowerInvariant().Select(ch => ch.ToString()).ToArray();

            Func <AbstractNode, string[]> bigramSplitting = n =>
            {
                if (n.Name.Length == 0)
                {
                    return new string[] { "" };
                }

                var name = n.Name.ToLowerInvariant();
                return Enumerable.Range(0, name.Length - 1).Select(i => name.Substring(i, 2)).ToArray();
            };

            Func <AbstractNode, string[]> subtokenBigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Enumerable.Range rejects a negative count, so clamp for an empty subtoken.
                    return Enumerable.Range(0, Math.Max(0, sub.Length - 1)).Select(i => sub.Substring(i, 2));
                }).ToArray();
            };

            Func <AbstractNode, string[]> subtokenTrigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Subtokens too short for a trigram are emitted unchanged.
                    if (sub.Length < 3)
                    {
                        return new string[] { sub };
                    }

                    return Enumerable.Range(0, sub.Length - 2).Select(i => sub.Substring(i, 3));
                }).ToArray();
            };

            Func <AbstractNode, string[]> trigramAndSubtokenSplitting = n => subtokenSplitting(n).Concat(subtokenTrigramSplitting(n)).ToArray();

            if (UDTSpecificAnalysis)
            {
                // Relationship selection is delegated to GetTypeSpecificRelations for the symbol t.
                var symbols = GetTypeSpecificRelations(collectedConstraints.AllRelationships, t);

                _nodeMap = _lattice.Add(
                    symbols.ToDictionary(
                        kv => kv.Key,
                        kv => new HashSet <AbstractNode>(kv.Value)
                        ),
                    s => new NodeName(subtokenSplitting(s))
                    );
            }
            else
            {
                // Keep only relationships whose key type is requested, and within each
                // of them only the related nodes whose type is requested as well.
                var selKeys = collectedConstraints.AllRelationships.Where(kv => types.Contains(kv.Key.Type));
                _nodeMap = _lattice.Add(selKeys.
                                        ToDictionary(kv => kv.Key, kv => new HashSet <AbstractNode>(kv.Value.Where(n => types.Contains(n.Type)))),
                                        s => new NodeName(subtokenSplitting(s)));
            }
            NumRelationships = _lattice.NumRelationships;
        }