/// <summary>
/// Split on a specific node. This gets a type and splits it into
/// two subtypes: the nodes returned by
/// <c>AbstractColoredLattice.GetTransitiveChildrenClosure(nodesInType, node)</c>
/// become the new (children) subtype, and the remaining nodes of
/// <paramref name="nodesInType"/> form the other (parent) subtype.
/// </summary>
/// <param name="nodesInType">The nodes of the type that is being split.</param>
/// <param name="typeParents">The parent nodes recorded for the type that is being split.</param>
/// <param name="node">The node to split on. It is always recorded as a parent of the new subtype.</param>
/// <returns>
/// A <c>Split</c> modification describing the type before and after the split,
/// or <c>null</c> when the parent closure of the remaining nodes reaches into the
/// split-off nodes, i.e. the split would cut a cycle.
/// </returns>
public static SublatticeModification<T> SplitOn<T>(HashSet<LatticeNode<T>> nodesInType, HashSet<LatticeNode<T>> typeParents, LatticeNode<T> node)
{
    // NOTE: typeParents is intentionally not asserted to be a subset of nodesInType;
    // that assertion was already disabled in the original code.
    HashSet<LatticeNode<T>> newTypeClosure = AbstractColoredLattice<T>.GetTransitiveChildrenClosure(nodesInType, node);

    // The two clusters that result from the split.
    var parentCluster = new HashSet<LatticeNode<T>>(nodesInType.Except(newTypeClosure));
    var childrenCluster = new HashSet<LatticeNode<T>>(newTypeClosure);

    // Parents of the remaining cluster: the old parents that were not moved to the new subtype.
    var parentClusterParents = new HashSet<LatticeNode<T>>(typeParents.Except(newTypeClosure));

    // Parents of the new cluster: moved nodes that were already parents of the type,
    // or that have at least one parent outside the new cluster, plus the split node itself.
    var childrenClusterParents = new HashSet<LatticeNode<T>>(
        newTypeClosure.Where(n => typeParents.Contains(n) || n.Parents.Any(p => !newTypeClosure.Contains(p))));
    childrenClusterParents.Add(node);

    var modification = new SublatticeModification<T>
    {
        Type = ModificationType.Split,
        Data = node,
        Before = new List<HashSet<LatticeNode<T>>> { nodesInType },
        BeforeParent = new List<HashSet<LatticeNode<T>>> { typeParents },
        After = new List<HashSet<LatticeNode<T>>> { parentCluster, childrenCluster },
        AfterParents = new List<HashSet<LatticeNode<T>>> { parentClusterParents, childrenClusterParents }
    };

    // Sanity checks: the clusters are disjoint and each cluster contains its own parents.
    Debug.Assert(!parentCluster.Overlaps(childrenCluster));
    Debug.Assert(parentClusterParents.IsSubsetOf(parentCluster));
    Debug.Assert(childrenClusterParents.IsSubsetOf(childrenCluster));

    // Does this split a cycle?
    var ancestorsOfParentCluster = AbstractColoredLattice<T>.GetTransitiveParentClosure(parentCluster);
    if (ancestorsOfParentCluster.Overlaps(childrenCluster))
    {
        return null; // We are trying to split a cycle
    }

    return modification;
}
/// <summary>
/// Creates the extractor and populates its coloured lattice from the collected constraints.
/// </summary>
/// <param name="types">
/// Type names to keep. A relationship is kept only if its key node's <c>Type</c> is in this
/// set, and within a kept relationship only related nodes whose <c>Type</c> is in this set are kept.
/// </param>
/// <param name="collectedConstraints">
/// The collected constraints. <c>RemoveSelfLinks()</c> is called on this instance before
/// its <c>AllRelationships</c> are read.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="collectedConstraints"/> is <c>null</c>.</exception>
public ClusteringExtractor(HashSet<string> types, TypeConstraints collectedConstraints)
{
    if (collectedConstraints == null)
    {
        throw new ArgumentNullException(nameof(collectedConstraints));
    }

    collectedConstraints.RemoveSelfLinks();
    _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 10);
    _lattice = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

    // TODO: Allow external choice of splitting method (e.g. characters, character n-grams,
    // per-subtoken n-grams) instead of hard-wiring subtoken splitting.
    Func<AbstractNode, string[]> subtokenSplitting = n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

    // Keep only relationships between nodes whose type is one of the requested types.
    _nodeMap = _lattice.Add(
        collectedConstraints.AllRelationships
            .Where(kv => types.Contains(kv.Key.Type))
            .ToDictionary(
                kv => kv.Key,
                kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type)))),
        s => new NodeName(subtokenSplitting(s)));

    NumRelationships = _lattice.NumRelationships;
}
/// <summary>
/// Infers a coloring of the lattice and converts it back to clusters of <c>AbstractNode</c>.
/// </summary>
/// <returns>
/// <c>Clusters</c>: one set of nodes per inferred color, in the order returned by the lattice.
/// <c>ClusterParents</c>: for each color (same order), the indices of the colors that contain
/// a direct parent of one of its nodes, considering only parents that lie outside the color.
/// </returns>
/// <remarks>
/// Side effects: writes the inference score to the console and adds one entry per color
/// (color index to ancestor color indices) to <c>AncestorMap</c>.
/// </remarks>
public (List<HashSet<AbstractNode>> Clusters, List<HashSet<int>> ClusterParents) InferColors()
{
    var coloring = _lattice.InferColoring(out double score);
    Console.WriteLine($"Color infrence completed. Score: {score}");

    // Convert to AbstractNode
    var inverseMap = _nodeMap.ToDictionary(kv => kv.Value, kv => kv.Key);
    var colorGroups = coloring.Select(g => new HashSet<AbstractNode>(g.Select(n => inverseMap[n]))).ToList();

    // Index every node by the first color that contains it, so that a parent's color can be
    // looked up directly instead of scanning all colors for every parent node.
    var colorIndexOfNode = new Dictionary<LatticeNode<NodeName>, int>();
    for (int i = 0; i < coloring.Count; i++)
    {
        foreach (var member in coloring[i])
        {
            if (!colorIndexOfNode.ContainsKey(member))
            {
                colorIndexOfNode.Add(member, i);
            }
        }
    }

    var parents = new List<HashSet<int>>();
    int colorID = 0;
    foreach (var color in coloring)
    {
        var nodesInColor = new HashSet<LatticeNode<NodeName>>(color);

        // Colors of the direct parents that are not themselves part of this color.
        var parentColorIds = new HashSet<int>();
        foreach (var parentNode in color.SelectMany(n => n.Parents))
        {
            if (parentNode == null || nodesInColor.Contains(parentNode))
            {
                continue;
            }

            // Parents that belong to no color contribute nothing.
            if (colorIndexOfNode.TryGetValue(parentNode, out int parentColor))
            {
                parentColorIds.Add(parentColor);
            }
        }
        parents.Add(parentColorIds);

        var ancestors = AbstractColoredLattice<NodeName>.GetClusterAncestors(color, coloring);
        var ancestorIDs = ancestors.Select(i => coloring.IndexOf(i)).ToList();

        // NOTE(review): colorID restarts at 0 on every call; if AncestorMap still holds entries
        // from an earlier call, this Add presumably throws on the duplicate key - confirm that
        // InferColors is only called once per instance.
        AncestorMap.Add(colorID++, ancestorIDs);
    }

    return (colorGroups, parents);
}
/// <summary>
/// Creates the miner and populates its coloured lattice from the collected constraints.
/// </summary>
/// <param name="types">
/// Type names to keep when <paramref name="UDTSpecificAnalysis"/> is <c>false</c>: a relationship
/// is kept only if its key node's <c>Type</c> is in this set, and only related nodes whose
/// <c>Type</c> is in this set are kept. Not read when <paramref name="UDTSpecificAnalysis"/> is <c>true</c>.
/// </param>
/// <param name="collectedConstraints">
/// The collected constraints. <c>RemoveSelfLinks()</c> is called on this instance before
/// its <c>AllRelationships</c> are read.
/// </param>
/// <param name="maxNumTypes">Not used by this constructor.</param>
/// <param name="UDTSpecificAnalysis">
/// When <c>true</c>, the relationships are selected by
/// <c>GetTypeSpecificRelations(collectedConstraints.AllRelationships, t)</c> instead of being
/// filtered by <paramref name="types"/>.
/// </param>
/// <param name="t">
/// The type symbol passed to <c>GetTypeSpecificRelations</c>; only read when
/// <paramref name="UDTSpecificAnalysis"/> is <c>true</c>.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="collectedConstraints"/> is <c>null</c>.</exception>
public SubtypeMiner(HashSet<string> types, TypeConstraints collectedConstraints, int maxNumTypes, bool UDTSpecificAnalysis = false, ITypeSymbol t = null)
{
    if (collectedConstraints == null)
    {
        throw new ArgumentNullException(nameof(collectedConstraints));
    }

    collectedConstraints.RemoveSelfLinks();
    _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 2);
    _lattice = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

    // TODO: Allow external choice of splitting type (e.g. characters, character n-grams,
    // per-subtoken n-grams) instead of hard-wiring subtoken splitting.
    Func<AbstractNode, string[]> subtokenSplitting = n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

    if (UDTSpecificAnalysis)
    {
        // NOTE(review): t defaults to null and is forwarded as-is; confirm that
        // GetTypeSpecificRelations handles a null symbol.
        var symbols = GetTypeSpecificRelations(collectedConstraints.AllRelationships, t);
        _nodeMap = _lattice.Add(
            symbols.ToDictionary(
                kv => kv.Key,
                kv => new HashSet<AbstractNode>(kv.Value)),
            s => new NodeName(subtokenSplitting(s)));
    }
    else
    {
        // Keep only relationships between nodes whose type is one of the requested types.
        var selKeys = collectedConstraints.AllRelationships.Where(kv => types.Contains(kv.Key.Type));
        _nodeMap = _lattice.Add(
            selKeys.ToDictionary(
                kv => kv.Key,
                kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type)))),
            s => new NodeName(subtokenSplitting(s)));
    }

    NumRelationships = _lattice.NumRelationships;
}