/// <summary>
/// Builds the clustering lattice from the collected type constraints.
/// Only relationships whose key node and related nodes have a type contained in
/// <paramref name="types"/> are added to the lattice; every node is represented by
/// a <c>NodeName</c> built from the subtokens of its name.
/// </summary>
/// <param name="types">Node types to keep; nodes of any other type are filtered out.</param>
/// <param name="collectedConstraints">
/// Source of the relationships. <c>RemoveSelfLinks()</c> is invoked on it before the
/// relationships are read.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="types"/> or <paramref name="collectedConstraints"/> is <c>null</c>.
/// </exception>
public ClusteringExtractor(HashSet<string> types, TypeConstraints collectedConstraints)
{
    if (types == null)
    {
        throw new ArgumentNullException("types");
    }

    if (collectedConstraints == null)
    {
        throw new ArgumentNullException("collectedConstraints");
    }

    collectedConstraints.RemoveSelfLinks();

    _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 10);
    _lattice = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

    // Sliding character window of the given size over the text. Text shorter than the
    // window is returned as a single token, so short input neither disappears nor makes
    // Enumerable.Range throw on a negative count.
    Func<string, int, IEnumerable<string>> charNGrams = (text, size) =>
    {
        if (text.Length < size)
        {
            return new[] { text };
        }

        return Enumerable.Range(0, text.Length - size + 1).Select(i => text.Substring(i, size));
    };

    // TODO: Allow external choice of splitting method
    // Only subtokenSplitting is wired into the lattice below; the remaining splitters
    // are the alternatives the TODO refers to.
    Func<AbstractNode, string[]> subtokenSplitting =
        n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

    // Invariant lower-casing: these are identifier names, not user-facing text.
    Func<AbstractNode, string[]> charSplitting =
        n => n.Name.ToLowerInvariant().Select(ch => ch.ToString()).ToArray();

    Func<AbstractNode, string[]> bigramSplitting =
        n => charNGrams(n.Name.ToLowerInvariant(), 2).ToArray();

    Func<AbstractNode, string[]> subtokenBigramSplitting =
        n => SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub => charNGrams(sub, 2)).ToArray();

    Func<AbstractNode, string[]> subtokenTrigramSplitting =
        n => SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub => charNGrams(sub, 3)).ToArray();

    Func<AbstractNode, string[]> trigramAndSubtokenSplitting =
        n => subtokenSplitting(n).Concat(subtokenTrigramSplitting(n)).ToArray();

    // Keep only relationships between nodes of the requested types.
    _nodeMap = _lattice.Add(
        collectedConstraints.AllRelationships
            .Where(kv => types.Contains(kv.Key.Type))
            .ToDictionary(
                kv => kv.Key,
                kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type)))),
        s => new NodeName(subtokenSplitting(s)));

    NumRelationships = _lattice.NumRelationships;
}