Exemplo n.º 1
0
        /// <summary>
        /// Builds the clustering lattice from the collected type constraints, keeping only the
        /// relationships whose key node and related nodes have a type listed in <paramref name="types"/>.
        /// Each node is represented in the lattice by a <c>NodeName</c> built from its subtokens.
        /// </summary>
        /// <param name="types">Node types to keep; nodes of any other type are filtered out.</param>
        /// <param name="collectedConstraints">
        /// Constraints to cluster. <c>RemoveSelfLinks()</c> is invoked on this instance first, so it is
        /// presumably modified in place (confirm against <c>TypeConstraints</c>).
        /// </param>
        public ClusteringExtractor(HashSet<string> types, TypeConstraints collectedConstraints)
        {
            collectedConstraints.RemoveSelfLinks();

            _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 10);
            _lattice    = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

            // TODO: Allow external choice of splitting method.
            // Only subtokenSplitting is wired in below; the remaining splitters are kept as
            // ready-made alternatives until the splitting method becomes configurable.

            // Splits a node name into its subtokens.
            Func<AbstractNode, string[]> subtokenSplitting = n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

            // One lower-cased string per character. Invariant casing is used so the result
            // does not depend on the current culture.
            Func<AbstractNode, string[]> charSplitting = n => n.Name.ToLowerInvariant().Select(ch => ch.ToString()).ToArray();

            // Overlapping character bigrams of the lower-cased name; an empty name yields a single "".
            Func<AbstractNode, string[]> bigramSplitting = n =>
            {
                if (n.Name.Length == 0)
                {
                    return new string[] { "" };
                }

                var name = n.Name.ToLowerInvariant();
                return Enumerable.Range(0, name.Length - 1).Select(i => name.Substring(i, 2)).ToArray();
            };

            // Overlapping character bigrams of every subtoken.
            Func<AbstractNode, string[]> subtokenBigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    // Enumerable.Range rejects a negative count, which an empty subtoken would
                    // produce; subtokens shorter than two characters contribute no bigrams.
                    if (sub.Length < 2)
                    {
                        return Enumerable.Empty<string>();
                    }

                    return Enumerable.Range(0, sub.Length - 1).Select(i => sub.Substring(i, 2));
                }).ToArray();
            };

            // Overlapping character trigrams of every subtoken; subtokens shorter than three
            // characters are kept whole.
            Func<AbstractNode, string[]> subtokenTrigramSplitting = n =>
            {
                return SubtokenSplitter.SplitSubtokens(n.Name).SelectMany(sub =>
                {
                    if (sub.Length < 3)
                    {
                        return new string[] { sub };
                    }

                    return Enumerable.Range(0, sub.Length - 2).Select(i => sub.Substring(i, 3));
                }).ToArray();
            };

            // Subtokens followed by the trigrams of those subtokens.
            Func<AbstractNode, string[]> trigramAndSubtokenSplitting = n => subtokenSplitting(n).Concat(subtokenTrigramSplitting(n)).ToArray();

            // Keep only relationships whose key has an accepted type, and within each of them
            // only the related nodes that also have an accepted type.
            var filteredRelationships = collectedConstraints.AllRelationships
                .Where(kv => types.Contains(kv.Key.Type))
                .ToDictionary(kv => kv.Key,
                              kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type))));

            _nodeMap = _lattice.Add(filteredRelationships, s => new NodeName(subtokenSplitting(s)));
            NumRelationships = _lattice.NumRelationships;
        }