/// <summary>
/// Checks that splitting "str0" yields exactly two subtokens: "str" and "0".
/// </summary>
public void TestSingleNumeric()
{
    var subtoks = SubtokenSplitter.SplitSubtokens("str0");

    // Expected value comes first so a failure reports expected/actual the right way round.
    Assert.AreEqual(2, subtoks.Length);
    Assert.AreEqual("str", subtoks[0]);
    Assert.AreEqual("0", subtoks[1]);
}
/// <summary>
/// Checks that "SOME_WEIRD_NAME" is split into the lower-cased subtokens
/// "some", "weird" and "name".
/// </summary>
public void TestWithUnderscores()
{
    var subtoks = SubtokenSplitter.SplitSubtokens("SOME_WEIRD_NAME");

    // Expected value comes first so a failure reports expected/actual the right way round.
    Assert.AreEqual("some", subtoks[0]);
    Assert.AreEqual("weird", subtoks[1]);
    Assert.AreEqual("name", subtoks[2]);
}
/// <summary>
/// Checks that a run of consecutive capitals is kept together as one subtoken:
/// "ThisASTIsBlue" is expected to start with "this", "ast", "is", "blue".
/// </summary>
public void TestWithContigiousCapitalization()
{
    var subtoks = SubtokenSplitter.SplitSubtokens("ThisASTIsBlue");

    // Expected value comes first so a failure reports expected/actual the right way round.
    Assert.AreEqual("this", subtoks[0]);
    Assert.AreEqual("ast", subtoks[1]);
    Assert.AreEqual("is", subtoks[2]);
    Assert.AreEqual("blue", subtoks[3]);
}
/// <summary>
/// Checks that the camel-case identifier "someSimpleTest1" is split into
/// "some", "simple", "test" and "1".
/// </summary>
public void TestSimpleSplit()
{
    var subtoks = SubtokenSplitter.SplitSubtokens("someSimpleTest1");

    // Expected value comes first so a failure reports expected/actual the right way round.
    Assert.AreEqual("some", subtoks[0]);
    Assert.AreEqual("simple", subtoks[1]);
    Assert.AreEqual("test", subtoks[2]);
    Assert.AreEqual("1", subtoks[3]);
}
/// <summary>
/// Populates the lattice with the relationships from
/// <paramref name="collectedConstraints"/> whose key node has a type listed in
/// <paramref name="types"/>, naming each node by the subtokens of its name.
/// </summary>
/// <param name="types">
/// Type names used as a filter: a relationship is kept only if its key's type is
/// in this set, and its related nodes are likewise restricted to these types.
/// </param>
/// <param name="collectedConstraints">
/// Source of the relationships. <c>RemoveSelfLinks()</c> is called on it before
/// the relationships are read.
/// </param>
public ClusteringExtractor(HashSet<string> types, TypeConstraints collectedConstraints)
{
    collectedConstraints.RemoveSelfLinks();

    _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 10);
    _lattice = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

    // TODO: Allow external choice of splitting method.
    // Only subtoken splitting is used; the per-character and n-gram splitters that
    // were declared here were never invoked and have been dropped as dead code.
    Func<AbstractNode, string[]> subtokenSplitting =
        n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

    // Keep only relationships whose key AND related nodes have one of the requested types.
    var filteredRelationships = collectedConstraints.AllRelationships
        .Where(kv => types.Contains(kv.Key.Type))
        .ToDictionary(
            kv => kv.Key,
            kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type))));

    _nodeMap = _lattice.Add(filteredRelationships, s => new NodeName(subtokenSplitting(s)));

    NumRelationships = _lattice.NumRelationships;
}
/// <summary>
/// Populates the lattice with relationships taken from
/// <paramref name="collectedConstraints"/>, naming each node by the subtokens of
/// its name. The relationships are selected either through
/// <c>GetTypeSpecificRelations</c> (when <paramref name="UDTSpecificAnalysis"/> is
/// set) or by filtering on <paramref name="types"/>.
/// </summary>
/// <param name="types">
/// Type names used as a filter when <paramref name="UDTSpecificAnalysis"/> is
/// <c>false</c>: a relationship is kept only if its key's type is in this set, and
/// its related nodes are likewise restricted to these types.
/// </param>
/// <param name="collectedConstraints">
/// Source of the relationships. <c>RemoveSelfLinks()</c> is called on it before
/// the relationships are read.
/// </param>
/// <param name="maxNumTypes">Not read by this constructor.</param>
/// <param name="UDTSpecificAnalysis">
/// When <c>true</c>, the relationships are obtained from
/// <c>GetTypeSpecificRelations</c> and <paramref name="types"/> is ignored.
/// </param>
/// <param name="t">
/// Type symbol forwarded to <c>GetTypeSpecificRelations</c>; only used when
/// <paramref name="UDTSpecificAnalysis"/> is <c>true</c>.
/// </param>
public SubtypeMiner(HashSet<string> types, TypeConstraints collectedConstraints, int maxNumTypes, bool UDTSpecificAnalysis = false, ITypeSymbol t = null)
{
    collectedConstraints.RemoveSelfLinks();

    _ciComputer = new SubtokenVariationOfInformationComputer(dirichletAlpha: 2);
    _lattice = new SuperGreedySplitingVIColoredLattice<NodeName>(_ciComputer);

    // TODO: Allow external choice of splitting type.
    // Only subtoken splitting is used; the per-character and n-gram splitters that
    // were declared here were never invoked and have been dropped as dead code.
    Func<AbstractNode, string[]> subtokenSplitting =
        n => SubtokenSplitter.SplitSubtokens(n.Name).ToArray();

    if (UDTSpecificAnalysis)
    {
        var symbols = GetTypeSpecificRelations(collectedConstraints.AllRelationships, t);

        _nodeMap = _lattice.Add(
            symbols.ToDictionary(
                kv => kv.Key,
                kv => new HashSet<AbstractNode>(kv.Value)),
            s => new NodeName(subtokenSplitting(s)));
    }
    else
    {
        // Keep only relationships whose key AND related nodes have one of the requested types.
        var selKeys = collectedConstraints.AllRelationships
            .Where(kv => types.Contains(kv.Key.Type));

        _nodeMap = _lattice.Add(
            selKeys.ToDictionary(
                kv => kv.Key,
                kv => new HashSet<AbstractNode>(kv.Value.Where(n => types.Contains(n.Type)))),
            s => new NodeName(subtokenSplitting(s)));
    }

    NumRelationships = _lattice.NumRelationships;
}
/// <summary>
/// Checks that splitting the empty string yields no subtokens.
/// </summary>
public void TestEmpty()
{
    var subtoks = SubtokenSplitter.SplitSubtokens("");

    // Expected value comes first so a failure reports expected/actual the right way round.
    Assert.AreEqual(0, subtoks.Length);
}