/// <summary>
/// Pairs nodes of <paramref name="s1"/> that do not have a partner yet with partner-less nodes of
/// <paramref name="s2"/>. For each node of <paramref name="s1"/> the candidate with the smallest
/// distance (as reported by <c>comparer.GetDistance</c>) is selected, and the pair is recorded via
/// <c>Add</c> only if that distance does not exceed <paramref name="maxAcceptableDistance"/>.
/// </summary>
/// <param name="s1">Nodes of the first tree carrying the label being processed.</param>
/// <param name="s2">Nodes of the second tree carrying the same label.</param>
/// <param name="tiedToAncestor">
/// When greater than zero, the ancestor level passed to <c>comparer.GetAncestor</c>; two nodes are
/// only compared if <c>Contains</c> reports that their ancestors at that level are already paired.
/// </param>
/// <param name="maxAcceptableDistance">
/// Largest distance at which the best candidate is still accepted as a match. Expected to lie
/// between <c>ExactMatchDistance</c> and <c>MaxDistance</c> (checked in debug builds only).
/// </param>
private void ComputeMatchForLabel(List<TNode> s1, List<TNode> s2, int tiedToAncestor, double maxAcceptableDistance)
{
    // Obviously, the algorithm below is O(n^2). However, in the common case, the 2 lists will
    // be sequences that exactly match. The purpose of "firstNonMatch2" is to reduce the complexity
    // to O(n) in this case. Basically, the pointer is the 1st non-matched node in the list of nodes of tree2
    // with the given label.
    // Whenever we match to firstNonMatch2 we set firstNonMatch2 to the subsequent node.
    // So in the case of totally matching sequences, we process them in O(n) -
    // both node1 and firstNonMatch2 will be advanced simultaneously.

    Debug.Assert(maxAcceptableDistance >= ExactMatchDistance && maxAcceptableDistance <= MaxDistance);

    int count1 = s1.Count;
    int count2 = s2.Count;
    int firstNonMatch2 = 0;

    for (int i1 = 0; i1 < count1; i1++)
    {
        TNode node1 = s1[i1];

        // Skip this guy if it already has a partner
        if (HasPartnerInTree2(node1))
        {
            continue;
        }

        // Find node2 that matches node1 the best, i.e. has minimal distance.
        // The initial value is deliberately larger than MaxDistance: the comparison below is strict,
        // so starting at MaxDistance would make it impossible to ever select a candidate whose
        // distance is exactly MaxDistance, even when maxAcceptableDistance permits it.
        double bestDistance = MaxDistance * 2;
        TNode bestMatch = default(TNode);
        bool matched = false;

        // Declared outside the loop because its final value is inspected afterwards:
        // it is the index at which the loop stopped on an exact match, or count2 otherwise.
        int i2;
        for (i2 = firstNonMatch2; i2 < count2; i2++)
        {
            TNode node2 = s2[i2];

            // Skip this guy if it already has a partner
            if (HasPartnerInTree1(node2))
            {
                continue;
            }

            // this requires parents to be processed before their children:
            if (tiedToAncestor > 0)
            {
                // TODO (tomat): For nodes tied to their parents,
                // consider avoiding matching them to all other nodes of the same label.
                // Rather we should only match them with their siblings that share the same parent.

                var ancestor1 = comparer.GetAncestor(node1, tiedToAncestor);
                var ancestor2 = comparer.GetAncestor(node2, tiedToAncestor);

                Debug.Assert(comparer.GetLabel(ancestor1) < comparer.GetLabel(node1));

                if (!Contains(ancestor1, ancestor2))
                {
                    continue;
                }
            }

            // We know that
            // 1. (node1, node2) not in M
            // 2. Both of their parents are matched to the same parent (or are not matched)
            //
            // Now, we have no other choice than comparing the node "values"
            // and looking for the one with the smaller distance.

            double distance = comparer.GetDistance(node1, node2);
            if (distance < bestDistance)
            {
                matched = true;
                bestMatch = node2;
                bestDistance = distance;

                // We only stop if we've got an exact match. This is to resolve the problem
                // of entities with identical names (name is often used as the "value" of a
                // node) but with different "sub-values" (e.g. two locals may have the same name
                // but different types. Since the type is not part of the value, we don't want
                // to stop looking for the best match if we don't have an exact match).
                if (distance == ExactMatchDistance)
                {
                    break;
                }
            }
        }

        if (matched && bestDistance <= maxAcceptableDistance)
        {
            Add(node1, bestMatch);

            // If we exactly matched to firstNonMatch2 we can advance it.
            if (i2 == firstNonMatch2)
            {
                firstNonMatch2 = i2 + 1;
            }

            // firstNonMatch2 only moves past nodes that have just been matched, so once it reaches
            // the end of s2 the inner loop has nothing left to scan and no further pair can be added.
            if (firstNonMatch2 == count2)
            {
                return;
            }
        }
    }
}
/// <summary>
/// For every node of <paramref name="s1"/> that has no partner yet, looks for the partner-less node
/// of <paramref name="s2"/> at the smallest distance and records the pair when that distance is
/// within <paramref name="maxAcceptableDistance"/>.
/// </summary>
/// <param name="s1">Nodes of the first tree carrying the label being processed.</param>
/// <param name="s2">Nodes of the second tree carrying the same label.</param>
/// <param name="tiedToAncestor">
/// When greater than zero, the ancestor level handed to <c>_comparer.TryGetAncestor</c>; candidates
/// whose ancestors at that level are not compatible are skipped.
/// </param>
/// <param name="maxAcceptableDistance">
/// Upper bound on the distance of an accepted match; asserted to lie between
/// <c>ExactMatchDistance</c> and <c>MaxDistance</c>.
/// </param>
private void ComputeMatchForLabel(
    List<TNode> s1,
    List<TNode> s2,
    int tiedToAncestor,
    double maxAcceptableDistance)
{
    // The nested scan is quadratic in general. The usual input, however, is a pair of sequences
    // that match one-to-one in order. "scanStart" keeps the scan linear for that input: it points at
    // the first node of s2 that has not been matched, and it moves forward every time the node
    // it points at gets matched exactly, so both lists are walked in lock-step.

    Debug.Assert(ExactMatchDistance <= maxAcceptableDistance && maxAcceptableDistance <= MaxDistance);

    var leftCount = s1.Count;
    var rightCount = s2.Count;
    var scanStart = 0;

    for (var leftIndex = 0; leftIndex < leftCount; leftIndex++)
    {
        var leftNode = s1[leftIndex];

        // Nothing to do for a node that has been paired already.
        if (HasPartnerInTree2(leftNode))
        {
            continue;
        }

        // Search for the closest candidate. The starting value lies above MaxDistance.
        var closestDistance = MaxDistance * 2;
        TNode closestNode = default;
        var foundCandidate = false;

        // Kept outside the loop header: the value it ends with is examined after the scan.
        int rightIndex;
        for (rightIndex = scanStart; rightIndex < rightCount; rightIndex++)
        {
            var rightNode = s2[rightIndex];

            // Ignore candidates that are already paired, and (relying on parents having been
            // processed before their children) candidates whose tied ancestors do not line up.
            if (HasPartnerInTree1(rightNode) ||
                (tiedToAncestor > 0 && !TiedAncestorsAgree(leftNode, rightNode)))
            {
                continue;
            }

            // At this point the pair is not in the map yet and the ancestor constraint, if any,
            // is satisfied. All that is left is to compare the node "values" and keep the closest.
            var distance = _comparer.GetDistance(leftNode, rightNode);
            if (distance < closestDistance)
            {
                foundCandidate = true;
                closestNode = rightNode;
                closestDistance = distance;

                // Only an exact match ends the search early. Distinct entities may share a name
                // (the name is often the node "value") while differing in "sub-values" - e.g. two
                // locals with the same name but different types - so a merely close candidate is
                // not a reason to stop looking.
                if (distance == ExactMatchDistance)
                {
                    break;
                }
            }
        }

        if (foundCandidate && closestDistance <= maxAcceptableDistance)
        {
            // The call must stay outside of Debug.Assert so that it also runs in release builds.
            // leftNode was verified to have no partner and the map is a bijection by construction,
            // hence adding the pair is expected to succeed.
            var added = TryAdd(leftNode, closestNode);
            Debug.Assert(added);

            // The scan stopped right on the first unmatched node: move the start forward.
            if (rightIndex == scanStart)
            {
                scanStart++;
            }

            if (scanStart == rightCount)
            {
                return;
            }
        }
    }

    // TODO (tomat): For nodes tied to their parents, consider avoiding matching them to all other
    // nodes of the same label. Rather we should only match them with their siblings that share
    // the same parent.
    //
    // Decides whether two nodes that are configured to be tied to an ancestor may be compared.
    // When subtrees rooted below both ancestors are being compared, neither ancestor is available;
    // the ancestors are then assumed to match, since the subtree roots must match and therefore
    // so must their ancestors. When only one of the ancestors is present in the subtree, this is
    // not a comparison of subtrees with matching roots and the nodes are treated as not matching.
    bool TiedAncestorsAgree(TNode first, TNode second)
    {
        var firstFound = _comparer.TryGetAncestor(first, tiedToAncestor, out var firstAncestor);
        var secondFound = _comparer.TryGetAncestor(second, tiedToAncestor, out var secondAncestor);

        if (firstFound != secondFound)
        {
            return false;
        }

        if (!firstFound)
        {
            return true;
        }

        // CategorizeNodesByLabels fills the s1/s2 lists in depth-first prefix order, so the partner
        // of a parent is determined before the partner of a child of the same kind is searched for.
        // That is why equal labels are acceptable here.
        Debug.Assert(_comparer.GetLabel(firstAncestor) <= _comparer.GetLabel(first));

        return Contains(firstAncestor, secondAncestor);
    }
}