/// <summary>
/// Creates the wrapper for a single node of the source XML document.
/// </summary>
/// <param name="node">The System.Xml.XmlNode of the original XmlDocument that this XNode wraps.</param>
/// <param name="parent">The XNode that sits directly above this one in the XML hierarchy.</param>
public XNode(XmlNode node, XNode parent)
{
    // Remember where this wrapper comes from and where it hangs in the tree.
    XmlNode = node;
    Parent = parent;

    // All four node collections start out as Empty (assigned in the order
    // Attributes, Children, Elements, Texts).
    Texts = Elements = Children = Attributes = Empty;
}
/// <summary>
/// Computes the minimal editing distance between two lists of nodes. Unlike
/// MatchList, nothing is written to the nodes' Match/Matching state.
/// </summary>
/// <param name="nodes1">The nodes that index the rows of the cost table.</param>
/// <param name="nodes2">The nodes that index the columns of the cost table.</param>
/// <param name="treeOrder">
/// True when a node of <paramref name="nodes1"/> is the first argument of Distance;
/// false when the caller swapped the lists and the node of <paramref name="nodes2"/>
/// has to go first.
/// </param>
/// <returns>The value FindMinimalMatching returns for the cost table.</returns>
private static int DistanceMatchList(XNode[] nodes1, XNode[] nodes2, bool treeOrder)
{
    var rowCount = nodes1.Length;
    var columnCount = nodes2.Length;

    // The extra last row / last column carry the insert / delete costs.
    var costs = new int[rowCount + 1, columnCount + 1];
    var rowPartners = new int[rowCount];
    var columnPartners = new int[columnCount];

    // Insert cost: the node itself plus all of its descendants.
    for (var column = 0; column < columnCount; column++)
    {
        costs[rowCount, column] = nodes2[column].GetDescendantCount() + 1;
    }

    for (var row = 0; row < rowCount; row++)
    {
        // Delete cost: the node itself plus all of its descendants.
        costs[row, columnCount] = nodes1[row].GetDescendantCount() + 1;

        for (var column = 0; column < columnCount; column++)
        {
            // Put the pair back into the order Distance and the lookup expect.
            var first = treeOrder ? nodes1[row] : nodes2[column];
            var second = treeOrder ? nodes2[column] : nodes1[row];

            var pairCost = Distance(first, second, true, NoConnection);
            if (pairCost < NoConnection)
            {
                distanceLookup[new Tuple<XNode, XNode>(first, second)] = pairCost;
            }

            costs[row, column] = pairCost;
        }
    }

    // Compute the minimal cost matching over the filled table.
    return FindMinimalMatching(costs, rowPartners, columnPartners);
}
/// <summary>
/// Calculate the editing distance between two elements, up to a maximum threshold.
/// </summary>
/// <param name="node1">The element from the first tree.</param>
/// <param name="node2">The element from the second tree.</param>
/// <param name="threshold">
/// Upper bound for a usable distance. It is checked on the final return path; the
/// early returns taken when all remaining children are accounted for hand back the
/// accumulated distance without comparing it to the threshold.
/// </param>
/// <returns>
/// The editing distance, or NoConnection when the distance is not below
/// <paramref name="threshold"/>.
/// </returns>
private static int DistanceElements(XNode node1, XNode node2, int threshold)
{
    int dist = 0;

    // Distance of attributes: an attribute without a counterpart costs 2.
    if (node1.Attributes.Length == 0)
    {
        dist = node2.Attributes.Length * 2;
    }
    else if (node2.Attributes.Length == 0)
    {
        dist = node1.Attributes.Length * 2;
    }
    else
    {
        dist = DistanceAttributes(node1.Attributes, node2.Attributes);
    }

    // Match second level nodes first.
    if (node1.Children.Length == 0)
    {
        // Everything below node2 has to be inserted.
        foreach (var child2 in node2.Children)
        {
            dist += child2.GetDescendantCount() + 1;
        }
    }
    else if (node2.Children.Length == 0)
    {
        // Everything below node1 has to be deleted.
        foreach (var child1 in node1.Children)
        {
            dist += child1.GetDescendantCount() + 1;
        }
    }
    else if (node1.Children.Length == 1 && node2.Children.Length == 1)
    {
        var child1 = node1.Children[0];
        var child2 = node2.Children[0];

        // Identical sub-trees add nothing.
        if (child1.HashEquals(child2.Hash))
        {
            return dist;
        }

        var isElement1 = child1.IsElement();
        var isElement2 = child2.IsElement();

        if (isElement1 && isElement2)
        {
            if (child1.Name == child2.Name)
            {
                // Compare the two children with each other (the previous code
                // compared child1 with itself).
                int childDist = DistanceElements(child1, child2, threshold - dist);

                // The children are already too far apart, so the parents are as
                // well. Returning here also avoids adding the NoConnection value
                // to the running total.
                if (childDist >= NoConnection)
                {
                    return NoConnection;
                }

                dist += childDist;
            }
            else
            {
                dist += child1.GetDescendantCount() + child2.GetDescendantCount() + 2;
            }
        }
        else if (!isElement1 && !isElement2)
        {
            // Two text nodes with different content: one change.
            dist++;
        }
        else
        {
            dist += child1.GetDescendantCount() + child2.GetDescendantCount() + 2;
        }
    }
    else
    {
        // Match text nodes. When one side has none, every text node of the
        // other side counts as one edit.
        if (node1.Texts.Length == 0)
        {
            dist += node2.Texts.Length;
        }
        else if (node2.Texts.Length == 0)
        {
            dist += node1.Texts.Length;
        }
        else
        {
            dist += DistanceTexts(node1.Texts, node2.Texts);
        }

        var elementCount1 = node1.Elements.Length;
        var elementCount2 = node2.Elements.Length;
        var matched1 = new bool[elementCount1];
        var matched2 = new bool[elementCount2];

        // Pair off child elements with equal hashes; they add no distance.
        int matched = MatchFilter(node1.Elements, node2.Elements, matched1, matched2);

        if (elementCount1 == matched && elementCount2 == matched)
        {
            return dist;
        }

        if (elementCount1 == matched)
        {
            // Only node2 has elements left over: they are inserted.
            for (int i = 0; i < elementCount2; i++)
            {
                if (!matched2[i])
                {
                    dist += node2.Elements[i].GetDescendantCount() + 1;
                }
            }

            return dist;
        }

        if (elementCount2 == matched)
        {
            // Only node1 has elements left over: they are deleted.
            for (int i = 0; i < elementCount1; i++)
            {
                if (!matched1[i])
                {
                    dist += node1.Elements[i].GetDescendantCount() + 1;
                }
            }

            return dist;
        }

        // 'Match' remaining unmatched child elements nodes.
        int remaining1 = elementCount1 - matched;
        int remaining2 = elementCount2 - matched;
        int matchCount1 = 0;
        int matchCount2 = 0;

        while ((matchCount1 < remaining1) && (matchCount2 < remaining2))
        {
            var unmatched1 = new List<XNode>();
            var unmatched2 = new List<XNode>();
            string name = null;

            // Find and group unmatched elements by their name. Progress is tracked
            // in matched1/matched2: this method never sets Matching/Match on the
            // nodes, so relying on that state would select the same group on every
            // iteration and would also pick up elements MatchFilter already paired.
            for (int i = 0; i < elementCount1; i++)
            {
                if (matched1[i])
                {
                    continue;
                }

                var child1 = node1.Elements[i];
                if (name == null)
                {
                    name = child1.Name;
                }

                if (name == child1.Name)
                {
                    matched1[i] = true;
                    unmatched1.Add(child1);
                    matchCount1++;
                }
            }

            // Find unmatched nodes in the other subtree with the same element name.
            for (int i = 0; i < elementCount2; i++)
            {
                if (matched2[i])
                {
                    continue;
                }

                var child2 = node2.Elements[i];
                if (name == child2.Name)
                {
                    matched2[i] = true;
                    unmatched2.Add(child2);
                    matchCount2++;
                }
            }

            if (unmatched2.Count == 0)
            {
                // No element of that name on the other side: the whole group of
                // node1 is unmatched (the previous code iterated the empty list).
                // NOTE(review): this path and the two loops after the while add
                // GetDescendantCount() without the "+ 1" used for whole sub-trees
                // elsewhere in this method - confirm that this is intended.
                for (int i = 0; i < unmatched1.Count; i++)
                {
                    dist += unmatched1[i].GetDescendantCount();
                }
            }
            else
            {
                // To find minimal-cost matching between those unmatched elements.
                // The longer list always goes first; treeOrder tells the callee
                // whether the lists were swapped.
                dist += (unmatched1.Count >= unmatched2.Count)
                    ? DistanceMatchList(unmatched1.ToArray(), unmatched2.ToArray(), true)
                    : DistanceMatchList(unmatched2.ToArray(), unmatched1.ToArray(), false);
            }
        }

        // Elements that were never put into a group have no counterpart.
        if (matchCount1 < remaining1)
        {
            for (int i = 0; i < elementCount1; i++)
            {
                if (!matched1[i])
                {
                    dist += node1.Elements[i].GetDescendantCount();
                }
            }
        }
        else if (matchCount2 < remaining2)
        {
            for (int i = 0; i < elementCount2; i++)
            {
                if (!matched2[i])
                {
                    dist += node2.Elements[i].GetDescendantCount();
                }
            }
        }
    }

    if (dist < threshold)
    {
        return dist;
    }

    return NoConnection;
}
/// <summary> /// Calculate the editing distance between two lists of attributes /// </summary> private static int DistanceAttributes(XNode[] attributes1, XNode[] attributes2) { if (attributes1.Length == 1 && attributes2.Length == 1) { if (attributes1[0].HashEquals(attributes2[0].Hash)) return 0; return (attributes1[0].Name == attributes2[0].Name) ? 1 : 2; } var dist = 0; var matched = 0; var matching = new bool[attributes2.Length]; for (int i = 0; i < attributes1.Length; i++) { var found = false; for (int j = 0; j < attributes2.Length; j++) { if (matching[j]) continue; else if (attributes1[i].HashEquals(attributes2[j].Hash)) { matching[j] = true; found = true; matched++; break; } else if (attributes1[i].Name == attributes2[j].Name) { matching[j] = true; dist++; found = true; matched++; break; } } if (!found) dist += 2; } dist += (attributes2.Length - matched) * 2; return dist; }
/// <summary>
/// Computes the editing distance between two nodes and optionally records it in
/// the distance lookup.
/// </summary>
/// <param name="node1">The node from the first tree.</param>
/// <param name="node2">The node from the second tree.</param>
/// <param name="toRecord">True to store an element distance below NoConnection in distanceLookup.</param>
/// <param name="threshold">The threshold handed on to DistanceElements.</param>
/// <returns>
/// NoConnection when exactly one of the nodes is an element or when two elements have
/// different names; 1 when neither node is an element; otherwise the result of
/// DistanceElements.
/// </returns>
private static int Distance(XNode node1, XNode node2, bool toRecord, int threshold)
{
    var firstIsElement = node1.IsElement();
    var secondIsElement = node2.IsElement();

    // An element can never be paired with a non-element.
    if (firstIsElement != secondIsElement)
    {
        return NoConnection;
    }

    // Two non-element nodes are always exactly one edit apart here.
    if (!firstIsElement)
    {
        return 1;
    }

    // Elements are only comparable when they share their name.
    if (node1.Name != node2.Name)
    {
        return NoConnection;
    }

    int result = DistanceElements(node1, node2, threshold);
    if (toRecord && result < NoConnection)
    {
        distanceLookup[Tuple.Create(node1, node2)] = result;
    }

    return result;
}
/// <summary>
/// Matches the text nodes of two elements and records the result on the nodes.
/// </summary>
/// <param name="texts1">The text nodes of the first element.</param>
/// <param name="texts2">The text nodes of the second element.</param>
private static void DiffTexts(XNode[] texts1, XNode[] texts2)
{
    var count1 = texts1.Length;
    var count2 = texts2.Length;

    // Step 1: text nodes with equal hashes are exact matches.
    var paired = MatchEqual(texts1, texts2, MatchType.Match);

    // Step 2: pair whatever is left of the list that is not longer than the
    // other with any still-free node of the other list and mark those pairs
    // as changed.
    if (paired < count1 && count1 <= count2)
    {
        paired += MatchAny(texts1, texts2, MatchType.Change);
    }
    else if (paired < count2 && count2 <= count1)
    {
        paired += MatchAny(texts2, texts1, MatchType.Change);
    }

    // Step 3: whatever is still without a partner is unmatched.
    if (paired < count1)
    {
        SetUnmatched(texts1, MatchType.NoMatch);
    }
    else if (paired < count2)
    {
        SetUnmatched(texts2, MatchType.NoMatch);
    }
}
/// <summary>
/// Builds the XNode (and, for elements, the whole XNode sub-tree) for an XmlNode.
/// </summary>
/// <param name="node">The XML node to wrap.</param>
/// <param name="parent">The XNode that becomes the parent of the new node.</param>
/// <returns>
/// The new XNode for attribute, text, CDATA and element nodes; null for every other
/// node type.
/// </returns>
public static XNode Build(XmlNode node, XNode parent)
{
    switch (node.NodeType)
    {
        case XmlNodeType.Attribute:
        {
            // Attributes hash to "@<lower-case name>/<value>".
            var attribute = new XNode(node, parent);
            attribute.Name = node.Name.ToLowerInvariant();
            attribute.Hash = Murmur3Hasher.HashString(
                string.Concat("@", attribute.Name, "/", node.Value ?? string.Empty));
            return attribute;
        }

        case XmlNodeType.Text:
        case XmlNodeType.CDATA:
        {
            // Text and CDATA share the name "#text" and hash to "#text/<value>".
            var text = new XNode(node, parent);
            text.Name = "#text";
            text.Hash = Murmur3Hasher.HashString(
                string.Concat(text.Name, "/", node.Value ?? string.Empty));
            return text;
        }

        case XmlNodeType.Element:
        {
            var element = new XNode(node, parent);
            var elementName = node.Name.ToLowerInvariant();
            element.Name = elementName;

            // The element hash is built from its own name plus the hashes of
            // all attributes and children that were turned into XNodes.
            var partHashes = new List<byte[]>();
            partHashes.Add(Murmur3Hasher.HashString(elementName + "/"));

            // Attributes.
            var attributeNodes = new List<XNode>();
            foreach (XmlNode attributeSource in node.Attributes)
            {
                var built = XNode.Build(attributeSource, element);
                if (built == null)
                {
                    continue;
                }

                partHashes.Add(built.Hash);
                attributeNodes.Add(built);
            }

            element.Attributes = attributeNodes.ToArray();

            // Children, additionally split into elements and text nodes.
            var childNodes = new List<XNode>();
            var elementNodes = new List<XNode>();
            var textNodes = new List<XNode>();
            foreach (XmlNode childSource in node.ChildNodes)
            {
                var built = XNode.Build(childSource, element);
                if (built == null)
                {
                    // Unsupported node type: not part of the tree.
                    continue;
                }

                partHashes.Add(built.Hash);
                childNodes.Add(built);

                var bucket = built.IsElement() ? elementNodes : textNodes;
                bucket.Add(built);
            }

            element.Children = childNodes.ToArray();
            element.Elements = elementNodes.ToArray();
            element.Texts = textNodes.ToArray();

            // Sorting the part hashes first makes the element hash independent
            // of the order in which attributes and children appear.
            var sortedHashes = partHashes
                .OrderBy(h => h, ByteArrayComparer.Instance)
                .ToList();
            var joined = ConcatAll(sortedHashes, Murmur3Hasher.OUTPUT_LENGTH);
            element.Hash = Murmur3Hasher.HashBytes(joined);
            return element;
        }

        default:
            return null;
    }
}
/// <summary>
/// Set match for child nodes with equal hash values (equal sub-trees)
/// </summary>
/// <param name="nodes1">The nodes of the first tree that look for a partner.</param>
/// <param name="nodes2">The candidate partners from the second tree.</param>
/// <param name="match">
/// The match type recorded on both nodes of every pair found. (It was previously
/// ignored in favour of a hard-coded MatchType.Match.)
/// </param>
/// <returns>The number of pairs that were matched.</returns>
private static int MatchEqual(XNode[] nodes1, XNode[] nodes2, MatchType match)
{
    var matched = 0;

    foreach (var node1 in nodes1)
    {
        foreach (var node2 in nodes2)
        {
            // Only candidates that are still free and not already marked as
            // unmatched may be paired.
            if (node2.Matching == null
                && node2.Match != MatchType.NoMatch
                && node1.HashEquals(node2.Hash))
            {
                SetMatching(node1, node2, match);
                matched++;
                break;
            }
        }

        // Stop once as many pairs as there are candidates were found.
        if (matched == nodes2.Length)
        {
            break;
        }
    }

    return matched;
}
/// <summary>
/// Assigns a match type to every node of the list that has no matching node yet
/// and is not already marked as MatchType.NoMatch.
/// </summary>
/// <param name="nodes">The nodes to inspect.</param>
/// <param name="match">The match type to assign.</param>
/// <returns>The number of nodes that were updated.</returns>
private static int SetUnmatched(XNode[] nodes, MatchType match)
{
    var updated = 0;

    for (var i = 0; i < nodes.Length; i++)
    {
        var candidate = nodes[i];

        // Leave nodes alone that already have a partner or a NoMatch verdict.
        if (candidate.Matching != null || candidate.Match == MatchType.NoMatch)
        {
            continue;
        }

        candidate.Match = match;
        updated++;
    }

    return updated;
}
/// <summary>
/// Assigns the same match type to every node of the list, whatever its current state.
/// </summary>
/// <param name="nodes">The nodes to update.</param>
/// <param name="match">The match type to assign.</param>
private static void SetMatching(XNode[] nodes, MatchType match)
{
    foreach (var node in nodes)
    {
        node.Match = match;
    }
}
/// <summary>
/// Links two nodes as each other's matching node and gives both the same match type.
/// </summary>
/// <param name="node1">The first node of the pair.</param>
/// <param name="node2">The second node of the pair.</param>
/// <param name="match">The match type recorded on both nodes.</param>
private static void SetMatching(XNode node1, XNode node2, MatchType match)
{
    // First node of the pair.
    node1.Match = match;
    node1.Matching = node2;

    // Second node of the pair.
    node2.Match = match;
    node2.Matching = node1;
}
/// <summary>
/// Assigns a match type to a single node; its matching node is left untouched.
/// </summary>
/// <param name="node1">The node to update.</param>
/// <param name="match">The match type to assign.</param>
private static void SetMatching(XNode node1, MatchType match)
{
    node1.Match = match;
}
/// <summary>
/// Finds the minimal cost matching between two node lists, records it on the nodes
/// and then diffs every matched pair of elements.
/// </summary>
/// <param name="nodes1">The nodes that index the rows of the cost table.</param>
/// <param name="nodes2">The nodes that index the columns of the cost table.</param>
/// <param name="treeOrder">
/// True when a node of <paramref name="nodes1"/> is the first argument of Distance and
/// DiffElements; false when the caller swapped the lists and the node of
/// <paramref name="nodes2"/> has to go first.
/// </param>
private static void MatchList(XNode[] nodes1, XNode[] nodes2, bool treeOrder)
{
    var rowCount = nodes1.Length;
    var columnCount = nodes2.Length;

    // The extra last row / last column carry the insert / delete costs.
    var costs = new int[rowCount + 1, columnCount + 1];

    // Insert cost: the node itself plus all of its descendants.
    for (var column = 0; column < columnCount; column++)
    {
        costs[rowCount, column] = nodes2[column].GetDescendantCount() + 1;
    }

    for (var row = 0; row < rowCount; row++)
    {
        // Delete cost: the node itself plus all of its descendants.
        costs[row, columnCount] = nodes1[row].GetDescendantCount() + 1;

        for (var column = 0; column < columnCount; column++)
        {
            // Put the pair back into the order Distance and the lookup expect.
            var first = treeOrder ? nodes1[row] : nodes2[column];
            var second = treeOrder ? nodes2[column] : nodes1[row];

            int pairCost = Distance(first, second, true, NoConnection);
            if (pairCost < NoConnection)
            {
                distanceLookup[new Tuple<XNode, XNode>(first, second)] = pairCost;
            }

            costs[row, column] = pairCost;
        }
    }

    // Compute the minimal cost matching. Afterwards each entry is read either as
    // the index of the partner in the other list or as NoMatch.
    var rowPartners = new int[rowCount];
    var columnPartners = new int[columnCount];
    FindMinimalMatching(costs, rowPartners, columnPartners);

    // Record the result on the nodes of the first list ...
    for (var row = 0; row < rowPartners.Length; row++)
    {
        var partner = rowPartners[row];
        if (partner != NoMatch)
        {
            SetMatching(nodes1[row], nodes2[partner], MatchType.Change);
        }
        else
        {
            SetMatching(nodes1[row], MatchType.NoMatch);
        }
    }

    // ... and on the nodes of the second list.
    for (var column = 0; column < columnPartners.Length; column++)
    {
        var partner = columnPartners[column];
        if (partner != NoMatch)
        {
            SetMatching(nodes2[column], nodes1[partner], MatchType.Change);
        }
        else
        {
            SetMatching(nodes2[column], MatchType.NoMatch);
        }
    }

    // Finally descend into every matched pair of elements.
    for (var row = 0; row < rowPartners.Length; row++)
    {
        if (rowPartners[row] == NoMatch)
        {
            continue;
        }

        var fromRows = nodes1[row];
        var fromColumns = nodes2[rowPartners[row]];
        if (!fromRows.IsElement() || !fromColumns.IsElement())
        {
            continue;
        }

        if (treeOrder)
        {
            DiffElements(fromRows, fromColumns);
        }
        else
        {
            DiffElements(fromColumns, fromRows);
        }
    }
}
/// <summary>
/// Computes the editing distance between two groups of text nodes.
/// </summary>
/// <param name="texts1">The text nodes of the first element.</param>
/// <param name="texts2">The text nodes of the second element.</param>
/// <returns>The size of the larger group minus the number of pairs with equal hashes.</returns>
private static int DistanceTexts(XNode[] texts1, XNode[] texts2)
{
    var pairs = 0;
    var taken = new bool[texts2.Length];

    foreach (var text1 in texts1)
    {
        // Claim the first free text node of the other group with the same hash.
        for (var j = 0; j < texts2.Length; j++)
        {
            if (taken[j] || !text1.HashEquals(texts2[j].Hash))
            {
                continue;
            }

            taken[j] = true;
            pairs++;
            break;
        }

        // Every node of the second group has a partner: nothing left to find.
        if (pairs == texts2.Length)
        {
            break;
        }
    }

    return Math.Max(texts1.Length, texts2.Length) - pairs;
}
/// <summary>
/// Matches the attributes of two elements and records the result on the nodes.
/// </summary>
/// <param name="attributes1">The attributes of the first element.</param>
/// <param name="attributes2">The attributes of the second element.</param>
private static void DiffAttributes(XNode[] attributes1, XNode[] attributes2)
{
    // Shortcut: exactly one attribute on each side.
    if (attributes1.Length == 1 && attributes2.Length == 1)
    {
        var only1 = attributes1[0];
        var only2 = attributes2[0];

        // Equal hashes: nothing is recorded on either attribute.
        if (only1.HashEquals(only2.Hash))
        {
            return;
        }

        // Same name means a changed value; otherwise the two are linked to each
        // other with a NoMatch verdict.
        var verdict = only1.Name == only2.Name ? MatchType.Change : MatchType.NoMatch;
        SetMatching(only1, only2, verdict);
        return;
    }

    // General case: give every attribute of the first list the first free
    // attribute of the second list that has the same hash or the same name.
    var paired = 0;
    foreach (var attr1 in attributes1)
    {
        var hasPartner = false;

        foreach (var attr2 in attributes2)
        {
            if (attr2.Matching != null)
            {
                // Already taken by an earlier attribute.
                continue;
            }

            if (attr2.HashEquals(attr1.Hash))
            {
                SetMatching(attr1, attr2, MatchType.Match);
            }
            else if (attr2.Name == attr1.Name)
            {
                SetMatching(attr1, attr2, MatchType.Change);
            }
            else
            {
                continue;
            }

            paired++;
            hasPartner = true;
            break;
        }

        if (!hasPartner)
        {
            attr1.Match = MatchType.NoMatch;
        }
    }

    // Attributes of the second list that nobody claimed are unmatched.
    if (paired != attributes2.Length)
    {
        SetUnmatched(attributes2, MatchType.NoMatch);
    }
}
/// <summary>
/// Pairs every still-free node of the first list with the first still-free node of
/// the second list, in list order. A node is free when it has no matching node and
/// is not marked as MatchType.NoMatch.
/// </summary>
/// <param name="nodes1">The nodes that look for a partner.</param>
/// <param name="nodes2">The candidate partners.</param>
/// <param name="match">The match type recorded on both nodes of every pair.</param>
/// <returns>The number of pairs that were created.</returns>
private static int MatchAny(XNode[] nodes1, XNode[] nodes2, MatchType match)
{
    var pairs = 0;

    for (var i = 0; i < nodes1.Length; i++)
    {
        var seeker = nodes1[i];
        if (seeker.Matching != null || seeker.Match == MatchType.NoMatch)
        {
            continue;
        }

        for (var j = 0; j < nodes2.Length; j++)
        {
            var candidate = nodes2[j];
            if (candidate.Matching != null || candidate.Match == MatchType.NoMatch)
            {
                continue;
            }

            SetMatching(seeker, candidate, match);
            pairs++;
            break;
        }
    }

    return pairs;
}
/// <summary>
/// Compare and match the two nodes (and their children).
/// </summary>
/// <remarks>
/// The result is recorded in the Match and Matching members of the attributes and
/// children of both nodes. Children that turn out to be matching elements are
/// diffed recursively.
/// </remarks>
/// <param name="node1">The element from the first tree.</param>
/// <param name="node2">The element from the second tree.</param>
private static void DiffElements(XNode node1, XNode node2)
{
    // Attributes
    if (node1.Attributes.Length > 0)
    {
        if (node2.Attributes.Length > 0)
        {
            DiffAttributes(node1.Attributes, node2.Attributes);
        }
        else
        {
            SetMatching(node1.Attributes, MatchType.NoMatch);
        }
    }
    else if (node2.Attributes.Length > 0)
    {
        SetMatching(node2.Attributes, MatchType.NoMatch);
    }

    // Children = Elements and Text
    // First, if no children
    if (node1.Children.Length == 0)
    {
        SetMatching(node2.Children, MatchType.NoMatch);
    }
    else if (node2.Children.Length == 0)
    {
        SetMatching(node1.Children, MatchType.NoMatch);
    }
    // Next, if one child each
    else if (node2.Children.Length == 1 && node1.Children.Length == 1)
    {
        var child1 = node1.Children[0];
        var child2 = node2.Children[0];

        // Equal sub-trees: nothing is recorded on either child.
        if (child1.HashEquals(child2.Hash))
        {
            return;
        }

        var isElement1 = child1.IsElement();
        var isElement2 = child2.IsElement();

        if (isElement1 && isElement2)
        {
            if (child1.Name == child2.Name)
            {
                SetMatching(child1, child2, MatchType.Change);
                DiffElements(child1, child2);
            }
            else
            {
                SetMatching(child1, child2, MatchType.NoMatch);
            }
        }
        else if (!isElement1 && !isElement2)
        {
            // Two text nodes with different content.
            SetMatching(child1, child2, MatchType.Change);
        }
        else
        {
            SetMatching(child1, child2, MatchType.NoMatch);
        }
    }
    // Then, if many children
    else
    {
        // Match text nodes
        if (node1.Texts.Length > 0)
        {
            if (node2.Texts.Length > 0)
            {
                DiffTexts(node1.Texts, node2.Texts);
            }
            else
            {
                SetMatching(node1.Texts, MatchType.NoMatch);
            }
        }
        else if (node2.Texts.Length > 0)
        {
            SetMatching(node2.Texts, MatchType.NoMatch);
        }

        // Match element nodes with equal hashes
        var matched = MatchEqual(node1.Elements, node2.Elements, MatchType.Match);

        if (matched == node1.Elements.Length && matched == node2.Elements.Length)
        {
            return;
        }

        if (matched == node1.Elements.Length)
        {
            SetUnmatched(node2.Elements, MatchType.NoMatch);
        }

        if (matched == node2.Elements.Length)
        {
            SetUnmatched(node1.Elements, MatchType.NoMatch);
        }

        // 'Match' remaining unmatched child elements nodes.
        int remaining1 = node1.Elements.Length - matched;
        int remaining2 = node2.Elements.Length - matched;
        int matchCount1 = 0;
        int matchCount2 = 0;

        while ((matchCount1 < remaining1) && (matchCount2 < remaining2))
        {
            var unmatched1 = new List<XNode>();
            var unmatched2 = new List<XNode>();
            string name = null;

            // Find and group unmatched elements by their name. The name of the
            // first element that is still free selects the group.
            foreach (var child1 in node1.Elements)
            {
                if (child1.Matching == null && child1.Match != MatchType.NoMatch)
                {
                    if (name == null)
                    {
                        name = child1.Name;
                    }

                    if (name == child1.Name)
                    {
                        unmatched1.Add(child1);
                        matchCount1++;
                    }
                }
            }

            // Find unmatched nodes in the other subtree with the same element name
            foreach (var child2 in node2.Elements)
            {
                if (child2.Matching == null && child2.Match != MatchType.NoMatch)
                {
                    if (name == child2.Name)
                    {
                        unmatched2.Add(child2);
                        matchCount2++;
                    }
                }
            }

            if (unmatched2.Count == 0)
            {
                // No element of that name on the other side. The list is turned
                // into an array because the SetMatching overload for several
                // nodes takes an XNode[].
                SetMatching(unmatched1.ToArray(), MatchType.NoMatch);
            }
            else if ((unmatched1.Count == 1) && (unmatched2.Count == 1))
            {
                // Exactly one candidate on each side: pair them directly.
                SetMatching(unmatched1[0], unmatched2[0], MatchType.Change);
                DiffElements(unmatched1[0], unmatched2[0]);
            }
            // Find minimal-cost matching between those unmatched. The longer
            // list always goes first; treeOrder tells the callee whether the
            // lists were swapped.
            else if (unmatched1.Count >= unmatched2.Count)
            {
                MatchList(unmatched1.ToArray(), unmatched2.ToArray(), true);
            }
            else
            {
                MatchList(unmatched2.ToArray(), unmatched1.ToArray(), false);
            }
        }

        // Finally mark any remaining child elements as unmatched
        if (matchCount1 < remaining1)
        {
            SetUnmatched(node1.Elements, MatchType.NoMatch);
        }
        else if (matchCount2 < remaining2)
        {
            SetUnmatched(node2.Elements, MatchType.NoMatch);
        }
    }
}
/// <summary>
/// Pairs off elements with equal hashes and flags them in the two marker arrays.
/// Nothing is recorded on the nodes themselves.
/// </summary>
/// <param name="elements1">The elements of the first node.</param>
/// <param name="elements2">The elements of the second node.</param>
/// <param name="matched1">Flags for <paramref name="elements1"/>; set to true for every paired element.</param>
/// <param name="matched2">Flags for <paramref name="elements2"/>; set to true for every paired element.</param>
/// <returns>The number of pairs that were found.</returns>
private static int MatchFilter(XNode[] elements1, XNode[] elements2, bool[] matched1, bool[] matched2)
{
    var pairs = 0;

    for (var second = 0; second < elements2.Length; second++)
    {
        // An element that is already flagged cannot be paired again.
        if (matched2[second])
        {
            continue;
        }

        for (var first = 0; first < elements1.Length; first++)
        {
            if (matched1[first])
            {
                continue;
            }

            if (!elements1[first].HashEquals(elements2[second].Hash))
            {
                continue;
            }

            matched1[first] = true;
            matched2[second] = true;
            pairs++;
            break;
        }
    }

    return pairs;
}
/// <summary>
/// Creates the tree for an XML document: keeps the document and builds the XNode
/// hierarchy starting at its document element.
/// </summary>
/// <param name="document">The XML document to wrap.</param>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="document"/> is null.
/// </exception>
public XTree(XmlDocument document)
{
    // Fail with a meaningful exception instead of the NullReferenceException
    // that reading document.DocumentElement would raise.
    if (document == null)
    {
        throw new System.ArgumentNullException("document");
    }

    Document = document;

    // The root has no parent.
    Root = XNode.Build(document.DocumentElement, null);
}