/// <summary>
/// Gets the text value of an HTML node.
/// </summary>
/// <param name="node">HtmlAgilityPack html node to read.</param>
/// <returns>
/// The string produced by <c>UfHelpers.HtmlToText</c> when called with
/// <paramref name="node"/> and <c>false</c> as its second argument.
/// </returns>
private string GetNodeValue(HtmlNode node)
{
    // NOTE(review): the second argument presumably switches off formatted-text output - confirm in UfHelpers.
    string nodeText = UfHelpers.HtmlToText(node, false);
    return nodeText;
}
/// <summary>
/// Builds a data node for <paramref name="ufElement"/> from <paramref name="baseNode"/>
/// and attaches it to <paramref name="ufData"/>.
/// </summary>
/// <remarks>
/// When the describer has child elements, each child is first parsed into the new
/// data node via <c>ParseUfElement</c>. When the describer's type is not
/// <c>PropertyTypes.None</c>, a value is then picked by the first matching rule in an
/// ordered chain (url/text structures, value and value-title excerpting, then
/// element/attribute specific rules, ending with the node text). Otherwise the node
/// is handed to <c>AddNewDateNode</c> without a value.
/// </remarks>
/// <param name="baseNode">HtmlAgilityPack html node the value is read from.</param>
/// <param name="ufElement">Describer of the element being parsed.</param>
/// <param name="ufData">Parent data node that receives the result.</param>
private void ParseUfElementValue(HtmlNode baseNode, UfElementDescriber ufElement, UfDataNode ufData)
{
    // Create a single data node for whatever data insertion is needed.
    UfDataNode ufd = new UfDataNode();
    string ownName = ufElement.CompoundName != string.Empty ? ufElement.CompoundName : ufElement.Name;
    ufd.ParentNodeNames = ufData.ParentNodeNames + ownName + " ";
    ufd.ElementId = GetAttributeValue(baseNode, "id");

    // A parent node in the data schema
    if (ufElement.Elements.Count > 0)
    {
        if (ufElement.CompoundName == string.Empty)
        {
            // Add an empty structural node
            ufd.Name = ufElement.Name;
        }
        else
        {
            // This is for compound structures, ie reviewer in hreview is a hcard.
            // Need to find a second attribute value to do this.
            HtmlAttribute compoundAtt = baseNode.Attributes[ufElement.CompoundAttribute];

            // Invariant lower-casing keeps the token match independent of the current culture.
            if (compoundAtt != null
                && UfHelpers.FindAttributeValue(compoundAtt.Value.ToLowerInvariant(), ufElement.CompoundName))
            {
                // Add an empty structural node using the compound name
                ufd.Name = ufElement.CompoundName;
            }
        }

        // Recursion through the dom structure
        foreach (UfElementDescriber ufChildElement in ufElement.Elements)
        {
            ParseUfElement(baseNode, ufChildElement, ufd, false);
        }
    }

    // No value is required for this element: hand the node over as it is.
    if (ufElement.Type == UfElementDescriber.PropertyTypes.None)
    {
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
        return;
    }

    // A value needs to be found.
    // Find child nodes with "value" or "value-title" classes.
    HtmlNodeCollection valueNodes = null;
    HtmlNodeCollection valueTitleNodes = null;
    bool isDate = ufElement.Type == UfElementDescriber.PropertyTypes.Date;

    // The value pattern
    if (ufElement.Elements["value"] == null && ufElement.Name != "value")
    {
        valueNodes = baseNode.SelectNodes(".//*[contains(concat(' ', @class, ' '),' value ')]");
    }

    // The value-title pattern is only allowed for some property types ie dates
    // or name properties ie type, duration, geo, latitude and longitude
    if (isDate
        || ufElement.Name == "type"
        || ufElement.Name == "duration"
        || ufElement.Name == "geo"
        || ufElement.Name == "latitude"
        || ufElement.Name == "longitude")
    {
        valueTitleNodes = baseNode.SelectNodes(".//*[contains(concat(' ', @class, ' '),' value-title ')]");
    }

    if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextAttribute
        || ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextTag
        || ufElement.Type == UfElementDescriber.PropertyTypes.UrlText)
    {
        // Url/text structures: child nodes "text" and "link", plus an optional third node.
        string text = UfHelpers.HtmlToText(baseNode, false);
        string link = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "href"), this.baseUrl, url);

        ufd.Name = ufElement.Name;

        UfDataNode textNode = new UfDataNode();
        textNode.Name = "text";
        textNode.Value = text;
        ufd.Nodes.Add(textNode);

        UfDataNode linkNode = new UfDataNode();
        linkNode.Name = "link";
        linkNode.Value = link;
        ufd.Nodes.Add(linkNode);

        // Add the attribute value used for XFN like structures
        if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextAttribute)
        {
            UfDataNode attributeNode = new UfDataNode();
            attributeNode.Name = ufElement.Attribute;
            attributeNode.Value = GetAttributeValue(baseNode, ufElement.Attribute);
            ufd.Nodes.Add(attributeNode);
        }

        // Add the tag element of the url
        if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextTag)
        {
            UfDataNode tagNode = new UfDataNode();
            tagNode.Name = "tag";
            tagNode.Value = UfHelpers.GetTagFromUrl(link);
            ufd.Nodes.Add(tagNode);
        }

        if (ufElement.CompoundName == string.Empty)
        {
            ufData.Nodes.Add(ufd);
        }
        else
        {
            // Compound structures are only added when the compound attribute carries the compound name.
            HtmlAttribute compoundAtt = baseNode.Attributes[ufElement.CompoundAttribute];
            if (compoundAtt != null
                && UfHelpers.FindAttributeValue(compoundAtt.Value.ToLowerInvariant(), ufElement.CompoundName))
            {
                ufd.Name = ufElement.CompoundName;
                ufData.Nodes.Add(ufd);
            }
        }
    }
    // The value excerpting pattern
    else if (valueNodes != null)
    {
        string text = string.Empty;
        foreach (HtmlNode node in valueNodes)
        {
            // img, area and abbr supply their fragment through the title attribute,
            // every other element through its text.
            bool readsTitle = node.Name == "img" || node.Name == "area" || node.Name == "abbr";
            if (readsTitle)
            {
                if (isDate)
                {
                    text += GetAttributeValue(node, "title").Replace(" ", "") + " ";
                }
                else
                {
                    // Title fragments of non-date values are appended without a separator.
                    text += GetAttributeValue(node, "title");
                }
            }
            else
            {
                if (isDate)
                {
                    text += UfHelpers.HtmlToText(node, false).Replace(" ", "") + " ";
                }
                else
                {
                    text += UfHelpers.HtmlToText(node, false) + " ";
                }
            }
        }

        if (isDate)
        {
            // Take the fragmented bits and create a true ISODateTime string
            ISODateTime isoDateTime = new ISODateTime();
            text = isoDateTime.ParseUFFragmented(text);
        }

        ufd.Name = ufElement.Name;
        ufd.Value = text.Trim();
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // The value-title excerpting pattern
    else if (valueTitleNodes != null)
    {
        // Only the first value-title node is used.
        ufd.Name = ufElement.Name;
        ufd.Value = GetAttributeValue(valueTitleNodes[0], "title");
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Url from "a" or "link"
    else if ((baseNode.Name == "a" || baseNode.Name == "link")
        && GetAttributeValue(baseNode, "href") != string.Empty
        && ufElement.Type == UfElementDescriber.PropertyTypes.Url)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "href"), this.baseUrl, url);
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Url from "img" or "area" src attribute
    else if ((baseNode.Name == "img" || baseNode.Name == "area")
        && GetAttributeValue(baseNode, "src") != string.Empty
        && ufElement.Type == UfElementDescriber.PropertyTypes.Url)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "src"), this.baseUrl, url);
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Email from "a"
    else if (baseNode.Name == "a"
        && GetAttributeValue(baseNode, "href") != string.Empty
        && ufElement.Type == UfElementDescriber.PropertyTypes.Email)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = UfHelpers.CleanEmailAddress(GetAttributeValue(baseNode, "href"));
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Tel from "object"
    else if (baseNode.Name == "object"
        && GetAttributeValue(baseNode, "data") != string.Empty
        && ufElement.Name == "tel")
    {
        UfHelpers.TelOptimization(ufd, GetAttributeValue(baseNode, "data"));
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Date from "time"
    else if (baseNode.Name == "time"
        && GetAttributeValue(baseNode, "datetime") != string.Empty
        && isDate)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = GetAttributeValue(baseNode, "datetime");
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Date from "abbr"
    else if (baseNode.Name == "abbr"
        && GetAttributeValue(baseNode, "title") != string.Empty
        && isDate)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = GetAttributeValue(baseNode, "title");
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Text from "abbr" or "acronym" title attribute.
    // The element test is parenthesised so the non-empty title requirement applies to
    // both elements; without it "&&" binds tighter than "||" and an "abbr" lacking a
    // title was given an empty value instead of reaching the text fallbacks below.
    else if ((baseNode.Name == "abbr" || baseNode.Name == "acronym")
        && GetAttributeValue(baseNode, "title") != string.Empty)
    {
        ufd.Name = ufElement.Name;

        // This is for geo being used as a location in hcalendar
        if (ufElement.CompoundName != string.Empty)
        {
            ufd.Name = ufElement.CompoundName;
        }

        ufd.Value = GetAttributeValue(baseNode, "title");
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Text from "input"
    else if (baseNode.Name == "input" && GetAttributeValue(baseNode, "value") != string.Empty)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = GetAttributeValue(baseNode, "value");
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Tel from "area"
    else if (baseNode.Name == "area"
        && GetAttributeValue(baseNode, "href") != string.Empty
        && ufElement.Name == "tel")
    {
        UfHelpers.TelOptimization(ufd, GetAttributeValue(baseNode, "href"));
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Text and url from "area"
    else if (baseNode.Name == "area"
        && (GetAttributeValue(baseNode, "href") != string.Empty
            || GetAttributeValue(baseNode, "alt") != string.Empty))
    {
        bool wantsHref = ufElement.Type == UfElementDescriber.PropertyTypes.Url
            || ufElement.Type == UfElementDescriber.PropertyTypes.Email;

        if (wantsHref && GetAttributeValue(baseNode, "href") != string.Empty)
        {
            string text = GetAttributeValue(baseNode, "href");
            if (ufElement.Type == UfElementDescriber.PropertyTypes.Email)
            {
                text = UfHelpers.CleanEmailAddress(text);
            }
            if (ufElement.Type == UfElementDescriber.PropertyTypes.Url)
            {
                text = UfHelpers.GetAbsoluteUrl(text, this.baseUrl, url);
            }

            ufd.Name = ufElement.Name;
            ufd.Value = text;
            AddNewDateNode(baseNode, ufData, ufd, ufElement);
        }
        else if (GetAttributeValue(baseNode, "alt") != string.Empty)
        {
            ufd.Name = ufElement.Name;
            ufd.Value = GetAttributeValue(baseNode, "alt");
            AddNewDateNode(baseNode, ufData, ufd, ufElement);
        }
        // Otherwise nothing is added for this area element.
    }
    // Url/Image from "object"
    else if (baseNode.Name == "object"
        && GetAttributeValue(baseNode, "data") != string.Empty
        && (ufElement.Type == UfElementDescriber.PropertyTypes.Url
            || ufElement.Type == UfElementDescriber.PropertyTypes.Image))
    {
        ufd.Name = ufElement.Name;
        ufd.Value = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "data"), this.baseUrl, url);
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Image from "img" or "area"
    else if ((baseNode.Name == "img" || baseNode.Name == "area")
        && GetAttributeValue(baseNode, "src") != string.Empty
        && ufElement.Type == UfElementDescriber.PropertyTypes.Image)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "src"), this.baseUrl, url);
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Text from "img" longdesc attribute
    else if (baseNode.Name == "img" && GetAttributeValue(baseNode, "longdesc") != string.Empty)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = GetAttributeValue(baseNode, "longdesc");
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Text from "img" alt attribute
    else if (baseNode.Name == "img" && GetAttributeValue(baseNode, "alt") != string.Empty)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = GetAttributeValue(baseNode, "alt");
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Text for type/value structures with no found children
    else if (ufElement.NodeType == UfElementDescriber.StructureTypes.TypeValuePair)
    {
        ufd.Name = ufElement.Name;

        // If no child nodes (ie type/value) were found, use the text of the node;
        // otherwise the child type/value pair already collected is kept as it is.
        if (ufd.Nodes.Count == 0)
        {
            ufd.Value = UfHelpers.HtmlToText(baseNode, false);
        }

        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Formatted text from the html node
    else if (ufElement.Type == UfElementDescriber.PropertyTypes.FormattedText)
    {
        ufd.Name = ufElement.Name;
        ufd.Value = UfHelpers.HtmlToText(baseNode, true);
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
    // Text from node value
    else
    {
        ufd.Name = ufElement.Name;
        ufd.Value = UfHelpers.HtmlToText(baseNode, false);
        AddNewDateNode(baseNode, ufData, ufd, ufElement);
    }
}