示例#1
0
 /// <summary>
 /// Extracts the text content of an html node.
 /// </summary>
 /// <param name="node">HtmlAgilityPack html node to read the text from</param>
 /// <returns>The result of <c>UfHelpers.HtmlToText</c> called with the node and <c>false</c></returns>
 private string GetNodeValue(HtmlNode node)
 {
     string nodeText = UfHelpers.HtmlToText(node, false);
     return nodeText;
 }
示例#2
0
        /// <summary>
        /// Builds a <see cref="UfDataNode"/> for one schema element from an html node,
        /// working out its value from the node's markup, and passes it on for insertion
        /// beneath <paramref name="ufData"/>.
        /// </summary>
        /// <remarks>
        /// Child schema elements are parsed first through ParseUfElement. When the element
        /// has a property type, exactly one value source is used, chosen by the first
        /// matching rule in the if/else chain below (url/text structures, value and
        /// value-title excerpting, then tag/attribute specific rules, then plain text).
        /// Url/text structures are added straight to <c>ufData.Nodes</c>; every other
        /// case hands the node to AddNewDateNode, which is defined elsewhere.
        /// </remarks>
        /// <param name="baseNode">HtmlAgilityPack html node the value is read from</param>
        /// <param name="ufElement">Schema description of the element being parsed</param>
        /// <param name="ufData">Parent data node that the new node is attached to</param>
        private void ParseUfElementValue(HtmlNode baseNode, UfElementDescriber ufElement, UfDataNode ufData)
        {
            // Create a single data node for whatever data insertion is needed.
            UfDataNode ufd = new UfDataNode();

            // Extend the space separated parent name path, preferring the compound name when one is set
            if (ufElement.CompoundName != string.Empty)
            {
                ufd.ParentNodeNames = ufData.ParentNodeNames + ufElement.CompoundName + " ";
            }
            else
            {
                ufd.ParentNodeNames = ufData.ParentNodeNames + ufElement.Name + " ";
            }

            ufd.ElementId = GetAttributeValue(baseNode, "id");

            // A parent node in the data schema
            if (ufElement.Elements.Count > 0)
            {
                if (ufElement.CompoundName == string.Empty)
                {
                    // Add an empty structural node
                    ufd.Name = ufElement.Name;
                }
                else
                {
                    // This is for compound structures, ie reviewer in hreview is a hcard
                    // Need to find a second attribute value to do this
                    HtmlAttribute att = baseNode.Attributes[ufElement.CompoundAttribute];
                    if (att != null)
                    {
                        if (UfHelpers.FindAttributeValue(att.Value.ToLower(), ufElement.CompoundName))
                        {
                            // Add an empty structural node using the compound name
                            ufd.Name = ufElement.CompoundName;
                        }
                    }
                }

                // Recursion through the dom structure
                foreach (UfElementDescriber ufChildElement in ufElement.Elements)
                {
                    ParseUfElement(baseNode, ufChildElement, ufd, false);
                }
            }

            // A value needs to be found
            if (ufElement.Type != UfElementDescriber.PropertyTypes.None)
            {
                // Find child nodes with "value" or "value-title" classes
                HtmlNodeCollection valueNodes      = null;
                HtmlNodeCollection valueTitleNodes = null;

                // The value pattern; skipped when "value" is itself part of the schema
                if (ufElement.Elements["value"] == null && ufElement.Name != "value")
                {
                    valueNodes = baseNode.SelectNodes(".//*[contains(concat(' ', @class, ' '),' value ')]");
                }

                // The value-title pattern is only allowed for some property types ie dates
                // or named properties ie type, duration, geo, latitude and longitude
                if (ufElement.Type == UfElementDescriber.PropertyTypes.Date ||
                    ufElement.Name == "type" ||
                    ufElement.Name == "duration" ||
                    ufElement.Name == "geo" ||
                    ufElement.Name == "latitude" ||
                    ufElement.Name == "longitude")
                {
                    valueTitleNodes = baseNode.SelectNodes(".//*[contains(concat(' ', @class, ' '),' value-title ')]");
                }

                // Url/text structures: a parent node holding "text", "link" and an optional third child
                if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextAttribute || ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextTag || ufElement.Type == UfElementDescriber.PropertyTypes.UrlText)
                {
                    string text = UfHelpers.HtmlToText(baseNode, false);
                    string link = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "href"), this.baseUrl, url);
                    string att  = GetAttributeValue(baseNode, ufElement.Attribute);
                    ufd.Name = ufElement.Name;

                    UfDataNode ufd1 = new UfDataNode();
                    UfDataNode ufd2 = new UfDataNode();
                    UfDataNode ufd3 = new UfDataNode();

                    ufd1.Name  = "text";
                    ufd1.Value = text;
                    ufd.Nodes.Add(ufd1);

                    ufd2.Name  = "link";
                    ufd2.Value = link;
                    ufd.Nodes.Add(ufd2);

                    // Add the attribute value used for XFN like structures
                    if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextAttribute)
                    {
                        ufd3.Name  = ufElement.Attribute;
                        ufd3.Value = att;
                        ufd.Nodes.Add(ufd3);
                    }

                    // Add the tag element of the url
                    if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextTag)
                    {
                        ufd3.Name  = "tag";
                        ufd3.Value = UfHelpers.GetTagFromUrl(link);
                        ufd.Nodes.Add(ufd3);
                    }

                    if (ufElement.CompoundName == string.Empty)
                    {
                        ufData.Nodes.Add(ufd);
                    }
                    else
                    {
                        // Compound structures are only added when the compound attribute carries the compound name
                        HtmlAttribute att1 = baseNode.Attributes[ufElement.CompoundAttribute];
                        if (att1 != null)
                        {
                            if (UfHelpers.FindAttributeValue(att1.Value.ToLower(), ufElement.CompoundName))
                            {
                                ufd.Name = ufElement.CompoundName;
                                ufData.Nodes.Add(ufd);
                            }
                        }
                    }
                }

                // The value excerpting pattern
                else if (valueNodes != null)
                {
                    string text = string.Empty;
                    foreach (HtmlNode node in valueNodes)
                    {
                        if (node.Name == "img" || node.Name == "area" || node.Name == "abbr")
                        {
                            // These tags carry their value in the title attribute
                            if (ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                            {
                                text += GetAttributeValue(node, "title").Replace(" ", "") + " ";
                            }
                            else
                            {
                                // No separator is appended here, unlike the text case below
                                text += GetAttributeValue(node, "title");
                            }
                        }
                        else
                        {
                            if (ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                            {
                                text += UfHelpers.HtmlToText(node, false).Replace(" ", "") + " ";
                            }
                            else
                            {
                                text += UfHelpers.HtmlToText(node, false) + " ";
                            }
                        }
                    }

                    if (ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                    {
                        // Take the fragmented bits and create a true ISODateTime string
                        ISODateTime isoDateTime = new ISODateTime();
                        text = isoDateTime.ParseUFFragmented(text);
                    }

                    ufd.Name  = ufElement.Name;
                    ufd.Value = text.Trim();
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // The value-title excerpting pattern; only the first matching node is used
                else if (valueTitleNodes != null)
                {
                    string text = GetAttributeValue(valueTitleNodes[0], "title");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Url from "a" or "link"
                else if ((baseNode.Name == "a" || baseNode.Name == "link") && GetAttributeValue(baseNode, "href") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Url)
                {
                    string link = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "href"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = link;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Url from "img" or "area" src attribute
                else if ((baseNode.Name == "img" || baseNode.Name == "area") && GetAttributeValue(baseNode, "src") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Url)
                {
                    string link = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "src"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = link;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Email from "a"
                else if (baseNode.Name == "a" && GetAttributeValue(baseNode, "href") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Email)
                {
                    string address = UfHelpers.CleanEmailAddress(GetAttributeValue(baseNode, "href"));
                    ufd.Name  = ufElement.Name;
                    ufd.Value = address;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Tel from "object"; TelOptimization fills in ufd itself
                else if (baseNode.Name == "object" && (GetAttributeValue(baseNode, "data") != "") && ufElement.Name == "tel")
                {
                    UfHelpers.TelOptimization(ufd, GetAttributeValue(baseNode, "data"));
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Date from "time"
                else if (baseNode.Name == "time" && GetAttributeValue(baseNode, "datetime") != "" && ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                {
                    string text = GetAttributeValue(baseNode, "datetime");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Date from "abbr"
                else if (baseNode.Name == "abbr" && GetAttributeValue(baseNode, "title") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                {
                    string text = GetAttributeValue(baseNode, "title");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "abbr" or "acronym" title attribute.
                // The tag test is parenthesised so the non-empty title check applies to both
                // tags; without it an "abbr" lacking a title matched here and produced an
                // empty value instead of falling through to the text based rules below.
                else if ((baseNode.Name == "abbr" || baseNode.Name == "acronym") && GetAttributeValue(baseNode, "title") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "title");
                    ufd.Name = ufElement.Name;

                    // This is for geo being used as a location in hcalendar
                    if (ufElement.CompoundName != string.Empty)
                    {
                        ufd.Name = ufElement.CompoundName;
                    }

                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "input"
                else if (baseNode.Name == "input" && GetAttributeValue(baseNode, "value") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "value");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Tel from "area"; TelOptimization fills in ufd itself
                else if (baseNode.Name == "area" && (GetAttributeValue(baseNode, "href") != "") && ufElement.Name == "tel")
                {
                    UfHelpers.TelOptimization(ufd, GetAttributeValue(baseNode, "href"));
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text and url from "area"
                else if (baseNode.Name == "area" && (GetAttributeValue(baseNode, "href") != string.Empty || GetAttributeValue(baseNode, "alt") != string.Empty))
                {
                    if ((ufElement.Type == UfElementDescriber.PropertyTypes.Url || ufElement.Type == UfElementDescriber.PropertyTypes.Email) && GetAttributeValue(baseNode, "href") != string.Empty)
                    {
                        string text = GetAttributeValue(baseNode, "href");

                        if (ufElement.Type == UfElementDescriber.PropertyTypes.Email)
                        {
                            text = UfHelpers.CleanEmailAddress(text);
                        }

                        if (ufElement.Type == UfElementDescriber.PropertyTypes.Url)
                        {
                            text = UfHelpers.GetAbsoluteUrl(text, this.baseUrl, url);
                        }

                        ufd.Name  = ufElement.Name;
                        ufd.Value = text;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                    else if (GetAttributeValue(baseNode, "alt") != string.Empty)
                    {
                        // Fall back to the alt text when href is not usable for this property type
                        string text = GetAttributeValue(baseNode, "alt");
                        ufd.Name  = ufElement.Name;
                        ufd.Value = text;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                }

                // Url/Image from "object"
                else if (baseNode.Name == "object" && GetAttributeValue(baseNode, "data") != string.Empty && (ufElement.Type == UfElementDescriber.PropertyTypes.Url || ufElement.Type == UfElementDescriber.PropertyTypes.Image))
                {
                    string text = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "data"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Image from "img" or "area"
                else if ((baseNode.Name == "img" || baseNode.Name == "area") && GetAttributeValue(baseNode, "src") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Image)
                {
                    string text = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "src"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "img" longdesc attribute
                else if (baseNode.Name == "img" && GetAttributeValue(baseNode, "longdesc") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "longdesc");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "img" alt attribute
                else if (baseNode.Name == "img" && GetAttributeValue(baseNode, "alt") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "alt");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text for type/value structures with no found children
                else if (ufElement.NodeType == UfElementDescriber.StructureTypes.TypeValuePair)
                {
                    // If no child nodes ie type/value were found use the node text.
                    // Any type/value children found by the recursion above are already in ufd.Nodes.
                    if (ufd.Nodes.Count == 0)
                    {
                        // Add text from node value
                        string text = UfHelpers.HtmlToText(baseNode, false);
                        ufd.Name  = ufElement.Name;
                        ufd.Value = text;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                    else
                    {
                        // Add child type/value pair
                        ufd.Name = ufElement.Name;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                }

                // Text from the html node; HtmlToText is called with true here rather than false
                // (presumably to keep formatting - confirm against UfHelpers)
                else if (ufElement.Type == UfElementDescriber.PropertyTypes.FormattedText)
                {
                    string text = UfHelpers.HtmlToText(baseNode, true);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                else
                {
                    // Default: text from node value
                    string text = UfHelpers.HtmlToText(baseNode, false);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }
            }
            else
            {
                // No value to extract; pass the (possibly structural) node on as it is
                AddNewDateNode(baseNode, ufData, ufd, ufElement);
            }
        }