Пример #1
0
 /// <summary>
 /// Extracts the text value of a html node.
 /// </summary>
 /// <param name="node">HtmlAgilityPack html node to read</param>
 /// <returns>The string produced by UfHelpers.HtmlToText for the node</returns>
 private string GetNodeValue(HtmlNode node)
 {
     // NOTE(review): the second argument is presumably a "keep formatting" switch - confirm against UfHelpers.
     string nodeText = UfHelpers.HtmlToText(node, false);
     return nodeText;
 }
Пример #2
0
        /// <summary>
        /// Builds a data node for a html node that matched an element description, fills in
        /// its name and value using the first extraction rule that applies, and passes it on
        /// to the parent data node.
        /// </summary>
        /// <param name="baseNode">Html node that matched the element description</param>
        /// <param name="ufElement">Description of the element being parsed</param>
        /// <param name="ufData">Parent data node that receives the new node</param>
        private void ParseUfElementValue(HtmlNode baseNode, UfElementDescriber ufElement, UfDataNode ufData)
        {
            // Create a single data node for whatever data insertion is needed.
            UfDataNode ufd = new UfDataNode();

            if (ufElement.CompoundName != string.Empty)
            {
                ufd.ParentNodeNames = ufData.ParentNodeNames + ufElement.CompoundName + " ";
            }
            else
            {
                ufd.ParentNodeNames = ufData.ParentNodeNames + ufElement.Name + " ";
            }

            ufd.ElementId = GetAttributeValue(baseNode, "id");

            // A parent node in the data schema
            if (ufElement.Elements.Count > 0)
            {
                if (ufElement.CompoundName == string.Empty)
                {
                    // Add an empty structural node
                    ufd.Name = ufElement.Name;
                }
                else
                {
                    // This is for compound structures, ie reviewer in hreview is a hcard
                    // Need to find a second attribute value to do this
                    HtmlAttribute att = baseNode.Attributes[ufElement.CompoundAttribute];
                    if (att != null)
                    {
                        // Invariant lower-casing keeps the match independent of the thread culture
                        if (UfHelpers.FindAttributeValue(att.Value.ToLowerInvariant(), ufElement.CompoundName))
                        {
                            // Add an empty structural node using compound name
                            ufd.Name = ufElement.CompoundName;
                        }
                    }
                }

                // Recursion through the dom structure
                foreach (UfElementDescriber ufChildElement in ufElement.Elements)
                {
                    ParseUfElement(baseNode, ufChildElement, ufd, false);
                }
            }

            // A value needs to be found
            if (ufElement.Type != UfElementDescriber.PropertyTypes.None)
            {
                // Find child nodes with "value" or "value-title" classes
                HtmlNodeCollection valueNodes      = null;
                HtmlNodeCollection valueTitleNodes = null;

                // The value pattern
                if (ufElement.Elements["value"] == null && ufElement.Name != "value")
                {
                    valueNodes = baseNode.SelectNodes(".//*[contains(concat(' ', @class, ' '),' value ')]");
                }

                // The value-title pattern is only allowed for some property types ie dates
                // or named properties ie type, duration, geo, latitude and longitude
                if (ufElement.Type == UfElementDescriber.PropertyTypes.Date ||
                    ufElement.Name == "type" ||
                    ufElement.Name == "duration" ||
                    ufElement.Name == "geo" ||
                    ufElement.Name == "latitude" ||
                    ufElement.Name == "longitude")
                {
                    valueTitleNodes = baseNode.SelectNodes(".//*[contains(concat(' ', @class, ' '),' value-title ')]");
                }

                if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextAttribute || ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextTag || ufElement.Type == UfElementDescriber.PropertyTypes.UrlText)
                {
                    string text = UfHelpers.HtmlToText(baseNode, false);
                    string link = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "href"), this.baseUrl, url);
                    string att  = GetAttributeValue(baseNode, ufElement.Attribute);
                    ufd.Name = ufElement.Name;

                    UfDataNode ufd1 = new UfDataNode();
                    UfDataNode ufd2 = new UfDataNode();
                    UfDataNode ufd3 = new UfDataNode();

                    ufd1.Name  = "text";
                    ufd1.Value = text;
                    ufd.Nodes.Add(ufd1);

                    ufd2.Name  = "link";
                    ufd2.Value = link;
                    ufd.Nodes.Add(ufd2);

                    // Add the attribute value used for XFN like structures
                    if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextAttribute)
                    {
                        ufd3.Name  = ufElement.Attribute;
                        ufd3.Value = att;
                        ufd.Nodes.Add(ufd3);
                    }

                    // Add the tag element of the url
                    if (ufElement.Type == UfElementDescriber.PropertyTypes.UrlTextTag)
                    {
                        ufd3.Name  = "tag";
                        ufd3.Value = UfHelpers.GetTagFromUrl(link);
                        ufd.Nodes.Add(ufd3);
                    }

                    if (ufElement.CompoundName == string.Empty)
                    {
                        ufData.Nodes.Add(ufd);
                    }
                    else
                    {
                        // Compound structures are only added when the second attribute also matches
                        HtmlAttribute att1 = baseNode.Attributes[ufElement.CompoundAttribute];
                        if (att1 != null)
                        {
                            if (UfHelpers.FindAttributeValue(att1.Value.ToLowerInvariant(), ufElement.CompoundName))
                            {
                                ufd.Name = ufElement.CompoundName;
                                ufData.Nodes.Add(ufd);
                            }
                        }
                    }
                }

                // The value excerpting pattern
                else if (valueNodes != null)
                {
                    string text = string.Empty;
                    foreach (HtmlNode node in valueNodes)
                    {
                        if (node.Name == "img" || node.Name == "area" || node.Name == "abbr")
                        {
                            // These elements carry their value in the title attribute
                            if (ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                            {
                                text += GetAttributeValue(node, "title").Replace(" ", "") + " ";
                            }
                            else
                            {
                                text += GetAttributeValue(node, "title");
                            }
                        }
                        else
                        {
                            if (ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                            {
                                text += UfHelpers.HtmlToText(node, false).Replace(" ", "") + " ";
                            }
                            else
                            {
                                text += UfHelpers.HtmlToText(node, false) + " ";
                            }
                        }
                    }

                    if (ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                    {
                        // Take the fragmented bits and create a true ISODateTime string
                        ISODateTime isoDateTime = new ISODateTime();
                        text = isoDateTime.ParseUFFragmented(text);
                    }

                    ufd.Name  = ufElement.Name;
                    ufd.Value = text.Trim();
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // The value-title excerpting pattern
                else if (valueTitleNodes != null)
                {
                    string text = GetAttributeValue(valueTitleNodes[0], "title");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Url from "a" or "link"
                else if ((baseNode.Name == "a" || baseNode.Name == "link") && GetAttributeValue(baseNode, "href") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Url)
                {
                    string link = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "href"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = link;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Url from "img" or "area"
                else if ((baseNode.Name == "img" || baseNode.Name == "area") && GetAttributeValue(baseNode, "src") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Url)
                {
                    string link = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "src"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = link;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Email from "a"
                else if (baseNode.Name == "a" && GetAttributeValue(baseNode, "href") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Email)
                {
                    string address = UfHelpers.CleanEmailAddress(GetAttributeValue(baseNode, "href"));
                    ufd.Name  = ufElement.Name;
                    ufd.Value = address;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Tel from "object"
                else if (baseNode.Name == "object" && (GetAttributeValue(baseNode, "data") != "") && ufElement.Name == "tel")
                {
                    UfHelpers.TelOptimization(ufd, GetAttributeValue(baseNode, "data"));
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Date from "time"
                else if (baseNode.Name == "time" && GetAttributeValue(baseNode, "datetime") != "" && ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                {
                    string text = GetAttributeValue(baseNode, "datetime");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Date from "abbr"
                else if (baseNode.Name == "abbr" && GetAttributeValue(baseNode, "title") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Date)
                {
                    string text = GetAttributeValue(baseNode, "title");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "abbr" or "acronym".
                // The parentheses matter: without them "&&" binds tighter than "||", so an
                // "abbr" with an empty title was accepted here and produced an empty value
                // instead of falling through to the text based rules below.
                else if ((baseNode.Name == "abbr" || baseNode.Name == "acronym") && GetAttributeValue(baseNode, "title") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "title");
                    ufd.Name = ufElement.Name;

                    // This is for geo being used as a location in hcalendar
                    if (ufElement.CompoundName != string.Empty)
                    {
                        ufd.Name = ufElement.CompoundName;
                    }

                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "input"
                else if (baseNode.Name == "input" && GetAttributeValue(baseNode, "value") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "value");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Tel from "area"
                else if (baseNode.Name == "area" && (GetAttributeValue(baseNode, "href") != "") && ufElement.Name == "tel")
                {
                    UfHelpers.TelOptimization(ufd, GetAttributeValue(baseNode, "href"));
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text and url from "area"
                else if (baseNode.Name == "area" && (GetAttributeValue(baseNode, "href") != string.Empty || GetAttributeValue(baseNode, "alt") != string.Empty))
                {
                    if ((ufElement.Type == UfElementDescriber.PropertyTypes.Url || ufElement.Type == UfElementDescriber.PropertyTypes.Email) && GetAttributeValue(baseNode, "href") != string.Empty)
                    {
                        string text = GetAttributeValue(baseNode, "href");

                        if (ufElement.Type == UfElementDescriber.PropertyTypes.Email)
                        {
                            text = UfHelpers.CleanEmailAddress(text);
                        }

                        if (ufElement.Type == UfElementDescriber.PropertyTypes.Url)
                        {
                            text = UfHelpers.GetAbsoluteUrl(text, this.baseUrl, url);
                        }

                        ufd.Name  = ufElement.Name;
                        ufd.Value = text;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                    else if (GetAttributeValue(baseNode, "alt") != string.Empty)
                    {
                        string text = GetAttributeValue(baseNode, "alt");
                        ufd.Name  = ufElement.Name;
                        ufd.Value = text;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                }

                // Url/Image from "object"
                else if (baseNode.Name == "object" && GetAttributeValue(baseNode, "data") != string.Empty && (ufElement.Type == UfElementDescriber.PropertyTypes.Url || ufElement.Type == UfElementDescriber.PropertyTypes.Image))
                {
                    string text = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "data"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Image from "img" or "area"
                else if ((baseNode.Name == "img" || baseNode.Name == "area") && GetAttributeValue(baseNode, "src") != string.Empty && ufElement.Type == UfElementDescriber.PropertyTypes.Image)
                {
                    string text = UfHelpers.GetAbsoluteUrl(GetAttributeValue(baseNode, "src"), this.baseUrl, url);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "img" longdesc attribute
                else if (baseNode.Name == "img" && GetAttributeValue(baseNode, "longdesc") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "longdesc");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text from "img" alt attribute
                else if (baseNode.Name == "img" && GetAttributeValue(baseNode, "alt") != string.Empty)
                {
                    string text = GetAttributeValue(baseNode, "alt");
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                // Text for type/value structures with no found children
                else if (ufElement.NodeType == UfElementDescriber.StructureTypes.TypeValuePair)
                {
                    // If no child nodes ie type/value were found by the recursion above, use the text
                    if (ufd.Nodes.Count == 0)
                    {
                        // Add text from node value
                        string text = UfHelpers.HtmlToText(baseNode, false);
                        ufd.Name  = ufElement.Name;
                        ufd.Value = text;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                    else
                    {
                        // Add child type/value pair
                        ufd.Name = ufElement.Name;
                        AddNewDateNode(baseNode, ufData, ufd, ufElement);
                    }
                }

                // Text from Html node collection
                else if (ufElement.Type == UfElementDescriber.PropertyTypes.FormattedText)
                {
                    string text = UfHelpers.HtmlToText(baseNode, true);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }

                else
                {
                    // Fallback: text from node value
                    string text = UfHelpers.HtmlToText(baseNode, false);
                    ufd.Name  = ufElement.Name;
                    ufd.Value = text;
                    AddNewDateNode(baseNode, ufData, ufd, ufElement);
                }
            }
            else
            {
                // Structural element without a value of its own
                AddNewDateNode(baseNode, ufData, ufd, ufElement);
            }
        }
Пример #3
0
        /// <summary>
        /// Load and parse a Html document.
        /// </summary>
        /// <param name="document">HtmlAgilityPack Htmldocument object</param>
        /// <param name="url">The source Url of the document. A fragment (#name) restricts parsing
        /// to the matching anchor or element; null or empty skips fragment handling</param>
        /// <param name="formatDescriber">The microformat format describer</param>
        /// <exception cref="ArgumentNullException">Thrown when document is null</exception>
        /// <exception cref="Exception">Thrown when the url fragment does not match any
        /// named anchor or element id in the document</exception>
        public void Load(HtmlDocument document, string url, UfFormatDescriber formatDescriber)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }

            this.url             = url;
            this.formatDescriber = formatDescriber;
            this.document        = document;

            // Add in the whole html string from the page into the top data node
            data.OuterHtml = this.document.DocumentNode.OuterHtml;

            // Resolve the base url. Later sources take priority over earlier ones, but a
            // source that yields nothing must not wipe out a value already found.
            // NOTE(review): assumes FindDocumentNodeAttributeValue returns null or an empty
            // string when the node or attribute is missing - confirm against the helper.
            this.baseUrl = FindDocumentNodeAttributeValue("//html", "xml:base");

            string bodyBase = FindDocumentNodeAttributeValue("//body", "xml:base");
            if (!string.IsNullOrEmpty(bodyBase))
            {
                this.baseUrl = bodyBase;
            }

            string baseHref = FindDocumentNodeAttributeValue("//base", "href");
            if (!string.IsNullOrEmpty(baseHref))
            {
                this.baseUrl = baseHref;
            }

            // Find the html page title; when several title nodes exist the last one wins
            HtmlNodeCollection titleNodes = this.document.DocumentNode.SelectNodes("//title");
            if (titleNodes != null)
            {
                foreach (HtmlNode node in titleNodes)
                {
                    this.htmlPageTitle = node.InnerText;
                }
            }

            // Start with document node
            this.startNode = document.DocumentNode;

            // Find any fragment select
            // <a name="profile"> html nodes </a>
            if (!string.IsNullOrEmpty(url))
            {
                Uri    uri  = new Uri(url);
                string frag = uri.Fragment;
                if (frag != string.Empty)
                {
                    string fragmentName = frag.Replace("#", "");
                    HtmlNodeCollection fragmentNodes;

                    try
                    {
                        // A name based fragment selection, falling back to an id based one
                        fragmentNodes = this.document.DocumentNode.SelectNodes("//a[@name='" + fragmentName + "']");
                        if (fragmentNodes == null)
                        {
                            fragmentNodes = this.document.DocumentNode.SelectNodes("//*[@id='" + fragmentName + "']");
                        }
                    }
                    catch (Exception ex)
                    {
                        // Keep the original failure (e.g. an invalid XPath) as the inner exception
                        throw new Exception("Could not find name fragment" + frag, ex);
                    }

                    if (fragmentNodes == null || fragmentNodes.Count == 0)
                    {
                        throw new Exception("Could not find name fragment" + frag);
                    }

                    this.startNode = fragmentNodes[0];
                }
            }

            // Starts recursion
            ParseUfElement(this.startNode, this.FormatDescriber.BaseElement, this.Data, true);

            UfHelpers.RunNodeOptimization(this.Data);
        }
Пример #4
0
        /// <summary>
        /// Searches the descendants of a html node for nodes that match an element
        /// description and parses every match into the given data node.
        /// </summary>
        /// <param name="baseNode">Html node whose descendants are searched</param>
        /// <param name="ufElement">Description of the element to look for</param>
        /// <param name="ufData">Data node that receives the parsed results</param>
        /// <param name="ufTopLevel">Not read by this method</param>
        private void ParseUfElement(HtmlNode baseNode, UfElementDescriber ufElement, UfDataNode ufData, bool ufTopLevel)
        {
            // Select nodes with required attribute: class, rel or rev
            HtmlNodeCollection nodes = baseNode.SelectNodes(".//@" + ufElement.Attribute);
            if (nodes == null)
            {
                return;
            }

            foreach (HtmlNode node in nodes)
            {
                // Load the attribute class, rel or rev
                HtmlAttribute att         = node.Attributes[ufElement.Attribute];
                HtmlAttribute compoundAtt = node.Attributes[ufElement.CompoundAttribute];
                if (att == null)
                {
                    continue;
                }

                // Invariant lower-casing keeps the match independent of the thread culture
                string attValue = att.Value.ToLowerInvariant();

                // We are dealing with elemental uf like XFN
                if (ufElement.AttributeValues.Count > 0)
                {
                    // Search for multiple attribute values ie friend or contact in rel
                    bool valueFound = false;
                    foreach (UfAttributeValueDescriber avd in ufElement.AttributeValues)
                    {
                        if (UfHelpers.FindAttributeValue(attValue, avd.Name))
                        {
                            valueFound = true;
                            break;
                        }
                    }

                    if (valueFound)
                    {
                        ParseUfElementValue(node, ufElement, ufData);
                    }

                    continue;
                }

                bool nameMatches = UfHelpers.FindAttributeValue(attValue, ufElement.Name);

                // Search for a single attribute value ie hcard in class
                bool found = nameMatches && ufElement.CompoundName == string.Empty;

                // Search for a dual attribute value.
                // This is for compound structures, ie reviewer in hreview which is a hcard.
                // The compound attribute may be missing on this node, so guard against null.
                if (nameMatches && compoundAtt != null && UfHelpers.FindAttributeValue(compoundAtt.Value.ToLowerInvariant(), ufElement.CompoundName))
                {
                    found = true;
                }

                if (!found)
                {
                    continue;
                }

                if (HasCompound(baseNode, node.ParentNode, false) == false || ufElement.RootElement == true)
                {
                    // Include pattern: pull referenced same-page content into this node
                    AppendIncludeTargets(node, ".//a[@class[contains(.,'include')]]", "href");
                    AppendIncludeTargets(node, ".//object[@class[contains(.,'include')]]", "data");

                    // For TD
                    // Finds table head include pattern and appends node collection
                    if (node.Name == "td" && GetAttributeValue(node, "headers") != string.Empty)
                    {
                        AppendTableHeaderNodes(node);
                    }

                    // For TR
                    // Finds table head include pattern and appends node collection to each cell
                    if (node.Name == "tr")
                    {
                        foreach (HtmlNode child in node.ChildNodes)
                        {
                            if (child.Name == "td" && GetAttributeValue(child, "headers") != string.Empty)
                            {
                                AppendTableHeaderNodes(child);
                            }
                        }
                    }

                    // Adds the Html from which uf is parsed
                    foreach (HtmlNode childNode in node.ChildNodes)
                    {
                        ufData.OuterHtml += childNode.OuterHtml;
                    }

                    // Recursion. Dont add a second data node for a format description that
                    // does not support either multiples or concatenation of values
                    if (ufElement.Multiples || ufElement.ConcatenateValues || ufData.Nodes.Exists(ufElement.Name) == false)
                    {
                        ParseUfElementValue(node, ufElement, ufData);
                    }
                }
            }
        }

        // Appends to node a div-wrapped copy of the first element whose id is referenced
        // ("#id") by linkAttribute on each descendant selected by includeXPath.
        private void AppendIncludeTargets(HtmlNode node, string includeXPath, string linkAttribute)
        {
            HtmlNodeCollection includeRefNodes = node.SelectNodes(includeXPath);
            if (includeRefNodes == null)
            {
                return;
            }

            foreach (HtmlNode includeRefNode in includeRefNodes)
            {
                string link = GetAttributeValue(includeRefNode, linkAttribute);
                if (link.StartsWith("#"))
                {
                    link = link.Replace("#", "");
                    HtmlNodeCollection includeNodes = node.SelectNodes("//*[@id='" + link + "']");
                    if (includeNodes != null && includeNodes.Count > 0)
                    {
                        node.AppendChild(HtmlNode.CreateNode("<div>" + includeNodes[0].OuterHtml + "</div>"));
                    }
                }
            }
        }

        // Appends to a table cell a div-wrapped copy of every node whose id is listed in
        // the cell's space separated "headers" attribute.
        private void AppendTableHeaderNodes(HtmlNode cell)
        {
            string headers = GetAttributeValue(cell, "headers");

            foreach (string headerId in headers.Split(' '))
            {
                HtmlNodeCollection includeNodes = this.startNode.SelectNodes("//node()[@id='" + headerId.Trim() + "']");
                if (includeNodes != null && includeNodes.Count > 0)
                {
                    // Appends fresh node to avoid overload issues
                    foreach (HtmlNode childNode in includeNodes)
                    {
                        cell.AppendChild(HtmlNode.CreateNode("<div>" + childNode.OuterHtml + "</div>"));
                    }
                }
            }
        }