Example #1
0
        // Recognises chunks whose entire text starts with "<!-" and ends with "->" on a single line
        // (by default '.' does not match a newline), e.g. the malformed single-dash comment
        // "<!-Extra_Images->".  Declared readonly so the shared instance cannot be reassigned.
        static readonly Regex _shortHtmlComment = new Regex(@"^<!-.*->$");

        /// <summary>
        /// Converts one Majestic-12 tag chunk into an <see cref="XElement"/> that carries the
        /// cleaned-up tag name and the chunk's attributes.
        /// </summary>
        /// <param name="m12chunk">The parsed chunk; its sTag, sParams, sValues and iParams members are read.</param>
        /// <param name="originalHtml">Raw HTML text of the chunk; matched against the known "unparseable" patterns and passed to CleanupTagName.</param>
        /// <param name="xmlnsIndex">Counter handed by reference to TryCleanupAttributeName.</param>
        /// <returns>
        /// The element for the tag.  When the chunk has no tag name: a placeholder element
        /// ("doctype" or one of the "REMOVED_*" names) if a known construct is recognised,
        /// otherwise null, meaning the chunk should be discarded.
        /// </returns>
        /// <exception cref="Exception">The same attribute occurs more than once with different values.</exception>
        static XElement ParseTagNode(Majestic12.HTMLchunk m12chunk, string originalHtml, ref int xmlnsIndex)
        {

            if (string.IsNullOrEmpty(m12chunk.sTag))
            {

                // Ordinal, null-safe comparison: no culture-dependent lower-casing and no
                // NullReferenceException if the first parameter slot is empty.
                if (m12chunk.sParams.Length > 0
                    && string.Equals(m12chunk.sParams[0], "doctype", StringComparison.OrdinalIgnoreCase))
                    return new XElement("doctype");

                if (_weirdTag.IsMatch(originalHtml))
                    return new XElement("REMOVED_weirdBlockParenthesisTag");

                if (_aspnetPrecompiled.IsMatch(originalHtml))
                    return new XElement("REMOVED_ASPNET_PrecompiledDirective");

                if (_shortHtmlComment.IsMatch(originalHtml))
                    return new XElement("REMOVED_ShortHtmlComment");

                // Nodes like "<br <br>" will end up with a m12chunk.sTag==""...  We discard these nodes.
                return null;
            }

            string tagName = CleanupTagName(m12chunk.sTag, originalHtml);

            XElement result = new XElement(tagName);

            List<XAttribute> attributes = new List<XAttribute>();

            for (int i = 0; i < m12chunk.iParams; i++)
            {

                if (m12chunk.sParams[i] == "<!--")
                {

                    // An HTML comment was embedded within a tag.  Majestic-12 reports the comment
                    // and its contents as attributes, so skip forward to the closing marker.
                    // The test must look at the current parameter (not the tag name, which never
                    // changes inside this loop); otherwise every attribute after the comment
                    // would be thrown away as well.
                    for (; i < m12chunk.iParams; i++)
                    {

                        if (m12chunk.sParams[i] == "--" || m12chunk.sParams[i] == "-->")
                            break;
                    }

                    // The outer loop's i++ steps past the closing marker itself.
                    continue;
                }

                if (m12chunk.sParams[i] == "?" && string.IsNullOrEmpty(m12chunk.sValues[i]))
                    continue;

                string attributeName = m12chunk.sParams[i];

                if (!TryCleanupAttributeName(attributeName, ref xmlnsIndex, out attributeName))
                    continue;

                attributes.Add(new XAttribute(attributeName, m12chunk.sValues[i]));
            }

            // If attributes are duplicated with different values, we complain.
            // If attributes are duplicated with the same value, we remove all but 1.
            // The query is materialised up front because the loop below mutates "attributes".
            var duplicatedAttributes = attributes
                .GroupBy(A => A.Name)
                .Where(G => G.Count() > 1)
                .ToList();

            foreach (var duplicatedAttribute in duplicatedAttributes)
            {

                if (duplicatedAttribute.GroupBy(DA => DA.Value).Count() > 1)
                    throw new Exception("Attribute value was given different values");

                // Drop every copy, then re-append a single instance (it ends up last in the list).
                attributes.RemoveAll(A => A.Name == duplicatedAttribute.Key);
                attributes.Add(duplicatedAttribute.First());
            }

            result.Add(attributes);

            return result;
        }
Example #2
0
        /// <summary>
        /// Decides which element a newly opened tag should be attached to.  For "form", "td", "tr",
        /// "thead", "tbody" and "tfoot" an already-open element of the same name is treated as
        /// implicitly closed, so the new node becomes its sibling rather than its descendant.
        /// </summary>
        /// <param name="m12chunk">The chunk for the new tag; only its sTag is read (via CleanupTagName).</param>
        /// <param name="originalHtml">Raw HTML text of the chunk, forwarded to CleanupTagName.</param>
        /// <param name="nextPotentialParent">The currently open element; the upward search starts here.</param>
        /// <returns>
        /// The parent of the nearest same-named open element found before a limiting element is
        /// reached; otherwise <paramref name="nextPotentialParent"/>.
        /// </returns>
        static XElement FindParentOfNewNode(Majestic12.HTMLchunk m12chunk, string originalHtml, XElement nextPotentialParent)
        {
            string newTag = CleanupTagName(m12chunk.sTag, originalHtml);

            // Names at which the upward walk gives up: a same-named element beyond one of these
            // belongs to an outer structure and must not be closed by the new tag.
            XName[] searchLimits;

            switch (newTag)
            {
                case "form":
                    // No limit: look all the way up to the root.
                    searchLimits = new XName[0];
                    break;

                case "td":
                    searchLimits = new XName[] { "tr" };
                    break;

                case "tr":
                    searchLimits = new XName[] { "table", "thead", "tbody", "tfoot" };
                    break;

                case "thead":
                case "tbody":
                case "tfoot":
                    searchLimits = new XName[] { "table" };
                    break;

                default:
                    // Any other tag never implies that an earlier tag was closed.
                    return nextPotentialParent;
            }

            XName newTagName = newTag;

            // Walk from the current element towards the root.
            for (XElement current = nextPotentialParent; current != null; current = current.Parent)
            {
                if (searchLimits.Contains(current.Name))
                    break;

                if (current.Name == newTagName)
                {
                    // A same-named root element has no parent; fall back to the current element.
                    return current.Parent ?? nextPotentialParent;
                }
            }

            return nextPotentialParent;
        }