/// <summary>
        /// Measures how much of an element's text belongs to links: the summed inner-text length of
        /// every element returned by <c>GetElementsByTagName("a")</c> divided by the inner-text length
        /// of <paramref name="element" /> itself.
        /// </summary>
        /// <param name="element">Element whose link density is measured.</param>
        /// <returns>The ratio described above, or 0 when the element has no inner text.</returns>
        internal float GetLinksDensity(XElement element)
        {
            int totalTextLength = GetInnerText(element).Length;

            // Nothing to measure; also keeps the division below away from zero.
            if (totalTextLength == 0)
            {
                return 0.0f;
            }

            int anchorTextLength = 0;

            foreach (var anchor in element.GetElementsByTagName("a"))
            {
                // Accumulate in a checked context so an overflow throws instead of wrapping.
                anchorTextLength = checked(anchorTextLength + GetInnerText(anchor).Length);
            }

            return (float)anchorTextLength / totalTextLength;
        }
        /// <summary>
        /// Removes spurious h1-h6 headers found under <paramref name="element" />. A header is dropped
        /// when its class weight is negative or its link density exceeds <c>_MaxHeaderLinksDensity</c>.
        /// </summary>
        /// <param name="element">Element whose headers are inspected.</param>
        internal void CleanHeaders(XElement element)
        {
            var rejectedHeaders = new List<XElement>();
            int level = 1;

            while (level <= 6)
            {
                // Gather the offenders for this level; the actual removal happens once, after all levels.
                rejectedHeaders.AddRange(
                    element.GetElementsByTagName("h" + level)
                           .Where(header => GetClassWeight(header) < 0
                                         || GetLinksDensity(header) > _MaxHeaderLinksDensity));

                level++;
            }

            RemoveElements(rejectedHeaders);
        }
        /// <summary>
        /// Looks for a paging link inside the document: every anchor of <paramref name="body" /> is
        /// filtered, then scored on its text, class/id, ancestors and href, and the best candidate wins.
        /// </summary>
        /// <param name="body">Content body</param>
        /// <param name="url">Url of document</param>
        /// <returns>
        /// The href of the highest-scoring candidate (score of at least 50) resolved against the article
        /// base url, or <c>null</c> when no candidate reaches that score.
        /// </returns>
        internal string FindNextPageLink(XElement body, string url)
        {
            var possiblePages = new Dictionary<string, LinkData>();
            var allLinks = body.GetElementsByTagName("a");
            var articleBaseUrl = FindBaseUrl(url);

            // Parsed on first use (the first absolute link) and reused afterwards instead of
            // re-parsing the same base url for every link.
            Uri articleBaseUri = null;

            /* Loop through all links, looking for hints that they may be next-page links.
             * Things like having "page" in their textContent, className or id, or being a child
             * of a node with a page-y className or id.
             * After we do that, assign each page a score.
             */
            foreach (var link in allLinks)
            {
                string linkHref = (string)link.Attribute("href");

                if (string.IsNullOrEmpty(linkHref) || _MailtoHrefRegex.IsMatch(linkHref))
                {
                    continue;
                }

                // Strip the fragment and a single trailing slash so equivalent hrefs share one entry.
                linkHref = Regex.Replace(linkHref, "#.*$", "");
                linkHref = Regex.Replace(linkHref, "/$", "");

                /* Ignore links that point back at the current page. */
                // This leaves out an already-checked page check, because
                // the web transcoder is separate from the original transcoder
                if (linkHref == "" || linkHref == articleBaseUrl || linkHref == url)
                {
                    continue;
                }

                /* If it's on a different domain, skip it. */
                Uri linkHrefUri;
                if (Uri.TryCreate(linkHref, UriKind.Absolute, out linkHrefUri))
                {
                    if (articleBaseUri == null)
                    {
                        articleBaseUri = new Uri(articleBaseUrl);
                    }

                    if (linkHrefUri.Host != articleBaseUri.Host)
                    {
                        continue;
                    }
                }

                string linkText = GetInnerText(link);

                /* If the linktext looks like it's not the next page, then skip it */
                if (_Extraneous.IsMatch(linkText) || linkText.Length > 25)
                {
                    continue;
                }

                /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
                string linkHrefLeftover = linkHref.Replace(articleBaseUrl, "");
                if (!Regex.IsMatch(linkHrefLeftover, @"\d"))
                {
                    continue;
                }

                // One lookup: either extend the text of a known candidate or register a new one.
                // Scores below are added on every occurrence of the same href.
                LinkData linkObj;
                if (possiblePages.TryGetValue(linkHref, out linkObj))
                {
                    linkObj.LinkText += " | " + linkText;
                }
                else
                {
                    linkObj = new LinkData { Score = 0, LinkHref = linkHref, LinkText = linkText };
                    possiblePages[linkHref] = linkObj;
                }

                /*
                 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
                 * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
                 */
                // string.Contains is an ordinal search, which is what URL text needs.
                if (!linkHref.Contains(articleBaseUrl))
                {
                    linkObj.Score -= 25;
                }

                string linkData = linkText + " " + link.GetClass() + " " + link.GetId();

                if (_NextLink.IsMatch(linkData))
                {
                    linkObj.Score += 50;
                }

                if (Regex.IsMatch(linkData, "pag(e|ing|inat)", RegexOptions.IgnoreCase))
                {
                    linkObj.Score += 25;
                }

                /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
                /* -65 is enough to negate any bonuses gotten from a > or » in the text */
                if (Regex.IsMatch(linkData, "(first|last)", RegexOptions.IgnoreCase)
                    && !_NextLink.IsMatch(linkObj.LinkText))
                {
                    linkObj.Score -= 65;
                }

                if (_NegativeWeightRegex.IsMatch(linkData) || _Extraneous.IsMatch(linkData))
                {
                    linkObj.Score -= 50;
                }

                if (_PrevLink.IsMatch(linkData))
                {
                    linkObj.Score -= 200;
                }

                /* If any ancestor node contains page or paging or paginat */
                // Each of the two adjustments is applied at most once per link occurrence.
                bool positiveNodeMatch = false;
                bool negativeNodeMatch = false;

                for (var parentNode = link.Parent; parentNode != null; parentNode = parentNode.Parent)
                {
                    string parentNodeClassAndId = parentNode.GetClass() + " " + parentNode.GetId();

                    if (!positiveNodeMatch
                        && Regex.IsMatch(parentNodeClassAndId, "pag(e|ing|inat)", RegexOptions.IgnoreCase))
                    {
                        positiveNodeMatch = true;
                        linkObj.Score += 25;
                    }

                    // A negative-looking ancestor only counts when it does not also look positive.
                    if (!negativeNodeMatch
                        && _NegativeWeightRegex.IsMatch(parentNodeClassAndId)
                        && !_PositiveWeightRegex.IsMatch(parentNodeClassAndId))
                    {
                        linkObj.Score -= 25;
                        negativeNodeMatch = true;
                    }
                }

                /*
                 * If the URL looks like it has paging in it, add to the score.
                 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
                 */
                if (Regex.IsMatch(linkHref, @"p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}", RegexOptions.IgnoreCase)
                    || Regex.IsMatch(linkHref, @"(page|paging)", RegexOptions.IgnoreCase))
                {
                    linkObj.Score += 25;
                }

                /* If the URL contains negative values, give a slight decrease. */
                if (_Extraneous.IsMatch(linkHref))
                {
                    linkObj.Score -= 15;
                }

                /*
                 * If the link text can be parsed as a number, give it a minor bonus, with a slight
                 * bias towards lower numbered pages. This is so that pages that might not have 'next'
                 * in their text can still get scored, and sorted properly by score.
                 */
                int linkTextAsNumber;
                if (int.TryParse(linkText, out linkTextAsNumber))
                {
                    /* Punish 1 since we're either already there, or it's probably before what we want anyways. */
                    if (linkTextAsNumber == 1)
                    {
                        linkObj.Score -= 10;
                    }
                    else
                    {
                        linkObj.Score += Math.Max(0, 10 - linkTextAsNumber);
                    }
                }
            }

            /*
             * Loop through all of our possible pages from above and find our top candidate for the next page URL.
             * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
             */
            LinkData topPage = null;
            foreach (var page in possiblePages.Values)
            {
                // Strict comparison: on a tie the candidate enumerated first is kept.
                if (page.Score >= 50 && (topPage == null || topPage.Score < page.Score))
                {
                    topPage = page;
                }
            }

            if (topPage == null)
            {
                return null;
            }

            string nextHref = Regex.Replace(topPage.LinkHref, @"\/$", "");
            var nextHrefUri = new Uri(articleBaseUri ?? new Uri(articleBaseUrl), nextHref);

            return nextHrefUri.ToString();
        }
        /// <summary>
        /// Removes every element named <paramref name="elementName" /> found under
        /// <paramref name="rootElement" />. When the name is "object" or "embed" (compared
        /// case-insensitively), an element is spared if the video regex matches its attributes string
        /// or its inner html.
        /// </summary>
        /// <param name="rootElement">Element to clean.</param>
        /// <param name="elementName">Tag name of the elements to remove.</param>
        internal void Clean(XElement rootElement, string elementName)
        {
            var candidates = rootElement.GetElementsByTagName(elementName);

            bool mayHoldVideo =
                string.Equals("object", elementName, StringComparison.OrdinalIgnoreCase)
                || string.Equals("embed", elementName, StringComparison.OrdinalIgnoreCase);

            // Keep only the elements that are NOT a recognised video embed; those are the ones to drop.
            var doomed = candidates
                .Where(candidate => !(mayHoldVideo
                                      && (_VideoRegex.IsMatch(candidate.GetAttributesString("|"))
                                          || _VideoRegex.IsMatch(candidate.GetInnerHtml()))))
                .ToList();

            RemoveElements(doomed);
        }
        /// <summary>
        /// Cleans a <paramref name="rootElement" /> of all elements with name <paramref name="elementName" /> if they look fishy.
        /// An element is removed outright when its class weight plus element score is negative. Otherwise,
        /// when its text has fewer comma-separated segments than the configured minimum, it is removed if
        /// any of the image / list-item / input / text-length / link-density / embed checks below fires.
        /// </summary>
        /// <param name="rootElement">Element to clean.</param>
        /// <param name="elementName">Tag name of the elements to examine.</param>
        /// <exception cref="ArgumentNullException"><paramref name="elementName" /> is null.</exception>
        internal void CleanConditionally(XElement rootElement, string elementName)
        {
            if (elementName == null)
            {
                throw new ArgumentNullException("elementName");
            }

            // Loop-invariant, so computed once; ordinal comparison keeps the tag check independent
            // of the current culture.
            string trimmedElementName = elementName.Trim();
            bool isListElement =
                string.Equals(trimmedElementName, "ul", StringComparison.OrdinalIgnoreCase)
                || string.Equals(trimmedElementName, "ol", StringComparison.OrdinalIgnoreCase);

            var elements = rootElement.GetElementsByTagName(elementName);
            var elementsToRemove = new List<XElement>();

            foreach (var element in elements)
            {
                int weight = GetClassWeight(element);
                float score = GetElementScore(element);

                if (weight + score < 0.0f)
                {
                    elementsToRemove.Add(element);
                    continue;
                }

                /* If there are not very many commas and the number of non-paragraph elements
                 * is more than paragraphs or other ominous signs, remove the element. */

                string elementInnerText = GetInnerText(element);

                if (GetSegmentsCount(elementInnerText, ',') >= _MinCommaSegments)
                {
                    // Enough comma segments: the element is kept without further checks.
                    continue;
                }

                int psCount = element.GetElementsByTagName("p").Count();
                int imgsCount = element.GetElementsByTagName("img").Count();
                int lisCount = element.GetElementsByTagName("li").Count();
                int inputsCount = element.GetElementsByTagName("input").Count();

                // while counting embeds we omit video-embeds
                int embedsCount =
                    element.GetElementsByTagName("embed")
                           .Count(embedElement => !_VideoRegex.IsMatch(embedElement.GetAttributeValue("src", "")));

                float linksDensity = GetLinksDensity(element);
                int innerTextLength = elementInnerText.Length;

                bool remove = (imgsCount > psCount)
                           // list items are expected inside ul/ol, so that check is skipped for them
                           || (lisCount - _LisCountTreshold > psCount && !isListElement)
                           // integer division is intentional here
                           || (inputsCount > psCount / 3)
                           || (innerTextLength < _MinInnerTextLength && (imgsCount == 0 || imgsCount > _MaxImagesInShortSegmentsCount))
                           || (weight < _ClassWeightTreshold && linksDensity > _MaxDensityForElementsWithSmallerClassWeight)
                           || (weight >= _ClassWeightTreshold && linksDensity > _MaxDensityForElementsWithGreaterClassWeight)
                           || (embedsCount > _MaxEmbedsCount || (embedsCount == _MaxEmbedsCount && innerTextLength < _MinInnerTextLengthInElementsWithEmbed));

                if (remove)
                {
                    elementsToRemove.Add(element);
                }
            } /* end foreach */

            RemoveElements(elementsToRemove);
        }
        /**
        * Processes a phone number description element from the XML file and returns it as a
        * PhoneNumberDesc.
        *
        * When the country element has no child named numberType and IsValidNumberType(numberType)
        * is false, the returned description carries "NA" for both the national and possible number
        * patterns. In every other case the result is first merged from the general description, and
        * then the possible number pattern, the national number pattern and (unless liteBuild is set)
        * the example number found in the first matching element replace the merged values.
        *
        * @param generalDesc  a generic phone number description that will be used to fill in missing
        *                     parts of the description; null is treated as an empty description
        * @param countryElement  the XML element representing all the country information
        * @param numberType  the name of the number type, corresponding to the appropriate tag in the XML
        *                    file with information about that type
        * @param liteBuild  when true, the example number is not copied into the result
        * @return  complete description of that phone number type
        */
        public static PhoneNumberDesc ProcessPhoneNumberDescElement(PhoneNumberDesc generalDesc,
            XElement countryElement, String numberType, bool liteBuild) {
            if (generalDesc == null)
                generalDesc = new PhoneNumberDesc.Builder().Build();

            // Only the first matching element is ever used, so fetch it once instead of
            // enumerating the query separately for each Count()/First() call.
            XElement element = countryElement.GetElementsByTagName(numberType).FirstOrDefault();
            var numberDesc = new PhoneNumberDesc.Builder();
            if (element == null && !IsValidNumberType(numberType)) {
                numberDesc.SetNationalNumberPattern("NA");
                numberDesc.SetPossibleNumberPattern("NA");
                return numberDesc.Build();
            }

            numberDesc.MergeFrom(generalDesc);
            if (element == null)
                return numberDesc.Build();

            var possiblePattern = element.GetElementsByTagName(POSSIBLE_NUMBER_PATTERN).FirstOrDefault();
            if (possiblePattern != null)
                numberDesc.SetPossibleNumberPattern(ValidateRE(possiblePattern.Value, true));

            var validPattern = element.GetElementsByTagName(NATIONAL_NUMBER_PATTERN).FirstOrDefault();
            if (validPattern != null)
                numberDesc.SetNationalNumberPattern(ValidateRE(validPattern.Value, true));

            if (!liteBuild) {
                var exampleNumber = element.GetElementsByTagName(EXAMPLE_NUMBER).FirstOrDefault();
                if (exampleNumber != null)
                    numberDesc.SetExampleNumber(exampleNumber.Value);
            }
            return numberDesc.Build();
        }
        /// <summary>
        /// Tidies the extracted article content in place: cleans styles and breaks, removes junk
        /// elements and spurious headers, conditionally removes tables, lists and divs, drops empty
        /// paragraphs and finally rewrites the inner html through the break-before-paragraph regex.
        /// </summary>
        /// <param name="articleContentElement">Root element of the article content.</param>
        internal void PrepareArticleContentElement(XElement articleContentElement)
        {
            CleanStyles(articleContentElement);
            KillBreaks(articleContentElement);

            /* Clean out junk from the article content. */
            foreach (string junkTag in new[] { "form", "object", "h1" })
            {
                Clean(articleContentElement, junkTag);
            }

            /* A lone h2 is probably being used as the header rather than a subheader,
             * and we already have a header, so it goes too. */
            bool hasSingleH2 = articleContentElement.GetElementsByTagName("h2").Count() == 1;
            if (hasSingleH2)
            {
                Clean(articleContentElement, "h2");
            }

            Clean(articleContentElement, "iframe");
            CleanHeaders(articleContentElement);

            /* These run last because the removals above may change what they see. */
            foreach (string conditionalTag in new[] { "table", "ul", "div" })
            {
                CleanConditionally(articleContentElement, conditionalTag);
            }

            /* Remove extra paragraphs: those with empty inner text and no images, embeds or objects. */
            var emptyParagraphs = articleContentElement.GetElementsByTagName("p")
                .Where(paragraph => GetInnerText(paragraph, false).Length == 0
                                 && !paragraph.GetElementsByTagName("img").Any()
                                 && !paragraph.GetElementsByTagName("embed").Any()
                                 && !paragraph.GetElementsByTagName("object").Any())
                .ToList();

            RemoveElements(emptyParagraphs);

            /* Remove br's that are directly before paragraphs. */
            string currentHtml = articleContentElement.GetInnerHtml();
            articleContentElement.SetInnerHtml(_BreakBeforeParagraphRegex.Replace(currentHtml, "<p"));
        }
 /**
  * Adds one leading-digits pattern to the given format builder for every LEADING_DIGITS element
  * found under the number format element; each element's value is passed through ValidateRE first.
  */
 public static void SetLeadingDigitsPatterns(XElement numberFormatElement, NumberFormat.Builder format) {
     var leadingDigitsElements = numberFormatElement.GetElementsByTagName(LEADING_DIGITS);
     foreach (XElement leadingDigitsElement in leadingDigitsElements) {
         var validatedPattern = ValidateRE(leadingDigitsElement.Value, true);
         format.AddLeadingDigitsPattern(validatedPattern);
     }
 }
        /**
        *  Extracts the available formats from the provided DOM element. If it does not contain any
        *  nationalPrefixFormattingRule, the one passed-in is retained. The nationalPrefix,
        *  nationalPrefixFormattingRule and nationalPrefixOptionalWhenFormatting values are provided from
        *  the parent (territory) element.
        *
        *  Every NUMBER_FORMAT element becomes one national number format on the metadata builder. A
        *  carrier code formatting rule set on the format element itself wins over the one set on the
        *  provided element. Nothing is touched when there are no NUMBER_FORMAT elements; otherwise the
        *  international formats are cleared again unless at least one was explicitly defined.
        */
        // @VisibleForTesting
        public static void LoadAvailableFormats(PhoneMetadata.Builder metadata,
                                         XElement element, String nationalPrefix,
                                         String nationalPrefixFormattingRule,
                                         bool nationalPrefixOptionalWhenFormatting) {
            String carrierCodeFormattingRule = "";
            if (element.HasAttribute(CARRIER_CODE_FORMATTING_RULE)) {
                carrierCodeFormattingRule = ValidateRE(
                    GetDomesticCarrierCodeFormattingRuleFromElement(element, nationalPrefix));
            }

            // Materialised once so the query is not run a second time just to count it.
            var numberFormatElements = element.GetElementsByTagName(NUMBER_FORMAT).ToList();
            if (numberFormatElements.Count == 0) {
                return;
            }

            bool hasExplicitIntlFormatDefined = false;
            foreach (XElement numberFormatElement in numberFormatElements) {
                var format = new NumberFormat.Builder();

                if (numberFormatElement.HasAttribute(NATIONAL_PREFIX_FORMATTING_RULE)) {
                    format.SetNationalPrefixFormattingRule(
                        GetNationalPrefixFormattingRuleFromElement(numberFormatElement, nationalPrefix));
                    format.SetNationalPrefixOptionalWhenFormatting(
                        numberFormatElement.HasAttribute(NATIONAL_PREFIX_OPTIONAL_WHEN_FORMATTING));
                } else {
                    format.SetNationalPrefixFormattingRule(nationalPrefixFormattingRule);
                    format.SetNationalPrefixOptionalWhenFormatting(nationalPrefixOptionalWhenFormatting);
                }

                // Same attribute constant as the territory-level check above, rather than a
                // duplicated string literal.
                if (numberFormatElement.HasAttribute(CARRIER_CODE_FORMATTING_RULE)) {
                    format.SetDomesticCarrierCodeFormattingRule(ValidateRE(
                        GetDomesticCarrierCodeFormattingRuleFromElement(
                            numberFormatElement, nationalPrefix)));
                } else {
                    format.SetDomesticCarrierCodeFormattingRule(carrierCodeFormattingRule);
                }

                // Extract the pattern for the national format.
                String nationalFormat =
                    LoadNationalFormat(metadata, numberFormatElement, format);
                metadata.AddNumberFormat(format);

                if (LoadInternationalFormat(metadata, numberFormatElement, nationalFormat)) {
                    hasExplicitIntlFormatDefined = true;
                }
            }

            // Only a small number of regions need to specify the intlFormats in the xml. For the majority
            // of countries the intlNumberFormat metadata is an exact copy of the national NumberFormat
            // metadata. To minimize the size of the metadata file, we only keep intlNumberFormats that
            // actually differ in some way to the national formats.
            if (!hasExplicitIntlFormatDefined) {
                metadata.ClearIntlNumberFormat();
            }
        }
        /**
         * Extracts the pattern for the national format: sets the leading digits patterns and the
         * validated PATTERN attribute on the format builder, then copies the value of the single
         * FORMAT child element into it.
         *
         * @throws  Exception if the number format element does not have exactly one FORMAT child.
         * @return  the national format string.
         */
        // @VisibleForTesting
        public static String LoadNationalFormat(PhoneMetadata.Builder metadata, XElement numberFormatElement,
                                         NumberFormat.Builder format) {
            SetLeadingDigitsPatterns(numberFormatElement, format);

            var validatedPattern = ValidateRE(numberFormatElement.GetAttribute(PATTERN));
            format.SetPattern(validatedPattern);

            var formatElements = numberFormatElement.GetElementsByTagName(FORMAT).ToList();
            if (formatElements.Count == 1) {
                String nationalFormat = formatElements[0].Value;
                format.SetFormat(nationalFormat);
                return nationalFormat;
            }

            // Only one format pattern for a numberFormat element should be defined.
            throw new Exception("Invalid number of format patterns for country: " +
                                metadata.Id);
        }
        /**
        * Extracts the pattern for international format. Without an INTL_FORMAT child the national
        * format is used; with one, its value is used unless it is "NA", in which case no format is
        * set. The international format is added to the metadata builder only when a format was set.
        *
        * @throws  Exception if more than one INTL_FORMAT child has been encountered.
        * @return  whether an international number format is explicitly defined (including "NA").
        */
        // @VisibleForTesting
        public static bool LoadInternationalFormat(PhoneMetadata.Builder metadata,
            XElement numberFormatElement,
            String nationalFormat) {
            var intlFormat = new NumberFormat.Builder();
            SetLeadingDigitsPatterns(numberFormatElement, intlFormat);
            intlFormat.SetPattern(numberFormatElement.GetAttribute(PATTERN));

            var intlFormatElements = numberFormatElement.GetElementsByTagName(INTL_FORMAT).ToList();
            if (intlFormatElements.Count > 1) {
                // A maximum of one intlFormat pattern for a numberFormat element should be defined.
                throw new Exception("Invalid number of intlFormat patterns for country: " +
                                    metadata.Id);
            }

            bool hasExplicitIntlFormatDefined = intlFormatElements.Count == 1;
            if (hasExplicitIntlFormatDefined) {
                String explicitFormat = intlFormatElements[0].Value;
                if (!explicitFormat.Equals("NA")) {
                    intlFormat.SetFormat(explicitFormat);
                }
            } else {
                // Default to use the same as the national pattern if none is defined.
                intlFormat.SetFormat(nationalFormat);
            }

            if (intlFormat.HasFormat) {
                metadata.AddIntlNumberFormat(intlFormat);
            }
            return hasExplicitIntlFormatDefined;
        }
Example #12
0
using System;