/// <summary>
/// Computes how much of the inner text of <paramref name="element" /> sits inside anchor ("a") elements.
/// </summary>
/// <param name="element">Element to measure.</param>
/// <returns>
/// Total inner-text length of the anchors divided by the inner-text length of the element,
/// or 0 when the element has no inner text.
/// </returns>
internal float GetLinksDensity(XElement element)
{
    int totalTextLength = GetInnerText(element).Length;

    // No text at all: report zero density instead of dividing by zero.
    if (totalTextLength == 0)
    {
        return 0.0f;
    }

    int anchorTextLength = 0;

    foreach (var anchor in element.GetElementsByTagName("a"))
    {
        // checked: keep the overflow behaviour of an Enumerable.Sum over ints.
        anchorTextLength = checked(anchorTextLength + GetInnerText(anchor).Length);
    }

    return (float)anchorTextLength / totalTextLength;
}
/// <summary>
/// Removes spurious headers (h1 through h6) found under <paramref name="element" />.
/// A header is removed when its class weight is negative or when its links density
/// exceeds <c>_MaxHeaderLinksDensity</c>.
/// </summary>
internal void CleanHeaders(XElement element)
{
    // Collect first, remove afterwards, so the tree is not modified while it is being walked.
    var spuriousHeaders = Enumerable.Range(1, 6)
        .SelectMany(level => element.GetElementsByTagName("h" + level))
        .Where(header => GetClassWeight(header) < 0
                         || GetLinksDensity(header) > _MaxHeaderLinksDensity)
        .ToList();

    RemoveElements(spuriousHeaders);
}
/// <summary>
/// Looks for any paging links that may occur within the document.
/// </summary>
/// <param name="body">Content body</param>
/// <param name="url">Url of document</param>
/// <returns>
/// The best-scoring candidate link (score of at least 50) resolved against the article base URL,
/// or null when no link qualifies.
/// </returns>
internal string FindNextPageLink(XElement body, string url)
{
    var possiblePages = new Dictionary<string, LinkData>();
    var allLinks = body.GetElementsByTagName("a");
    var articleBaseUrl = FindBaseUrl(url);

    // Parsed on first use and then reused, instead of re-parsing the base URL for every absolute link.
    // It stays lazy so that a base URL which cannot be parsed only fails when it is actually needed.
    Uri articleBaseUri = null;

    /* Loop through all links, looking for hints that they may be next-page links.
     * Things like having "page" in their textContent, className or id, or being a child
     * of a node with a page-y className or id.
     * After we do that, assign each page a score.
     */
    foreach (var link in allLinks)
    {
        string linkHref = (string)link.Attribute("href");

        if (string.IsNullOrEmpty(linkHref))
        {
            continue;
        }

        if (_MailtoHrefRegex.IsMatch(linkHref))
        {
            continue;
        }

        // Drop the fragment and a single trailing slash so equivalent links compare equal.
        linkHref = Regex.Replace(linkHref, "#.*$", "");
        linkHref = Regex.Replace(linkHref, "/$", "");

        /* If we've already seen this page, then ignore it. */
        // This leaves out an already-checked page check, because
        // the web transcoder is separate from the original transcoder
        if (linkHref == "" || linkHref == articleBaseUrl || linkHref == url)
        {
            continue;
        }

        /* If it's on a different domain, skip it. */
        Uri linkHrefUri;

        if (Uri.TryCreate(linkHref, UriKind.Absolute, out linkHrefUri))
        {
            if (articleBaseUri == null)
            {
                articleBaseUri = new Uri(articleBaseUrl);
            }

            if (linkHrefUri.Host != articleBaseUri.Host)
            {
                continue;
            }
        }

        string linkText = GetInnerText(link);

        /* If the linktext looks like it's not the next page, then skip it */
        if (_Extraneous.IsMatch(linkText) || linkText.Length > 25)
        {
            continue;
        }

        /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
        string linkHrefLeftover = linkHref.Replace(articleBaseUrl, "");

        if (!Regex.IsMatch(linkHrefLeftover, @"\d"))
        {
            continue;
        }

        // Single dictionary lookup: reuse the entry when the href was seen before, otherwise create it.
        LinkData linkObj;

        if (possiblePages.TryGetValue(linkHref, out linkObj))
        {
            linkObj.LinkText += " | " + linkText;
        }
        else
        {
            linkObj = new LinkData { Score = 0, LinkHref = linkHref, LinkText = linkText };
            possiblePages[linkHref] = linkObj;
        }

        /*
         * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
         * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
         */
        // Ordinal search: URLs are identifiers, and this keeps the check consistent with the ordinal Replace above.
        if (linkHref.IndexOf(articleBaseUrl, StringComparison.Ordinal) == -1)
        {
            linkObj.Score -= 25;
        }

        string linkData = linkText + " " + link.GetClass() + " " + link.GetId();

        if (_NextLink.IsMatch(linkData))
        {
            linkObj.Score += 50;
        }

        if (Regex.IsMatch(linkData, "pag(e|ing|inat)", RegexOptions.IgnoreCase))
        {
            linkObj.Score += 25;
        }

        /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
        /* -65 is enough to negate any bonuses gotten from a > or » in the text */
        if (Regex.IsMatch(linkData, "(first|last)", RegexOptions.IgnoreCase)
            && !_NextLink.IsMatch(linkObj.LinkText))
        {
            linkObj.Score -= 65;
        }

        if (_NegativeWeightRegex.IsMatch(linkData) || _Extraneous.IsMatch(linkData))
        {
            linkObj.Score -= 50;
        }

        if (_PrevLink.IsMatch(linkData))
        {
            linkObj.Score -= 200;
        }

        /* If any ancestor node contains page or paging or paginat */
        var parentNode = link.Parent;
        bool positiveNodeMatch = false;
        bool negativeNodeMatch = false;

        while (parentNode != null)
        {
            string parentNodeClassAndId = parentNode.GetClass() + " " + parentNode.GetId();

            // Each of the two ancestor adjustments is applied at most once per link.
            if (!positiveNodeMatch && Regex.IsMatch(parentNodeClassAndId, "pag(e|ing|inat)", RegexOptions.IgnoreCase))
            {
                positiveNodeMatch = true;
                linkObj.Score += 25;
            }

            if (!negativeNodeMatch && _NegativeWeightRegex.IsMatch(parentNodeClassAndId))
            {
                if (!_PositiveWeightRegex.IsMatch(parentNodeClassAndId))
                {
                    linkObj.Score -= 25;
                    negativeNodeMatch = true;
                }
            }

            parentNode = parentNode.Parent;
        }

        /*
         * If the URL looks like it has paging in it, add to the score.
         * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
         */
        if (Regex.IsMatch(linkHref, @"p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}", RegexOptions.IgnoreCase)
            || Regex.IsMatch(linkHref, @"(page|paging)", RegexOptions.IgnoreCase))
        {
            linkObj.Score += 25;
        }

        /* If the URL contains negative values, give a slight decrease. */
        if (_Extraneous.IsMatch(linkHref))
        {
            linkObj.Score -= 15;
        }

        /*
         * If the link text can be parsed as a number, give it a minor bonus, with a slight
         * bias towards lower numbered pages. This is so that pages that might not have 'next'
         * in their text can still get scored, and sorted properly by score.
         */
        int linkTextAsNumber;

        if (int.TryParse(linkText, out linkTextAsNumber))
        {
            /* Punish 1 since we're either already there, or it's probably before what we want anyways. */
            if (linkTextAsNumber == 1)
            {
                linkObj.Score -= 10;
            }
            else
            {
                linkObj.Score += Math.Max(0, 10 - linkTextAsNumber);
            }
        }
    }

    /*
     * Loop through all of our possible pages from above and find our top candidate for the next page URL.
     * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
     */
    LinkData topPage = null;

    foreach (var candidate in possiblePages.Values)
    {
        // Strict comparison: on a tie the earlier candidate wins.
        if (candidate.Score >= 50 && (topPage == null || topPage.Score < candidate.Score))
        {
            topPage = candidate;
        }
    }

    if (topPage == null)
    {
        return null;
    }

    string nextHref = Regex.Replace(topPage.LinkHref, @"\/$", "");

    if (articleBaseUri == null)
    {
        articleBaseUri = new Uri(articleBaseUrl);
    }

    // Resolving against the base URI turns relative hrefs into absolute ones.
    var nextHrefUri = new Uri(articleBaseUri, nextHref);

    return nextHrefUri.ToString();
}
/// <summary>
/// Removes from <paramref name="rootElement" /> all elements named <paramref name="elementName" />.
/// When the name is "object" or "embed" (compared case-insensitively), elements whose attributes
/// or inner HTML match <c>_VideoRegex</c> are kept, since people usually want to see those videos.
/// </summary>
internal void Clean(XElement rootElement, string elementName)
{
    var candidates = rootElement.GetElementsByTagName(elementName);

    bool mayHoldVideo =
        "object".Equals(elementName, StringComparison.OrdinalIgnoreCase)
        || "embed".Equals(elementName, StringComparison.OrdinalIgnoreCase);

    // The video checks only run for object/embed elements; everything else is always removed.
    var doomed = candidates
        .Where(candidate => !mayHoldVideo
                            || !(_VideoRegex.IsMatch(candidate.GetAttributesString("|"))
                                 || _VideoRegex.IsMatch(candidate.GetInnerHtml())))
        .ToList();

    RemoveElements(doomed);
}
/// <summary>
/// Cleans a <paramref name="rootElement" /> of all elements with name <paramref name="elementName" /> if they look fishy.
/// "Fishy" is an algorithm based on content length, classnames, link density, number of images and embeds, etc.
/// </summary>
/// <param name="rootElement">Element whose matching elements are examined.</param>
/// <param name="elementName">Name of the elements to examine.</param>
/// <exception cref="ArgumentNullException"><paramref name="elementName" /> is null.</exception>
internal void CleanConditionally(XElement rootElement, string elementName)
{
    if (elementName == null)
    {
        throw new ArgumentNullException("elementName");
    }

    // The tag name never changes during the loop, so classify it once. The comparison is ordinal
    // and case-insensitive because tag names are identifiers, not culture-dependent text.
    string trimmedElementName = elementName.Trim();
    bool isListElement =
        "ul".Equals(trimmedElementName, StringComparison.OrdinalIgnoreCase)
        || "ol".Equals(trimmedElementName, StringComparison.OrdinalIgnoreCase);

    var elements = rootElement.GetElementsByTagName(elementName);
    var elementsToRemove = new List<XElement>();

    foreach (var element in elements)
    {
        int weight = GetClassWeight(element);
        float score = GetElementScore(element);

        // A negative combined weight and score is enough to drop the element outright.
        if (weight + score < 0.0f)
        {
            elementsToRemove.Add(element);
            continue;
        }

        /* If there are not very many commas and the number of non-paragraph elements
         * is more than paragraphs or other ominous signs, remove the element. */
        string elementInnerText = GetInnerText(element);

        if (GetSegmentsCount(elementInnerText, ',') < _MinCommaSegments)
        {
            int psCount = element.GetElementsByTagName("p").Count();
            int imgsCount = element.GetElementsByTagName("img").Count();
            int lisCount = element.GetElementsByTagName("li").Count();
            int inputsCount = element.GetElementsByTagName("input").Count();

            // while counting embeds we omit video-embeds
            int embedsCount = element.GetElementsByTagName("embed")
                .Count(embedElement => !_VideoRegex.IsMatch(embedElement.GetAttributeValue("src", "")));

            float linksDensity = GetLinksDensity(element);
            int innerTextLength = elementInnerText.Length;

            bool remove =
                (imgsCount > psCount)
                // Many list items are expected when the element being cleaned is itself a list.
                || (lisCount - _LisCountTreshold > psCount && !isListElement)
                // Integer division is intentional here.
                || (inputsCount > psCount / 3)
                || (innerTextLength < _MinInnerTextLength
                    && (imgsCount == 0 || imgsCount > _MaxImagesInShortSegmentsCount))
                || (weight < _ClassWeightTreshold
                    && linksDensity > _MaxDensityForElementsWithSmallerClassWeight)
                || (weight >= _ClassWeightTreshold
                    && linksDensity > _MaxDensityForElementsWithGreaterClassWeight)
                || (embedsCount > _MaxEmbedsCount
                    || (embedsCount == _MaxEmbedsCount
                        && innerTextLength < _MinInnerTextLengthInElementsWithEmbed));

            if (remove)
            {
                elementsToRemove.Add(element);
            }
        }
    } /* end foreach */

    // Removal is deferred so the tree is not modified while it is being enumerated.
    RemoveElements(elementsToRemove);
}
/**
 * Processes a phone number description element from the XML file and returns it as a
 * PhoneNumberDesc. If the description element is a fixed line or mobile number, the general
 * description will be used to fill in the whole element if necessary, or any components that are
 * missing. For all other types, the general description will only be used to fill in missing
 * components if the type has a partial definition. For example, if no "tollFree" element exists,
 * we assume there are no toll free numbers for that locale, and return a phone number description
 * with "NA" for both the national and possible number patterns.
 *
 * @param generalDesc  a generic phone number description that will be used to fill in missing
 *                     parts of the description; null is treated as an empty description
 * @param countryElement  the XML element representing all the country information
 * @param numberType  the name of the number type, corresponding to the appropriate tag in the XML
 *                    file with information about that type
 * @param liteBuild  when true, the example number is not copied into the description
 * @return  complete description of that phone number type
 */
public static PhoneNumberDesc ProcessPhoneNumberDescElement(PhoneNumberDesc generalDesc,
    XElement countryElement, String numberType, bool liteBuild)
{
    if (generalDesc == null)
    {
        generalDesc = new PhoneNumberDesc.Builder().Build();
    }

    // Only the first matching element is ever used, so fetch just that one instead of
    // counting the whole result and then enumerating it again.
    XElement element = countryElement.GetElementsByTagName(numberType).FirstOrDefault();
    var numberDesc = new PhoneNumberDesc.Builder();

    if (element == null && !IsValidNumberType(numberType))
    {
        numberDesc.SetNationalNumberPattern("NA");
        numberDesc.SetPossibleNumberPattern("NA");
        return numberDesc.Build();
    }

    // Start from the general description; the type-specific values below override it.
    numberDesc.MergeFrom(generalDesc);

    if (element != null)
    {
        XElement possiblePattern = element.GetElementsByTagName(POSSIBLE_NUMBER_PATTERN).FirstOrDefault();
        if (possiblePattern != null)
        {
            numberDesc.SetPossibleNumberPattern(ValidateRE(possiblePattern.Value, true));
        }

        XElement validPattern = element.GetElementsByTagName(NATIONAL_NUMBER_PATTERN).FirstOrDefault();
        if (validPattern != null)
        {
            numberDesc.SetNationalNumberPattern(ValidateRE(validPattern.Value, true));
        }

        if (!liteBuild)
        {
            XElement exampleNumber = element.GetElementsByTagName(EXAMPLE_NUMBER).FirstOrDefault();
            if (exampleNumber != null)
            {
                numberDesc.SetExampleNumber(exampleNumber.Value);
            }
        }
    }

    return numberDesc.Build();
}
/// <summary>
/// Cleans up <paramref name="articleContentElement" /> in place: strips styles, breaks and junk elements,
/// removes suspicious headers, tables, lists and divs, drops empty paragraphs, and finally
/// rewrites the inner HTML using <c>_BreakBeforeParagraphRegex</c>.
/// </summary>
internal void PrepareArticleContentElement(XElement articleContentElement)
{
    CleanStyles(articleContentElement);
    KillBreaks(articleContentElement);

    /* Throw away junk element types from the article content. */
    foreach (var junkTag in new[] { "form", "object", "h1" })
    {
        Clean(articleContentElement, junkTag);
    }

    /* A lone h2 is most likely used as the page header rather than a subheader;
     * we already have a header, so drop it. */
    bool hasSingleH2 = articleContentElement.GetElementsByTagName("h2").Count() == 1;

    if (hasSingleH2)
    {
        Clean(articleContentElement, "h2");
    }

    Clean(articleContentElement, "iframe");
    CleanHeaders(articleContentElement);

    /* These run last because the cleaning above may have removed junk that affects them. */
    foreach (var conditionalTag in new[] { "table", "ul", "div" })
    {
        CleanConditionally(articleContentElement, conditionalTag);
    }

    /* Drop paragraphs that have no inner text and hold no images, embeds or objects. */
    var emptyParagraphs = articleContentElement.GetElementsByTagName("p")
        .Where(paragraph =>
            GetInnerText(paragraph, false).Length == 0
            && !paragraph.GetElementsByTagName("img").Any()
            && !paragraph.GetElementsByTagName("embed").Any()
            && !paragraph.GetElementsByTagName("object").Any())
        .ToList();

    RemoveElements(emptyParagraphs);

    /* Remove br's that are directly before paragraphs. */
    string currentHtml = articleContentElement.GetInnerHtml();
    string htmlWithoutLeadingBreaks = _BreakBeforeParagraphRegex.Replace(currentHtml, "<p");

    articleContentElement.SetInnerHtml(htmlWithoutLeadingBreaks);
}
/**
 * Adds every leading-digits pattern found under the given numberFormat element to the format
 * builder, in the order the elements are returned. Each pattern is passed through ValidateRE
 * (with its second argument set to true) before it is added.
 */
public static void SetLeadingDigitsPatterns(XElement numberFormatElement, NumberFormat.Builder format)
{
    var validatedPatterns = numberFormatElement
        .GetElementsByTagName(LEADING_DIGITS)
        .Select(leadingDigits => ValidateRE(leadingDigits.Value, true));

    // The projection is lazy, so each pattern is validated right before it is added.
    foreach (var pattern in validatedPatterns)
    {
        format.AddLeadingDigitsPattern(pattern);
    }
}
/**
 * Reads all number formats declared under the given element and registers them on the metadata
 * builder. A numberFormat element that carries its own nationalPrefixFormattingRule uses that rule
 * (and its own nationalPrefixOptionalWhenFormatting flag); otherwise the values passed in, which
 * come from the parent (territory) element, are used. The same fallback applies to the carrier
 * code formatting rule, whose default is read from the given element.
 */
// @VisibleForTesting
public static void LoadAvailableFormats(PhoneMetadata.Builder metadata, XElement element,
    String nationalPrefix, String nationalPrefixFormattingRule,
    bool nationalPrefixOptionalWhenFormatting)
{
    // Default used by every format that does not declare its own carrier code rule.
    String defaultCarrierCodeRule = element.HasAttribute(CARRIER_CODE_FORMATTING_RULE)
        ? ValidateRE(GetDomesticCarrierCodeFormattingRuleFromElement(element, nationalPrefix))
        : "";

    var formatElements = element.GetElementsByTagName(NUMBER_FORMAT);

    if (!formatElements.Any())
    {
        return;
    }

    bool anyExplicitIntlFormat = false;

    foreach (XElement formatElement in formatElements)
    {
        var format = new NumberFormat.Builder();

        bool hasOwnPrefixRule = formatElement.HasAttribute(NATIONAL_PREFIX_FORMATTING_RULE);

        format.SetNationalPrefixFormattingRule(hasOwnPrefixRule
            ? GetNationalPrefixFormattingRuleFromElement(formatElement, nationalPrefix)
            : nationalPrefixFormattingRule);
        format.SetNationalPrefixOptionalWhenFormatting(hasOwnPrefixRule
            ? formatElement.HasAttribute(NATIONAL_PREFIX_OPTIONAL_WHEN_FORMATTING)
            : nationalPrefixOptionalWhenFormatting);

        format.SetDomesticCarrierCodeFormattingRule(formatElement.HasAttribute("carrierCodeFormattingRule")
            ? ValidateRE(GetDomesticCarrierCodeFormattingRuleFromElement(formatElement, nationalPrefix))
            : defaultCarrierCodeRule);

        // The national format is loaded first because the international format falls back to it.
        String nationalFormat = LoadNationalFormat(metadata, formatElement, format);
        metadata.AddNumberFormat(format);

        // Non-short-circuit OR: the international format is loaded for every element.
        anyExplicitIntlFormat |= LoadInternationalFormat(metadata, formatElement, nationalFormat);
    }

    // Only a small number of regions specify intlFormats in the xml. For most countries the
    // international formats are an exact copy of the national ones, so to keep the metadata
    // small they are dropped unless at least one was explicitly defined.
    if (!anyExplicitIntlFormat)
    {
        metadata.ClearIntlNumberFormat();
    }
}
/**
 * Fills the given format builder from a numberFormat element: leading digits patterns, the
 * validated pattern attribute, and the national format string.
 *
 * @throws Exception if the element does not contain exactly one format pattern.
 * @return the national format string.
 */
// @VisibleForTesting
public static String LoadNationalFormat(PhoneMetadata.Builder metadata, XElement numberFormatElement,
    NumberFormat.Builder format)
{
    SetLeadingDigitsPatterns(numberFormatElement, format);

    String validatedPattern = ValidateRE(numberFormatElement.GetAttribute(PATTERN));
    format.SetPattern(validatedPattern);

    var formatElements = numberFormatElement.GetElementsByTagName(FORMAT);
    bool hasExactlyOneFormat = formatElements.Count() == 1;

    // Only one format pattern for a numberFormat element should be defined.
    if (!hasExactlyOneFormat)
    {
        throw new Exception("Invalid number of format patterns for country: " + metadata.Id);
    }

    String nationalFormat = formatElements.First().Value;
    format.SetFormat(nationalFormat);

    return nationalFormat;
}
/**
 * Builds the international format for a numberFormat element and adds it to the metadata when
 * it ends up with a format. Without an intlFormat element the national format is used; an
 * intlFormat of "NA" leaves the format unset, so nothing is added.
 *
 * @throws Exception if more than one intlFormat has been encountered.
 * @return whether an international number format is explicitly defined (an "NA" value counts).
 */
// @VisibleForTesting
public static bool LoadInternationalFormat(PhoneMetadata.Builder metadata,
    XElement numberFormatElement, String nationalFormat)
{
    var intlFormat = new NumberFormat.Builder();
    SetLeadingDigitsPatterns(numberFormatElement, intlFormat);
    intlFormat.SetPattern(numberFormatElement.GetAttribute(PATTERN));

    var intlFormatElements = numberFormatElement.GetElementsByTagName(INTL_FORMAT);
    bool explicitlyDefined;

    switch (intlFormatElements.Count())
    {
        case 0:
            // Default to use the same as the national pattern if none is defined.
            intlFormat.SetFormat(nationalFormat);
            explicitlyDefined = false;
            break;

        case 1:
        {
            String declaredFormat = intlFormatElements.First().Value;

            if (!declaredFormat.Equals("NA"))
            {
                intlFormat.SetFormat(declaredFormat);
            }

            explicitlyDefined = true;
            break;
        }

        default:
            // A maximum of one intlFormat pattern for a numberFormat element should be defined.
            throw new Exception("Invalid number of intlFormat patterns for country: " + metadata.Id);
    }

    if (intlFormat.HasFormat)
    {
        metadata.AddIntlNumberFormat(intlFormat);
    }

    return explicitlyDefined;
}
using System;