/// <summary>
/// Derives the article title from the document and wraps it in a new <c>h1</c> element.
/// </summary>
/// <remarks>
/// Starts from <c>document.GetTitle()</c> (treated as empty when null) and then:
/// <list type="bullet">
/// <item>if the title matches <c>_ArticleTitleDashRegex1</c>, rewrites it with
/// <c>_ArticleTitleDashRegex2</c> (first capture group), retrying with
/// <c>_ArticleTitleDashRegex3</c> when the result has fewer than
/// <c>_MinArticleTitleWordsCount1</c> space-separated words;</item>
/// <item>else if the title contains <c>": "</c>, does the same with the two colon regexes;</item>
/// <item>else if the title length is outside the
/// <c>_MinArticleTitleLength</c>..<c>_MaxArticleTitleLength</c> range and the body contains
/// exactly one <c>h1</c>, uses that header's inner text.</item>
/// </list>
/// A candidate with <c>_MinArticleTitleWordsCount2</c> or fewer words is replaced by the
/// full document title, but only when the document actually has a title.
/// </remarks>
/// <param name="document">Document whose title and body headers are inspected.</param>
/// <returns>
/// A new <c>h1</c> element containing the title, or <c>null</c> when no non-empty title
/// could be determined.
/// </returns>
internal XElement ExtractArticleTitle(XDocument document)
{
    var documentBody = GetOrCreateBody(document);
    string documentTitle = document.GetTitle() ?? "";
    string currentTitle = documentTitle;

    if (_ArticleTitleDashRegex1.IsMatch(currentTitle))
    {
        currentTitle = _ArticleTitleDashRegex2.Replace(documentTitle, "$1");

        if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
        {
            currentTitle = _ArticleTitleDashRegex3.Replace(documentTitle, "$1");
        }
    }
    // string.Contains(string) is an ordinal search, unlike the culture-sensitive
    // IndexOf(string); this keeps the check in line with the regex replacement below.
    else if (currentTitle.Contains(": "))
    {
        currentTitle = _ArticleTitleColonRegex1.Replace(documentTitle, "$1");

        if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
        {
            currentTitle = _ArticleTitleColonRegex2.Replace(documentTitle, "$1");
        }
    }
    else if (currentTitle.Length > _MaxArticleTitleLength || currentTitle.Length < _MinArticleTitleLength)
    {
        // Materialize once so the header query is not enumerated twice
        // (once for the count and once more to fetch the element).
        var levelOneHeaders = documentBody.GetElementsByTagName("h1").ToList();

        if (levelOneHeaders.Count == 1)
        {
            currentTitle = GetInnerText(levelOneHeaders[0]);
        }
    }

    currentTitle = (currentTitle ?? "").Trim();

    // Fall back to the full document title when the candidate is too short, but only if
    // there is a document title to fall back to; otherwise a short title taken from the
    // single <h1> would be thrown away and the method would return null.
    if (!string.IsNullOrEmpty(documentTitle)
        && currentTitle.Split(' ').Length <= _MinArticleTitleWordsCount2)
    {
        currentTitle = documentTitle;
    }

    if (string.IsNullOrEmpty(currentTitle))
    {
        return null;
    }

    var articleTitleElement = new XElement("h1");

    articleTitleElement.SetInnerHtml(currentTitle);

    return articleTitleElement;
}
/// <summary>
/// Derives the article title from the document and returns it as a new <c>h1</c> element.
/// </summary>
/// <remarks>
/// The document title (empty when <c>document.GetTitle()</c> returns null) is split on
/// <c>|</c>, <c>_</c> and <c>-</c>; when that yields more than one non-empty part the first
/// part is kept. Otherwise the title is split on <c>:</c> and, when that yields more than
/// one non-empty part, the last part is kept. If the resulting candidate's length is
/// outside the <c>_MinArticleTitleLength</c>..<c>_MaxArticleTitleLength</c> range, the body's
/// <c>h1</c> elements (or its <c>h2</c> elements when there is no <c>h1</c>) are consulted
/// and, when there is exactly one, its inner text becomes the candidate.
/// </remarks>
/// <param name="document">Document whose title and body headers are inspected.</param>
/// <returns>
/// A new <c>h1</c> element whose value is the title, or <c>null</c> when the title is empty.
/// </returns>
internal XElement ExtractArticleTitle(XDocument document)
{
    XElement body = GetOrCreateBody(document);
    string documentTitle = document.GetTitle() ?? "";
    string currentTitle = documentTitle;

    string[] separatorParts = currentTitle.Split(new[] { '|', '_', '-' }, StringSplitOptions.RemoveEmptyEntries);

    if (separatorParts.Length <= 1)
    {
        // NOTE(review): the separator list repeats ':'; possibly one entry was meant to be
        // a different character (e.g. a full-width colon) - confirm with the author.
        string[] colonParts = currentTitle.Split(new[] { ':', ':' }, StringSplitOptions.RemoveEmptyEntries);

        if (colonParts.Length > 1)
        {
            currentTitle = colonParts[colonParts.Length - 1];
        }
    }
    else
    {
        currentTitle = separatorParts[0];
    }

    // If the title obtained so far is too short (or too long), replace it with the
    // text of the h1 or h2 header.
    bool lengthInRange = currentTitle.Length <= _MaxArticleTitleLength
                         && currentTitle.Length >= _MinArticleTitleLength;

    if (!lengthInRange)
    {
        List<XElement> headers = body.GetElementsByTagName("h1").ToList();

        if (headers.Count == 0)
        {
            // No level-one headers: give the level-two headers a chance.
            headers = body.GetElementsByTagName("h2").ToList();
        }

        if (headers.Count == 1)
        {
            currentTitle = GetInnerText(headers[0]);
        }
    }

    currentTitle = (currentTitle ?? "").Trim();

    // The candidate is too short (as measured by NotAsciiCharNorBlanksCount), so leave the
    // title unprocessed and use the full document title instead.
    bool hasDocumentTitle = !string.IsNullOrEmpty(documentTitle);

    if (hasDocumentTitle && NotAsciiCharNorBlanksCount(currentTitle) <= _MinArticleTitleWordsCount2)
    {
        currentTitle = documentTitle;
    }

    if (string.IsNullOrEmpty(currentTitle))
    {
        return null;
    }

    var heading = new XElement("h1");
    heading.Value = currentTitle;

    return heading;
}
/// <summary>
/// Derives the article title from the document and wraps it in a new <c>h1</c> element.
/// </summary>
/// <remarks>
/// Starts from <c>document.GetTitle()</c> (treated as empty when null) and then:
/// <list type="bullet">
/// <item>if the title matches <c>_ArticleTitleDashRegex1</c>, rewrites it with
/// <c>_ArticleTitleDashRegex2</c> (first capture group), retrying with
/// <c>_ArticleTitleDashRegex3</c> when the result has fewer than
/// <c>_MinArticleTitleWordsCount1</c> space-separated words;</item>
/// <item>else if the title contains <c>": "</c>, does the same with the two colon regexes;</item>
/// <item>else if the title length is outside the
/// <c>_MinArticleTitleLength</c>..<c>_MaxArticleTitleLength</c> range, looks at the body's
/// <c>h1</c> elements (or its <c>h2</c> elements when there is no <c>h1</c>) and, when there
/// is exactly one, uses that header's inner text.</item>
/// </list>
/// A candidate with <c>_MinArticleTitleWordsCount2</c> or fewer words is replaced by the
/// full document title, provided the document title is not empty.
/// </remarks>
/// <param name="document">Document whose title and body headers are inspected.</param>
/// <returns>
/// A new <c>h1</c> element containing the title, or <c>null</c> when no non-empty title
/// could be determined.
/// </returns>
internal XElement ExtractArticleTitle(XDocument document)
{
    XElement documentBody = GetOrCreateBody(document);
    string documentTitle = document.GetTitle() ?? "";
    string currentTitle = documentTitle;

    if (_ArticleTitleDashRegex1.IsMatch(currentTitle))
    {
        currentTitle = _ArticleTitleDashRegex2.Replace(documentTitle, "$1");

        if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
        {
            currentTitle = _ArticleTitleDashRegex3.Replace(documentTitle, "$1");
        }
    }
    // string.Contains(string) is an ordinal search, unlike the culture-sensitive
    // IndexOf(string); this keeps the check in line with the regex replacement below.
    else if (currentTitle.Contains(": "))
    {
        currentTitle = _ArticleTitleColonRegex1.Replace(documentTitle, "$1");

        if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
        {
            currentTitle = _ArticleTitleColonRegex2.Replace(documentTitle, "$1");
        }
    }
    else if (currentTitle.Length > _MaxArticleTitleLength || currentTitle.Length < _MinArticleTitleLength)
    {
        List<XElement> titleHeaders = documentBody.GetElementsByTagName("h1").ToList();

        if (titleHeaders.Count == 0)
        {
            // No level-one headers: give the level-two headers a chance.
            titleHeaders = documentBody.GetElementsByTagName("h2").ToList();
        }

        // Only an unambiguous (single) header is trusted as the title.
        if (titleHeaders.Count == 1)
        {
            currentTitle = GetInnerText(titleHeaders[0]);
        }
    }

    currentTitle = (currentTitle ?? "").Trim();

    // Too few words: prefer the full document title, as long as there is one to fall back to.
    if (!string.IsNullOrEmpty(documentTitle)
        && currentTitle.Split(' ').Length <= _MinArticleTitleWordsCount2)
    {
        currentTitle = documentTitle;
    }

    if (string.IsNullOrEmpty(currentTitle))
    {
        return null;
    }

    var articleTitleElement = new XElement("h1");

    articleTitleElement.SetInnerHtml(currentTitle);

    return articleTitleElement;
}
using System;