/// <summary>
/// Extracts the cleaned text of one field from a single item's base node, using an XPath
/// that is relative to that base node.
/// </summary>
/// <remarks>
/// The XPath is first evaluated through an XPath navigator and the value of the selected
/// node is used. When that yields no usable text (no match, blank value, or the navigator
/// call failed), the method falls back to <c>SelectNodes</c> and takes the first matched
/// node whose own, non-descendant text is not empty.
/// </remarks>
/// <param name="BaseNode">Root node of the item. When null the method returns null.</param>
/// <param name="RelXPath">XPath relative to <paramref name="BaseNode"/>. When null or blank, the text of <paramref name="BaseNode"/> itself is returned.</param>
/// <param name="postion">Not read by this implementation; the first match is always used.</param>
/// <param name="CleanConnectionMark">When true, <c>TextCleaner.FullClean</c> is called with its default options; when false it is called with the explicit flag list (true, true, true, false, true, false).</param>
/// <returns>The cleaned text, or null when <paramref name="BaseNode"/> is null or the XPath matches no node with text.</returns>
internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
{
    if (BaseNode == null)
    {
        return null;
    }

    string rawText;
    if (string.IsNullOrWhiteSpace(RelXPath))
    {
        // No relative path: the field is the base node itself.
        rawText = XPathUtility.InnerTextNonDescendants(BaseNode);
    }
    else
    {
        rawText = "";
        try
        {
            HtmlNodeNavigator navigator = (HtmlNodeNavigator)BaseNode.CreateNavigator();
            var selected = navigator.SelectSingleNode(RelXPath);
            if (selected != null)
            {
                rawText = selected.Value;
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: any failure of the navigator lookup simply leaves
            // rawText empty so that the SelectNodes fallback below is tried instead.
        }

        if (string.IsNullOrWhiteSpace(rawText))
        {
            IEnumerable<HtmlNode> matchNodes = BaseNode.SelectNodes(RelXPath);
            HtmlNode firstWithText = matchNodes == null
                ? null
                : matchNodes.FirstOrDefault(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)));
            if (firstWithText == null)
            {
                return null;
            }

            rawText = XPathUtility.InnerTextNonDescendants(firstWithText);
        }
    }

    return CleanConnectionMark
        ? TextCleaner.FullClean(rawText)
        : TextCleaner.FullClean(rawText, true, true, true, false, true, false);
}
/// <summary>
/// Extracts the cleaned text of one field from a single item's base node, using an XPath
/// that is relative to that base node and a zero-based index into the matched nodes.
/// </summary>
/// <remarks>
/// Only matched nodes whose own, non-descendant text is not empty are counted when
/// applying <paramref name="postion"/>.
/// </remarks>
/// <param name="BaseNode">Root node of the item. When null the method returns null.</param>
/// <param name="RelXPath">XPath relative to <paramref name="BaseNode"/>. When null or blank, the base node itself is the only candidate (at index 0).</param>
/// <param name="postion">Zero-based index of the wanted node among the matches that have text. A negative or out-of-range index yields null.</param>
/// <param name="CleanConnectionMark">When true, <c>TextCleaner.FullClean</c> is called with its default options; when false it is called with the explicit flag list (true, true, true, false, true, false).</param>
/// <returns>The cleaned text, or null when there is no node at the requested position.</returns>
internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
{
    if (BaseNode == null || postion < 0)
    {
        return null;
    }

    string rawText;
    if (string.IsNullOrWhiteSpace(RelXPath))
    {
        // Without a path the base node is the single candidate, so only index 0 exists.
        // Returning null here avoids calling SelectNodes with an empty XPath.
        if (postion != 0)
        {
            return null;
        }

        rawText = XPathUtility.InnerTextNonDescendants(BaseNode);
    }
    else
    {
        IEnumerable<HtmlNode> matchNodes = BaseNode.SelectNodes(RelXPath);
        if (matchNodes == null)
        {
            return null;
        }

        // Single pass: skip to the requested index among the nodes that carry text.
        HtmlNode target = matchNodes
            .Where(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)))
            .Skip(postion)
            .FirstOrDefault();
        if (target == null)
        {
            return null;
        }

        rawText = XPathUtility.InnerTextNonDescendants(target);
    }

    return CleanConnectionMark
        ? TextCleaner.FullClean(rawText)
        : TextCleaner.FullClean(rawText, true, true, true, false, true, false);
}
/// <summary>
/// Checks whether a title node is acceptable: its own (non-descendant) text must pass the
/// text-based <c>ValidateTitle</c> overload, it must carry an <c>href</c> attribute, and
/// that attribute's value must be accepted by <c>HTMLCleaner.isUrlGood</c>.
/// </summary>
/// <param name="Title">The candidate title node.</param>
/// <returns>True only when all three checks succeed.</returns>
public bool ValidateTitle(HtmlNode Title)
{
    string titleText = XPathUtility.InnerTextNonDescendants(Title);
    if (!ValidateTitle(titleText))
    {
        return false;
    }

    var hrefAttribute = Title.Attributes["href"];
    if (hrefAttribute == null)
    {
        return false;
    }

    return HTMLCleaner.isUrlGood(hrefAttribute.Value);
}
/// <summary>
/// Scores how likely a set of nodes (selected by one relative pattern) holds author names.
/// Larger is better.
/// </summary>
/// <remarks>
/// A per-node possibility is computed from text heuristics and id/class names, summed over
/// the nodes, adjusted for path depth and path style, and finally divided by the total number
/// of nodes (including nodes skipped for being too long).
/// </remarks>
/// <param name="Nodes">Candidate nodes matched by the pattern.</param>
/// <param name="Strategy">Supplies length limits, best average author length, media type and language.</param>
/// <param name="BaseItemCount">Not used by this method.</param>
/// <param name="PathLevel">Depth of the relative path; each level above 1 removes 10% of the score.</param>
/// <param name="PathUsingName">True when the path is expressed via id/class names; adds 20% to the score.</param>
/// <returns>The averaged score, or 0 when <paramref name="Nodes"/> is empty.</returns>
internal static double AuthorNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    double AvgPossibility = 0;
    int nodeCount = 0; // counted while iterating so Nodes is enumerated only once

    foreach (HtmlNode Node in Nodes)
    {
        nodeCount++;

        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
        if (Text.Length > Strategy.MaxLenAuthor)
        {
            continue;
        }

        // Forum pages start from a small non-zero base.
        double Posibility = Strategy.MediaType == Enums.MediaType.Forum ? 0.2 : 0;

        // Text containing characters from the "must not be media/author/date" set is pinned to 0.2
        // rather than excluded, because unusual names may legitimately contain them.
        // NOTE(review): the original comment said such nodes are scored 0 and skipped, but the code assigns 0.2.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 0)
        {
            Posibility = 0.2;
        }

        // Mostly date/time characters: strongly reduce.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
        {
            Posibility *= 0.3;
        }

        // An explicit "author / editor / reporter" style prefix is a decisive hint.
        Match PrefixMatch = Regex.Match(Text, @"作\s*者|选\s*稿|编\s*辑|记\s*者");
        if (PrefixMatch.Success)
        {
            Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');

            // Keep only the part before the first space.
            int SpaceIndex = Text.IndexOf(' ');
            if (SpaceIndex > 0)
            {
                Text = Text.Substring(0, SpaceIndex);
            }

            Posibility = 1;
        }

        // Remove stop words before the remaining checks.
        Text = TextCleaner.RemoveStopWords(Text, StopWords);

        // Presence of one of ten common Chinese surnames.
        if (Regex.IsMatch(Text, @"[李王张刘陈杨赵黄周吴]"))
        {
            if (Posibility > 0)
            {
                Posibility *= 1.1;
            }
            else
            {
                Posibility = 0.6;
            }
        }

        // Weight by author-related id/class names.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, AuthorClassNames_Must, AuthorClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
        }

        // A reply-related id/class name zeroes the node: only the main post's author is wanted.
        IDClassNameScore(Node, ReplyClassNames_Must, null, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Posibility = 0;
        }

        // Length check against the best average author length.
        if (Text.Length * 3 < Strategy.List_BestAvgAuthorLen || Text.Length > Strategy.List_BestAvgAuthorLen * 3)
        {
            // More than a factor of 3 away: reduce to one third.
            Posibility /= 3;
        }
        else
        {
            // Rate lies between 1 and 3; the factor shrinks as Rate grows.
            double Rate = Text.Length >= Strategy.List_BestAvgAuthorLen
                ? Text.Length / (double)Strategy.List_BestAvgAuthorLen
                : Strategy.List_BestAvgAuthorLen / (double)Text.Length;
            Posibility *= (0.4 + 0.35 * (3 - Rate));
        }

        // On forums, Latin letters, digits or underscores (typical user names) add 50%.
        // The character class deliberately contains no commas: the previous "[a-z,0-9,_]"
        // also matched a literal ',' and boosted comma-only text.
        if (Strategy.MediaType == Enums.MediaType.Forum && Regex.IsMatch(Text, @"[a-z0-9_]", RegexOptions.IgnoreCase))
        {
            Posibility *= 1.5;
        }

        // Separators that do not belong in a name (including spaces): multiply by 0.6.
        if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, Strategy.Language == Language.CHINESE) > 0)
        {
            Posibility *= 0.6;
        }

        AvgPossibility += Posibility;
    }

    // No nodes: avoid 0/0 (NaN) and report "no evidence".
    if (nodeCount == 0)
    {
        return 0;
    }

    // Fine-tune 1: shallower paths are better; each level above 1 costs 10%.
    if (PathLevel > 1)
    {
        AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
    }

    // Fine-tune 2: paths using id/class beat positional paths; add 20%.
    if (PathUsingName)
    {
        AvgPossibility *= 1.2;
    }

    return AvgPossibility / nodeCount;
}
/// <summary>
/// Scores how likely a set of nodes (selected by one relative pattern) holds media/source
/// names. Larger is better.
/// </summary>
/// <remarks>
/// A per-node possibility is computed from text heuristics and id/class names, summed over
/// the nodes, adjusted for path depth, path style and media type, and finally divided by the
/// total number of nodes (including nodes skipped for being too long).
/// </remarks>
/// <param name="Nodes">Candidate nodes matched by the pattern.</param>
/// <param name="Strategy">Supplies length limits, best average media-name length and media type.</param>
/// <param name="BaseItemCount">Not used by this method.</param>
/// <param name="PathLevel">Depth of the relative path; each level above 1 removes 10% of the score.</param>
/// <param name="PathUsingName">True when the path is expressed via id/class names; adds 20% to the score.</param>
/// <returns>The averaged score, or 0 when <paramref name="Nodes"/> is empty.</returns>
internal static double MediaNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    double AvgPossibility = 0;
    int nodeCount = 0; // counted while iterating so Nodes is enumerated only once

    foreach (HtmlNode Node in Nodes)
    {
        nodeCount++;

        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
        if (Text.Length > Strategy.MaxLenMedia)
        {
            continue;
        }

        // Web news pages start from a small non-zero base.
        double Posibility = Strategy.MediaType == MediaType.WebNews ? 0.1 : 0;

        // More than one character from the "must not be media/author/date" set pins the node to 0.2
        // rather than excluding it, because unusual publication names may contain them.
        // NOTE(review): the original comment said such nodes are scored 0 and skipped, but the code assigns 0.2.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 1)
        {
            Posibility = 0.2;
        }

        // Mostly date/time characters: strongly reduce.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
        {
            Posibility *= 0.3;
        }

        // A recognised media prefix is a decisive hint; keep only what follows it.
        Match PrefixMatch = Regex.Match(Text, MediaPrefixRegex);
        if (PrefixMatch.Success)
        {
            Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');
            Posibility = 1;
        }

        // Remove stop words before the remaining checks.
        Text = TextCleaner.RemoveStopWords(Text, StopWords);

        // Typical media-name characters.
        // NOTE(review): the original comment said "check the ending", but the pattern is not
        // anchored and matches anywhere in the text - confirm which is intended.
        if (Regex.IsMatch(Text, @"[报台网刊]"))
        {
            if (Posibility > 0)
            {
                Posibility = Math.Max(1, Posibility * 1.5);
            }
            else
            {
                Posibility = 0.8;
            }
        }

        // Weight by media-related id/class names.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, MediaClassNames_Must, MediaClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
        }

        // Length check against the best average media-name length.
        if (Text.Length * 3 < Strategy.List_BestAvgMediaLen || Text.Length > Strategy.List_BestAvgMediaLen * 3)
        {
            // More than a factor of 3 away: reduce to one third.
            Posibility /= 3;
        }
        else
        {
            // Rate lies between 1 and 3; the factor shrinks as Rate grows.
            double Rate = Text.Length >= Strategy.List_BestAvgMediaLen
                ? Text.Length / (double)Strategy.List_BestAvgMediaLen
                : Strategy.List_BestAvgMediaLen / (double)Text.Length;
            Posibility *= (0.4 + 0.35 * (3 - Rate));
        }

        // Separators that do not belong in a name (including spaces): multiply by 0.6.
        if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, true) > 0)
        {
            Posibility *= 0.6;
        }

        // Text detected as English: multiply by 0.6 again.
        if (LanguageUtility.DetectedLanguage(Text) == Enums.Language.ENGLISH)
        {
            Posibility *= 0.6;
        }

        AvgPossibility += Posibility;
    }

    // No nodes: avoid 0/0 (NaN) and report "no evidence".
    if (nodeCount == 0)
    {
        return 0;
    }

    // Fine-tune 1: shallower paths are better; each level above 1 costs 10%.
    if (PathLevel > 1)
    {
        AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
    }

    // Fine-tune 2: paths using id/class beat positional paths; add 20%.
    if (PathUsingName)
    {
        AvgPossibility *= 1.2;
    }

    // Forum pages are halved.
    if (Strategy.MediaType == Enums.MediaType.Forum)
    {
        AvgPossibility /= 2;
    }

    return AvgPossibility / nodeCount;
}
/// <summary>
/// Scores how likely a set of nodes (selected by one relative pattern) holds view/reply
/// counters. Larger is better.
/// </summary>
/// <remarks>
/// Pass 1 computes the average density of view/reply characters and rejects the pattern
/// when it falls outside [FrequentChars_Min, FrequentChars_Max] or when any node contains
/// a must-be-date character. Pass 2 extracts up to two numbers per node, gathers id/class
/// evidence and reply keywords, and the density is then adjusted into the final score.
/// </remarks>
/// <param name="Nodes">Candidate nodes matched by the pattern.</param>
/// <param name="Strategy">Supplies the maximum text length for a view/reply node.</param>
/// <param name="BaseItemCount">Currently unused.</param>
/// <param name="PathLevel">Depth of the relative path; each level above 1 removes 10% of the score.</param>
/// <param name="PathUsingName">True when the path is expressed via id/class names; adds 20% to the score.</param>
/// <param name="Contain2Numbers">Set to true when every parsed node contained exactly two numbers.</param>
/// <param name="MustBeReply">Set to true when a reply id/class name or a reply keyword in the text was found.</param>
/// <param name="AvgNumber">Average of the parsed numbers. Length 2 (larger average first) when each node holds two numbers, length 1 when numbers were parsed otherwise, null when nothing was parsed.</param>
/// <returns>The score; 0 when the pattern is rejected or <paramref name="Nodes"/> is empty.</returns>
internal static double ViewNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName, out bool Contain2Numbers, out bool MustBeReply, out double[] AvgNumber)
{
    Contain2Numbers = false;
    MustBeReply = false;
    AvgNumber = null;

    // Materialize once: the nodes are visited twice and counted several times.
    List<HtmlNode> nodeList = Nodes.ToList();
    if (nodeList.Count == 0)
    {
        return 0;
    }

    // Pass 1: average character density.
    string[] cleanedTexts = new string[nodeList.Count];
    double AvgCharFreq = 0;
    for (int i = 0; i < nodeList.Count; i++)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(nodeList[i]), true, true, true, false, true, false);
        cleanedTexts[i] = Text;
        if (Text.Length > Strategy.MaxLenView)
        {
            continue;
        }

        // Any must-be-date character rejects the whole pattern.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustDate, false) > 0)
        {
            return 0;
        }

        // Empty text contributes a density of 0; dividing would produce NaN, which would
        // slip through the min/max comparison below.
        if (Text.Length == 0)
        {
            continue;
        }

        int CharCount = TextCleaner.CountCharInSet(Text, FrequentCharSet_ViewReply);
        AvgCharFreq += CharCount / (double)Text.Length;
    }

    AvgCharFreq = AvgCharFreq / nodeList.Count;
    if (AvgCharFreq < FrequentChars_Min || AvgCharFreq > FrequentChars_Max)
    {
        return 0;
    }

    // Pass 2: extract runs of digits and collect per-node evidence.
    int TotalCount = 0;
    int ParsedNode = 0;
    List<int> Ints0 = new List<int>(nodeList.Count);
    List<int> Ints1 = new List<int>(nodeList.Count);
    int ViewNameMatched = 0;
    double IDClassScore = 0;
    for (int i = 0; i < nodeList.Count; i++)
    {
        HtmlNode Node = nodeList[i];
        string Text = cleanedTexts[i];
        if (Text.Length > Strategy.MaxLenView)
        {
            continue;
        }

        MatchCollection digiText = Regex.Matches(Text, @"\d{1,9}");
        if (digiText.Count == 0)
        {
            continue;
        }

        // More than two numbers in one node cannot be a view/reply counter.
        if (digiText.Count > 2)
        {
            return 0;
        }

        TotalCount += digiText.Count;

        Ints0.Add(int.Parse(digiText[0].Captures[0].Value));
        if (digiText.Count > 1)
        {
            Ints1.Add(int.Parse(digiText[1].Captures[0].Value));
        }

        // Weight by view-related id/class names.
        bool viewNameMatched;
        double s = IDClassNameScore(Node, ViewClassNames_Must, ViewClassNames_MustNot, out viewNameMatched);
        if (viewNameMatched)
        {
            IDClassScore += s;
            ViewNameMatched++;

            // Only after a positive hit is the node checked for reply names. A separate
            // flag is used so a skipped check can never mark the pattern as reply.
            if (s > 0)
            {
                bool replyNameMatched;
                IDClassNameScore(Node, ReplyClassNames_Must, null, out replyNameMatched);
                if (replyNameMatched)
                {
                    MustBeReply = true;
                }
            }
        }

        // Reply keywords in the text also mark the pattern as reply.
        foreach (string ReplyKeyword in ReplyWords)
        {
            if (Text.Contains(ReplyKeyword))
            {
                MustBeReply = true;
            }
        }

        ParsedNode++;
    }

    // Averages are summed as double: no integer truncation and no int overflow.
    if (Ints1.Count > 0 && Ints1.Count == Ints0.Count)
    {
        double Sum0 = Ints0.Sum(v => (double)v);
        double Sum1 = Ints1.Sum(v => (double)v);
        AvgNumber = new double[2];
        AvgNumber[0] = Math.Max(Sum0, Sum1) / ParsedNode;
        AvgNumber[1] = Math.Min(Sum0, Sum1) / ParsedNode;
        Contain2Numbers = true;
    }
    else if (Ints0.Count > 0 && ParsedNode > 0)
    {
        AvgNumber = new double[1];
        AvgNumber[0] = Ints0.Sum(v => (double)v) / ParsedNode;
    }
    else
    {
        AvgCharFreq = 0;
    }

    if (TotalCount == nodeList.Count)
    {
        // As many numbers as nodes: bonus of 20% (values look like day numbers) or 50%.
        AvgCharFreq *= (Ints0.Max() <= 31) ? 1.2 : 1.5;
    }
    else if (Ints0.Count > 0)
    {
        // A month/day-looking pair is penalised; everything else doubles. Without any
        // second number there is no pair to inspect, so the pattern is treated as "not a
        // date" instead of calling Max() on an empty list.
        int max0 = Ints0.Max();
        bool looksLikeMonthDay = false;
        if (Ints1.Count > 0)
        {
            int max1 = Ints1.Max();
            looksLikeMonthDay = (max0 <= 12 && max1 <= 31) || (max0 <= 31 && max1 <= 12);
        }

        AvgCharFreq *= looksLikeMonthDay ? 0.8 : 2;
    }

    // Weight by the average id/class score of the nodes that matched.
    if (ViewNameMatched > 0)
    {
        IDClassScore /= ViewNameMatched;
        AvgCharFreq = (AvgCharFreq == 0 ? 1 : AvgCharFreq) * IDClassScore;
    }

    // Fine-tune 1: shallower paths are better; each level above 1 costs 10%.
    if (PathLevel > 1)
    {
        AvgCharFreq *= (1 - 0.1 * (PathLevel - 1));
    }

    // Fine-tune 2: paths using id/class beat positional paths; add 20%.
    if (PathUsingName)
    {
        AvgCharFreq *= 1.2;
    }

    return AvgCharFreq;
}
/// <summary>
/// Scores how likely a set of nodes (selected by one relative pattern) holds publication
/// dates. Larger is better.
/// </summary>
/// <remarks>
/// The score combines two factors: how close the number of parseable dates is to
/// <paramref name="BaseItemCount"/> (ratio between 1/3 and 3, otherwise 0), and how recent
/// the parsed dates are on average. The recency and path multipliers can push the result
/// above 1.
/// </remarks>
/// <param name="Nodes">Candidate nodes matched by the pattern.</param>
/// <param name="Strategy">Supplies the maximum text length for a date node.</param>
/// <param name="BaseItemCount">Number of list items the date count is compared against.</param>
/// <param name="PathLevel">Depth of the relative path; each level above 1 removes 20% of the score.</param>
/// <param name="PathUsingName">True when the path is expressed via id/class names; adds 20% to the score.</param>
/// <returns>The score; 0 when no date could be parsed or the date count is too far from <paramref name="BaseItemCount"/>.</returns>
internal static double DateNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    // DateTimeParser.Parser is used directly, so no character-density check is needed.
    double SumDiff = 0; // accumulated (weighted) distance in days between each date and now
    int ParseCount = 0;
    DateTime now = DateTime.Now;

    foreach (HtmlNode Node in Nodes)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true);
        if (Text.Length > Strategy.MaxLenDate)
        {
            continue;
        }

        DateTime? Val = DateTimeParser.Parser(Text);
        if (Val == null)
        {
            continue;
        }

        ParseCount++;
        DateTime d = Val.Value;

        // A matching date-related id/class name divides the distance by its score.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, DateClassNames_Must, DateClassNames_MustNot, out IDClassNameMatched);
        double divisor = IDClassNameMatched ? IDClassScore : 1;

        double days = (now - d).TotalDays;
        if (d.Hour == 0 && d.Minute == 0 && d.Second == 0)
        {
            // Date only: the full distance counts.
            SumDiff += Math.Abs(days / divisor);
        }
        else
        {
            // A precise time of day is present: only half the distance counts.
            SumDiff += Math.Abs(days / 2 / divisor);
        }
    }

    // Nothing parsed or nothing to compare against: no evidence (also avoids dividing by zero).
    if (ParseCount == 0 || BaseItemCount <= 0)
    {
        return 0;
    }

    // The date count must be within a factor of 3 of the item count.
    if (ParseCount * 3 < BaseItemCount || BaseItemCount * 3 < ParseCount)
    {
        return 0;
    }

    // Reciprocal of the count ratio, computed in floating point. Integer arithmetic here
    // would truncate every ratio of 2 or more down to a possibility of 0.
    double Possibility = ParseCount >= BaseItemCount
        ? (double)BaseItemCount / ParseCount
        : (double)ParseCount / BaseItemCount;

    // Recency factor from the average distance in days.
    double AvgDiff = SumDiff / ParseCount;
    if (AvgDiff > 180)
    {
        // Older than 180 days: flat 30%.
        Possibility *= 0.3;
    }
    else if (AvgDiff > 7)
    {
        // Between one week and 180 days: 30%-100%, decreasing linearly.
        Possibility *= (0.3 + 0.004046 * (180 - AvgDiff));
    }
    else if (AvgDiff > 1)
    {
        // Within one week: 100%-150%.
        Possibility *= (1 + 0.08333 * (7 - AvgDiff));
    }
    else
    {
        // Within one day: 150%-200%, which helps separate post time from comment time.
        Possibility *= (1.5 + 0.5 * (1 - AvgDiff));
    }

    // Fine-tune 1: shallower paths are better; each level above 1 costs 20%.
    if (PathLevel > 1)
    {
        Possibility *= (1 - 0.2 * (PathLevel - 1));
    }

    // Fine-tune 2: paths using id/class beat positional paths; add 20%.
    if (PathUsingName)
    {
        Possibility *= 1.2;
    }

    return Possibility;
}
/// <summary>
/// Builds the feature vector of a group of nodes on an item page. Only the features that
/// are switched on (value 1) in <paramref name="stencilfeature"/> are filled in.
/// </summary>
/// <param name="Nodes">Nodes to describe. When null the method returns null.</param>
/// <param name="ItemCount">Reference item count used to normalise the rate features. When 0 the method returns null.</param>
/// <param name="stencilfeature">Template whose entries equal to 1 select which features are computed.</param>
/// <returns>The populated feature object, or null for null <paramref name="Nodes"/> or a zero <paramref name="ItemCount"/>.</returns>
public Feature GetFeature_ItemPage(IEnumerable<HtmlNode> Nodes, int ItemCount, Feature stencilfeature)
{
    if (Nodes == null || ItemCount == 0)
    {
        return null;
    }

    // Materialize once so the node count is not recomputed for every node.
    List<HtmlNode> nodeList = Nodes.ToList();
    int nodeCount = nodeList.Count;

    Feature feature = new Feature(0);
    feature.FigureFeatures["ItemCount"] = nodeCount;

    int[] TextLen = new int[nodeCount];
    int[] DigiLen = new int[nodeCount];
    double[] Diff = new double[nodeCount];
    int[] intone = new int[nodeCount]; // per node: the single number, or first minus second number
    bool havetwonums = true;           // stays true only if every node holds exactly two numbers
    int DigitCount = 0;
    int textLenSoFar = 0;              // running total of TextLen, replaces re-summing the array

    // Date parsing is attempted only when the node count is within 20% of ItemCount.
    bool countInRange = nodeCount >= ItemCount * 0.8 && nodeCount <= ItemCount * 1.2;

    int i = 0;
    foreach (HtmlNode node in nodeList)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node));
        TextLen[i] = Text.Length;
        textLenSoFar += Text.Length;
        DigiLen[i] = TextCleaner.CountDigitChars(Text);

        if (countInRange
            && (stencilfeature.FigureFeatures["AvgDateDistance"] == 1 || stencilfeature.FigureFeatures["DateParseCount"] == 1 || stencilfeature.FigureFeatures["DateCountRate"] == 1)
            && Text.Length > 1)
        {
            // Short "N seconds ago" texts are replaced by a fixed relative-date word before parsing.
            if (Text.Contains("秒前") && Text.Length < 5)
            {
                Text = "昨日";
            }

            DateTime? Val = DateTimeParser.Parser(Text);
            if (Val != null)
            {
                double diff = Math.Abs((DateTime.Now - Val.Value).TotalDays);
                if (diff < 4096 && Text.Length < Threshold.MaxDateLength)
                {
                    Diff[i] = diff;
                    feature.FigureFeatures["DateParseCount"] += 1;
                }
            }
        }

        string Textfordigit = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node), true, true, true, false, true, false);
        MatchCollection digiText = Regex.Matches(Textfordigit, @"\d{1,9}");
        switch (digiText.Count)
        {
            case 1:
                DigitCount++;
                intone[i] = int.Parse(digiText[0].Captures[0].Value);
                havetwonums = false;
                break;
            case 2:
                DigitCount++;
                intone[i] = int.Parse(digiText[0].Captures[0].Value) - int.Parse(digiText[1].Captures[0].Value);
                break;
            default:
                havetwonums = false;
                break;
        }

        // Character features are only gathered while the accumulated text is still short.
        if (textLenSoFar < 50)
        {
            feature = CheckforChars(feature, Text, stencilfeature, false);
        }

        i += 1;
    }

    // id / class name recognition.
    feature = CheckforIdorClassName(feature, Nodes, stencilfeature, false);

    int textLenSum = TextLen.Sum();
    int digiLenSum = DigiLen.Sum();

    if (stencilfeature.FigureFeatures["DigitCountRate"] == 1)
    {
        // Floating-point division, consistent with the other rate features.
        feature.FigureFeatures["DigitCountRate"] = 10.0 * DigitCount / ItemCount;
    }

    if (stencilfeature.FigureFeatures["AvgTextLen"] == 1)
    {
        // Average() throws on an empty sequence, so an empty node set yields 0.
        feature.FigureFeatures["AvgTextLen"] = nodeCount == 0 ? 0 : TextLen.Average();
    }

    if (stencilfeature.FigureFeatures["AllTextLen"] == 1)
    {
        feature.FigureFeatures["AllTextLen"] = textLenSum;
    }

    if (stencilfeature.FigureFeatures["AvgDateDistance"] == 1)
    {
        feature.FigureFeatures["AvgDateDistance"] = nodeCount == 0 ? 0 : Diff.Average();
    }

    // Positive values feed the average; the unfiltered array is kept for the sign check below.
    int[] positives = intone.Where(v => v > 0).ToArray();
    if (stencilfeature.FigureFeatures["AvgNumber"] == 1)
    {
        feature.FigureFeatures["AvgNumber"] = positives.Length == 0 ? 0 : Math.Log(positives.Average(), 2);
    }

    if (stencilfeature.FigureFeatures["DateCountRate"] == 1 && feature.FigureFeatures["ItemCount"] != 0)
    {
        feature.FigureFeatures["DateCountRate"] = 10 * feature.FigureFeatures["DateParseCount"] / ItemCount;
    }

    if (stencilfeature.FigureFeatures["RateTitleDigits"] == 1)
    {
        feature.FigureFeatures["RateTitleDigits"] = textLenSum + digiLenSum == 0
            ? 0
            : 10 * (double)digiLenSum / (double)(textLenSum + digiLenSum);
    }

    if (stencilfeature.BoolFeatures["twonuminregularshape"] == 1)
    {
        // "Regular shape": every node has two numbers and their differences never mix signs.
        // The check must see the unfiltered differences; on the positives-only array the
        // negative test could never fail.
        bool anyPositive = positives.Length > 0;
        bool anyNegative = intone.Any(k => k < 0);
        feature.BoolFeatures["twonuminregularshape"] = (havetwonums && (!anyPositive || !anyNegative)) ? 0 : 1;
    }

    return feature;
}