/// <summary>
/// Scores how likely a set of nodes holds media (source / publisher) names. Higher is better.
/// </summary>
/// <remarks>
/// Every node receives a possibility value derived from text heuristics (prefix match,
/// characteristic characters, id/class names, length, separators, language). The values are
/// summed, adjusted by path-level, path-kind and media-type factors, and finally divided by the
/// total number of nodes (nodes skipped for being too long still count in the divisor).
/// </remarks>
/// <param name="Nodes">Candidate nodes to score. A null or empty sequence scores 0.</param>
/// <param name="Strategy">Provides the media type and the length settings read by this method
/// (<c>MaxLenMedia</c>, <c>List_BestAvgMediaLen</c>, <c>MediaType</c>).</param>
/// <param name="BaseItemCount">Not used by this method.</param>
/// <param name="PathLevel">Level of the path that selected the nodes; smaller is better. Every
/// level above 1 removes 10% of the score (never dropping below 0).</param>
/// <param name="PathUsingName">True when the path is based on id/class names rather than on an
/// ordinal index; such paths get a 20% bonus.</param>
/// <returns>The adjusted average possibility over all nodes, or 0 when there are no nodes.</returns>
internal static double MediaNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    // Treat a missing node set like an empty one instead of failing inside the loop.
    if (Nodes == null)
    {
        return 0;
    }

    double TotalPossibility = 0;

    // Counted while iterating so that Nodes is enumerated only once.
    int NodeCount = 0;

    foreach (HtmlNode Node in Nodes)
    {
        NodeCount++;

        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));

        // Text longer than the configured maximum contributes nothing,
        // but the node still counts in the final average.
        if (Text.Length > Strategy.MaxLenMedia)
        {
            continue;
        }

        // Web news starts with a small base possibility; every other media type starts at 0.
        double Possibility = Strategy.MediaType == MediaType.WebNews ? 0.1 : 0;

        // Characters typical of date / click-count fields. They cannot rule the node out
        // completely because unusual newspaper names may contain them.
        // NOTE(review): the original comment said the possibility should become 0 here, yet the
        // code assigns 0.2 (higher than the base value) - confirm which one is intended.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 1)
        {
            Possibility = 0.2;
        }

        // Looks like a date/time: more than 60% of the characters are date characters.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
        {
            Possibility *= 0.3;
        }

        // A recognised media prefix is a strong hit: strip the prefix together with any
        // separator characters that follow it and set the possibility to 1.
        Match PrefixMatch = Regex.Match(Text, MediaPrefixRegex);
        if (PrefixMatch.Success)
        {
            Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');
            Possibility = 1;
        }

        // Remove stop words before the remaining checks.
        Text = TextCleaner.RemoveStopWords(Text, StopWords);

        // Characters that typically end a media name.
        // NOTE(review): the original comment said "check the ending", but the pattern is not
        // anchored, so a match anywhere in the text counts - confirm whether "$" is wanted.
        if (Regex.IsMatch(Text, @"[报台网刊]"))
        {
            if (Possibility > 0)
            {
                Possibility = Math.Max(1, Possibility * 1.5);
            }
            else
            {
                Possibility = 0.8;
            }
        }

        // Weight by special words in the node's id/class names. A node that had no
        // possibility so far is treated as 1 so the id/class score alone decides.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, MediaClassNames_Must, MediaClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Possibility = (Possibility == 0 ? 1 : Possibility) * IDClassScore;
        }

        // Length check against the configured best average length.
        // Empty text is handled here explicitly: with a best length of 0 the ratio below
        // would be 0 / 0 = NaN and would poison the whole sum.
        if (Text.Length == 0 || Text.Length * 3 < Strategy.List_BestAvgMediaLen || Text.Length > Strategy.List_BestAvgMediaLen * 3)
        {
            // More than 3x away from the best length: reduce to one third.
            Possibility /= 3;
        }
        else
        {
            // Rate is the larger/smaller length ratio and lies between 1 and 3.
            double Rate = Text.Length >= Strategy.List_BestAvgMediaLen
                ? Text.Length / (double)Strategy.List_BestAvgMediaLen
                : Strategy.List_BestAvgMediaLen / (double)Text.Length;

            // The factor falls from 1.1 (Rate = 1) to 0.4 (Rate = 3) as Rate grows.
            Possibility *= (0.4 + 0.35 * (3 - Rate));
        }

        // A non-name separator (spaces included) appears: multiply by 0.6.
        if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, true) > 0)
        {
            Possibility *= 0.6;
        }

        // English text: multiply by 0.6 once more.
        if (LanguageUtility.DetectedLanguage(Text) == Enums.Language.ENGLISH)
        {
            Possibility *= 0.6;
        }

        // Accumulate this node's possibility.
        TotalPossibility += Possibility;
    }

    // Without nodes there is nothing to average; return 0 instead of 0 / 0 = NaN.
    if (NodeCount == 0)
    {
        return 0;
    }

    // Adjustment 1: lower levels are better; each level above 1 removes 10%.
    // Clamped at 0 so that very deep paths cannot flip the sign of the score.
    if (PathLevel > 1)
    {
        TotalPossibility *= Math.Max(0, 1 - 0.1 * (PathLevel - 1));
    }

    // Adjustment 2: paths based on id/class beat paths based on an index: +20%.
    if (PathUsingName)
    {
        TotalPossibility *= 1.2;
    }

    // Forum media is penalised.
    if (Strategy.MediaType == Enums.MediaType.Forum)
    {
        TotalPossibility /= 2;
    }

    return TotalPossibility / NodeCount;
}