/// <summary>
/// Scoring function for candidate author nodes (higher is better).
/// Each node gets a heuristic possibility value; the values are summed, adjusted by
/// path-level and path-naming factors, and divided by the total number of nodes
/// (nodes skipped for being too long still count in the divisor).
/// </summary>
/// <param name="Nodes">Candidate nodes to score. Enumerated exactly once.</param>
/// <param name="Strategy">List strategy supplying the media type, language, maximum author length and best average author length.</param>
/// <param name="BaseItemCount">Not used by this method.</param>
/// <param name="PathLevel">Level of the path; every level above 1 removes 10% of the score.</param>
/// <param name="PathUsingName">When true, the score is increased by 20% (per the original note: paths using id/class are preferred over positional ones).</param>
/// <returns>The averaged score, or 0 when <paramref name="Nodes"/> is empty.</returns>
internal static double AuthorNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    double TotalPossibility = 0;
    int NodeCount = 0;

    foreach (HtmlNode Node in Nodes)
    {
        // Count every node, including the ones skipped below, so the divisor is the
        // size of the whole sequence without enumerating it a second time.
        NodeCount++;

        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
        if (Text.Length > Strategy.MaxLenAuthor)
        {
            continue;
        }

        // Forum pages start from a small non-zero baseline.
        double Possibility = Strategy.MediaType == Enums.MediaType.Forum ? 0.2 : 0;

        // Characters typical for dates / click counts were hit. Not excluded entirely
        // because unusual names may contain them; the value is set to 0.2.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 0)
        {
            Possibility = 0.2;
        }

        // Looks like a date/time: more than 60% of the characters are date characters.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
        {
            Possibility *= 0.3;
        }

        // Check for an author-like label (author / selector / editor / reporter).
        Match PrefixMatch = Regex.Match(Text, @"作\s*者|选\s*稿|编\s*辑|记\s*者");
        if (PrefixMatch.Success)
        {
            Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');

            // Cut the remaining text at the first space.
            int SpaceIndex = Text.IndexOf(' ');
            if (SpaceIndex > 0)
            {
                Text = Text.Substring(0, SpaceIndex);
            }

            Possibility = 1;
        }

        // Remove stop words.
        Text = TextCleaner.RemoveStopWords(Text, StopWords);

        // One of the ten listed surnames occurs in the text.
        // NOTE(review): the pattern is not anchored, so it matches anywhere in the text.
        if (Regex.IsMatch(Text, @"[李王张刘陈杨赵黄周吴]"))
        {
            if (Possibility > 0)
            {
                Possibility *= 1.1;
            }
            else
            {
                Possibility = 0.6;
            }
        }

        // Weight by special words in the node's class / id.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, AuthorClassNames_Must, AuthorClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Possibility = (Possibility == 0 ? 1 : Possibility) * IDClassScore;
        }

        // A reply marker in id / class zeroes the node (only the main post's author is wanted).
        IDClassNameScore(Node, ReplyClassNames_Must, null, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Possibility = 0;
        }

        // Length check. Empty text always takes the penalty branch; without the explicit
        // test, an empty text combined with a best length of 0 would compute 0 / 0 = NaN
        // in the ratio below and poison the whole sum.
        if (Text.Length == 0
            || Text.Length * 3 < Strategy.List_BestAvgAuthorLen
            || Text.Length > Strategy.List_BestAvgAuthorLen * 3)
        {
            // More than a factor of 3 away from the best length: reduce to one third.
            Possibility /= 3;
        }
        else
        {
            // Rate is between 1 and 3.
            double Rate = Text.Length >= Strategy.List_BestAvgAuthorLen
                ? Text.Length / (double)Strategy.List_BestAvgAuthorLen
                : Strategy.List_BestAvgAuthorLen / (double)Text.Length;

            // The factor falls from 1.1 (Rate = 1) to 0.4 (Rate = 3).
            Possibility *= (0.4 + 0.35 * (3 - Rate));
        }

        // On forums, Latin letters, digits or underscores add 50%.
        // The original character class "[a-z,0-9,_]" also matched a literal comma by mistake.
        if (Strategy.MediaType == Enums.MediaType.Forum && Regex.IsMatch(Text, @"[a-z0-9_]", RegexOptions.IgnoreCase))
        {
            Possibility *= 1.5;
        }

        // Separator characters that do not belong in a name (including spaces): multiply by 0.6.
        if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, Strategy.Language == Language.CHINESE) > 0)
        {
            Possibility *= 0.6;
        }

        // Accumulate this node's possibility.
        TotalPossibility += Possibility;
    }

    // Nothing to average: return 0 instead of dividing 0 by 0 (NaN).
    if (NodeCount == 0)
    {
        return 0;
    }

    // Adjustment 1: lower levels are better; every level above 1 removes 10%.
    // NOTE(review): the factor reaches zero at level 11 and is negative beyond that.
    if (PathLevel > 1)
    {
        TotalPossibility *= (1 - 0.1 * (PathLevel - 1));
    }

    // Adjustment 2: add 20% when PathUsingName is set.
    if (PathUsingName)
    {
        TotalPossibility *= 1.2;
    }

    return TotalPossibility / NodeCount;
}
/// <summary>
/// Scoring function for candidate media-name nodes (higher is better).
/// Each node gets a heuristic possibility value; the values are summed, adjusted by
/// path-level, path-naming and forum factors, and divided by the total number of nodes
/// (nodes skipped for being too long still count in the divisor).
/// </summary>
/// <param name="Nodes">Candidate nodes to score. Enumerated exactly once.</param>
/// <param name="Strategy">List strategy supplying the media type, maximum media-name length and best average media-name length.</param>
/// <param name="BaseItemCount">Not used by this method.</param>
/// <param name="PathLevel">Level of the path; every level above 1 removes 10% of the score.</param>
/// <param name="PathUsingName">When true, the score is increased by 20% (per the original note: paths using id/class are preferred over positional ones).</param>
/// <returns>The averaged score, or 0 when <paramref name="Nodes"/> is empty.</returns>
internal static double MediaNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    double TotalPossibility = 0;
    int NodeCount = 0;

    foreach (HtmlNode Node in Nodes)
    {
        // Count every node, including the ones skipped below, so the divisor is the
        // size of the whole sequence without enumerating it a second time.
        NodeCount++;

        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
        if (Text.Length > Strategy.MaxLenMedia)
        {
            continue;
        }

        // Web news pages start from a small non-zero baseline.
        double Possibility = Strategy.MediaType == MediaType.WebNews ? 0.1 : 0;

        // More than one character typical for dates / click counts was hit. Not excluded
        // entirely because unusual newspaper names may contain them; the value is set to 0.2.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 1)
        {
            Possibility = 0.2;
        }

        // Looks like a date/time: more than 60% of the characters are date characters.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
        {
            Possibility *= 0.3;
        }

        // Check for a media label; keep only the text that follows it.
        Match PrefixMatch = Regex.Match(Text, MediaPrefixRegex);
        if (PrefixMatch.Success)
        {
            Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');
            Possibility = 1;
        }

        // Remove stop words.
        Text = TextCleaner.RemoveStopWords(Text, StopWords);

        // Characters typical for media names (newspaper / station / site / periodical).
        // NOTE(review): the original comment says "check the ending", but the pattern is
        // not anchored and matches anywhere in the text - confirm which is intended.
        if (Regex.IsMatch(Text, @"[报台网刊]"))
        {
            if (Possibility > 0)
            {
                Possibility = Math.Max(1, Possibility * 1.5);
            }
            else
            {
                Possibility = 0.8;
            }
        }

        // Weight by special words in the node's class / id.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, MediaClassNames_Must, MediaClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Possibility = (Possibility == 0 ? 1 : Possibility) * IDClassScore;
        }

        // Length check. Empty text always takes the penalty branch; without the explicit
        // test, an empty text combined with a best length of 0 would compute 0 / 0 = NaN
        // in the ratio below and poison the whole sum.
        if (Text.Length == 0
            || Text.Length * 3 < Strategy.List_BestAvgMediaLen
            || Text.Length > Strategy.List_BestAvgMediaLen * 3)
        {
            // More than a factor of 3 away from the best length: reduce to one third.
            Possibility /= 3;
        }
        else
        {
            // Rate is between 1 and 3.
            double Rate = Text.Length >= Strategy.List_BestAvgMediaLen
                ? Text.Length / (double)Strategy.List_BestAvgMediaLen
                : Strategy.List_BestAvgMediaLen / (double)Text.Length;

            // The factor falls from 1.1 (Rate = 1) to 0.4 (Rate = 3).
            Possibility *= (0.4 + 0.35 * (3 - Rate));
        }

        // Separator characters that do not belong in a name (including spaces): multiply by 0.6.
        if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, true) > 0)
        {
            Possibility *= 0.6;
        }

        // Text detected as English: multiply by 0.6 again.
        if (LanguageUtility.DetectedLanguage(Text) == Enums.Language.ENGLISH)
        {
            Possibility *= 0.6;
        }

        // Accumulate this node's possibility.
        TotalPossibility += Possibility;
    }

    // Nothing to average: return 0 instead of dividing 0 by 0 (NaN).
    if (NodeCount == 0)
    {
        return 0;
    }

    // Adjustment 1: lower levels are better; every level above 1 removes 10%.
    // NOTE(review): the factor reaches zero at level 11 and is negative beyond that.
    if (PathLevel > 1)
    {
        TotalPossibility *= (1 - 0.1 * (PathLevel - 1));
    }

    // Adjustment 2: add 20% when PathUsingName is set.
    if (PathUsingName)
    {
        TotalPossibility *= 1.2;
    }

    // Media names on forum pages are halved.
    if (Strategy.MediaType == Enums.MediaType.Forum)
    {
        TotalPossibility /= 2;
    }

    return TotalPossibility / NodeCount;
}