/// <summary>
/// Checks whether a title is acceptable for the configured <c>Language</c>.
/// </summary>
/// <remarks>
/// The title is first cleaned with <c>TextCleaner.FullClean</c>. Two conditions must then both hold:
/// a size condition (character count for Chinese, word count for English) and a digit-ratio condition.
/// Each condition is disabled by its own setting: a non-positive <c>MinLenTitle</c> /
/// <c>MinWordCountTitle</c> turns off the size condition, and <c>MaxRateTitleDigits &gt;= 1</c> turns off
/// the digit-ratio condition. Any language other than English is handled like Chinese.
/// </remarks>
/// <param name="Title">Raw title text. Null, empty, or whitespace-only input is rejected.</param>
/// <returns><c>true</c> when the cleaned title satisfies both conditions; otherwise <c>false</c>.</returns>
public bool ValidateTitle(string Title)
{
    if (string.IsNullOrWhiteSpace(Title))
    {
        return false;
    }

    string CleanTitle = TextCleaner.FullClean(Title);

    switch (Language)
    {
        default:
        case Enums.Language.CHINESE:
            // Chinese: the title must be long enough and its share of digit characters must stay under the limit.
            return (MinLenTitle <= 0 || CleanTitle.Length >= MinLenTitle)
                   && (MaxRateTitleDigits >= 1
                       || CleanTitle.Length * MaxRateTitleDigits > TextCleaner.CountDigitChars(CleanTitle));

        case Enums.Language.ENGLISH:
            // English: the title must contain enough words and its share of digit characters must stay under the limit.
            // The size condition is parenthesised explicitly: '&&' binds tighter than '||', so without the
            // parentheses a disabled word-count check would also bypass the digit-ratio check.
            // NOTE(review): the word count uses a strict '>' while the Chinese length check uses '>='; kept as-is.
            return (MinWordCountTitle <= 0
                    || CleanTitle.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Length > MinWordCountTitle)
                   && (MaxRateTitleDigits >= 1
                       || CleanTitle.Length * MaxRateTitleDigits > TextCleaner.CountDigitChars(CleanTitle));
    }
}
/// <summary>
/// Builds a <c>Feature</c> for a group of item nodes, filling in only the entries that
/// <paramref name="stencilfeature"/> marks with the value 1.
/// </summary>
/// <remarks>
/// For every node the method looks at the cleaned text returned by
/// <c>XPathUtility.InnerTextNonDescendants</c> and collects: text length, digit-character count,
/// distance in days between now and a date parsed from the text (when date parsing is enabled),
/// and the numbers found in the text. Per-node character features are delegated to
/// <c>CheckforChars</c> while the accumulated text length is below 50, and id/class-name features to
/// <c>CheckforIdorClassName</c>.
/// </remarks>
/// <param name="Nodes">Candidate item nodes. The sequence is enumerated exactly once.</param>
/// <param name="ItemCount">Reference item count used as the denominator of the rate features.</param>
/// <param name="stencilfeature">Template whose entries equal to 1 select which features are computed.</param>
/// <returns>
/// The populated feature, or <c>null</c> when <paramref name="Nodes"/> is null or
/// <paramref name="ItemCount"/> is 0.
/// </returns>
public Feature GetFeature_ItemPage(IEnumerable <HtmlNode> Nodes, int ItemCount, Feature stencilfeature)
{
    if (Nodes == null || ItemCount == 0)
    {
        return null;
    }

    // Materialize once: the sequence may be lazy, and the count is needed many times.
    HtmlNode[] nodeArray = Nodes.ToArray();
    int nodeCount = nodeArray.Length;

    Feature feature = new Feature(0);
    feature.FigureFeatures["ItemCount"] = nodeCount;

    int[] TextLen = new int[nodeCount];
    int[] DigiLen = new int[nodeCount];
    double[] Diff = new double[nodeCount];
    int[] intone = new int[nodeCount];
    bool havetwonums = true;
    int DigitCount = 0;
    long runningTextLen = 0;

    // Date parsing is attempted only when the node count is within 20% of ItemCount and the stencil
    // asks for at least one date-related feature. Both conditions are loop-invariant.
    bool tryParseDates =
        nodeCount >= ItemCount * 0.8 && nodeCount <= ItemCount * 1.2
        && (stencilfeature.FigureFeatures["AvgDateDistance"] == 1
            || stencilfeature.FigureFeatures["DateParseCount"] == 1
            || stencilfeature.FigureFeatures["DateCountRate"] == 1);

    int i = 0;
    foreach (HtmlNode node in nodeArray)
    {
        string rawText = XPathUtility.InnerTextNonDescendants(node);
        string Text = TextCleaner.FullClean(rawText);
        TextLen[i] = Text.Length;
        DigiLen[i] = TextCleaner.CountDigitChars(Text);
        runningTextLen += Text.Length;

        if (tryParseDates && Text.Length > 1)
        {
            // Short texts containing "秒前" are replaced by "昨日" before being handed to the parser.
            // NOTE(review): presumably a workaround for relative times the parser cannot read — confirm.
            if (Text.Contains("秒前") && Text.Length < 5)
            {
                Text = "昨日";
            }

            DateTime? Val = DateTimeParser.Parser(Text);
            if (Val != null)
            {
                double diff = Math.Abs((DateTime.Now - (DateTime)Val).TotalDays);
                if (diff < 4096 && Text.Length < Threshold.MaxDateLength)
                {
                    Diff[i] = diff;
                    feature.FigureFeatures["DateParseCount"] += 1;
                }
            }
        }

        string Textfordigit = TextCleaner.FullClean(rawText, true, true, true, false, true, false);
        MatchCollection digiText = Regex.Matches(Textfordigit, @"\d{1,9}");
        switch (digiText.Count)
        {
            case 1:
                // Exactly one number: remember its value.
                DigitCount++;
                intone[i] = ParseDecimalDigits(digiText[0].Value);
                havetwonums = false;
                break;

            case 2:
                // Exactly two numbers: remember first minus second.
                DigitCount++;
                intone[i] = ParseDecimalDigits(digiText[0].Value) - ParseDecimalDigits(digiText[1].Value);
                break;

            default:
                havetwonums = false;
                break;
        }

        // Character features are only collected while the text seen so far is short.
        if (runningTextLen < 50)
        {
            feature = CheckforChars(feature, Text, stencilfeature, false);
        }

        i += 1;
    }

    // Recognition of ID and CLASS NAME.
    feature = CheckforIdorClassName(feature, (IEnumerable<HtmlNode>)nodeArray, stencilfeature, false);

    if (stencilfeature.FigureFeatures["DigitCountRate"] == 1)
    {
        // Floating-point division, consistent with DateCountRate and RateTitleDigits below;
        // integer division would truncate the rate.
        feature.FigureFeatures["DigitCountRate"] = 10.0 * DigitCount / ItemCount;
    }

    if (stencilfeature.FigureFeatures["AvgTextLen"] == 1)
    {
        // Average() throws on an empty sequence, so an empty node list yields 0.
        feature.FigureFeatures["AvgTextLen"] = nodeCount == 0 ? 0 : TextLen.Average();
    }

    if (stencilfeature.FigureFeatures["AllTextLen"] == 1)
    {
        feature.FigureFeatures["AllTextLen"] = TextLen.Sum();
    }

    if (stencilfeature.FigureFeatures["AvgDateDistance"] == 1)
    {
        // Averaged over all nodes; nodes without an accepted date contribute 0.
        feature.FigureFeatures["AvgDateDistance"] = nodeCount == 0 ? 0 : Diff.Average();
    }

    if (stencilfeature.FigureFeatures["AvgNumber"] == 1)
    {
        // Base-2 logarithm of the mean of the strictly positive per-node numbers.
        int[] positiveNumbers = intone.Where(number => number > 0).ToArray();
        feature.FigureFeatures["AvgNumber"] = positiveNumbers.Length == 0 ? 0 : Math.Log(positiveNumbers.Average(), 2);
    }

    if (stencilfeature.FigureFeatures["DateCountRate"] == 1 && feature.FigureFeatures["ItemCount"] != 0)
    {
        feature.FigureFeatures["DateCountRate"] = 10 * feature.FigureFeatures["DateParseCount"] / ItemCount;
    }

    if (stencilfeature.FigureFeatures["RateTitleDigits"] == 1)
    {
        int textTotal = TextLen.Sum();
        int digitTotal = DigiLen.Sum();
        feature.FigureFeatures["RateTitleDigits"] = textTotal + digitTotal == 0 ? 0 : 10 * (double)digitTotal / (double)(textTotal + digitTotal);
    }

    if (stencilfeature.BoolFeatures["twonuminregularshape"] == 1)
    {
        // 0 when every node held exactly two numbers and the differences never mix signs; 1 otherwise.
        // The sign test runs on the unfiltered differences so that negative values are actually seen.
        bool anyPositive = intone.Any(number => number > 0);
        bool anyNegative = intone.Any(number => number < 0);
        feature.BoolFeatures["twonuminregularshape"] = (havetwonums && !(anyPositive && anyNegative)) ? 0 : 1;
    }

    // Previously considered also including the variance of the numeric features, or the ratio of the
    // standard deviation to the mean. Would that be useful?
    return feature;
}

/// <summary>
/// Converts a run of at most nine decimal-digit characters to an <c>int</c>, accepting any Unicode
/// decimal digit that the regex class <c>\d</c> can match (not only ASCII 0-9).
/// </summary>
private static int ParseDecimalDigits(string digits)
{
    int value = 0;
    foreach (char ch in digits)
    {
        // For decimal-digit characters GetNumericValue returns 0..9; nine digits always fit in an int.
        value = value * 10 + (int)char.GetNumericValue(ch);
    }
    return value;
}