Ejemplo n.º 1
0
        /// <summary>
        /// Checks whether a title is acceptable for the currently configured <c>Language</c>.
        /// </summary>
        /// <remarks>
        /// The title is normalized with <c>TextCleaner.FullClean</c> and must then pass two checks:
        /// a size check (character count for Chinese, space-separated word count for English)
        /// and a digit-ratio check (digit characters relative to the cleaned length).
        /// A size limit &lt;= 0 disables the size check; <c>MaxRateTitleDigits</c> &gt;= 1 disables
        /// the digit-ratio check. Any language other than ENGLISH is treated like CHINESE.
        /// </remarks>
        /// <param name="Title">Raw title text. Null, empty or whitespace-only titles are rejected.</param>
        /// <returns><c>true</c> when the cleaned title passes both checks; otherwise <c>false</c>.</returns>
        public bool ValidateTitle(string Title)
        {
            if (string.IsNullOrWhiteSpace(Title))
            {
                return(false);
            }
            string CleanTitle = TextCleaner.FullClean(Title);

            switch (Language)
            {
            default:
            case Enums.Language.CHINESE:
                // Chinese: the title must be long enough, and the share of digit characters must stay under the limit.
                return((MinLenTitle <= 0 || CleanTitle.Length >= MinLenTitle) &&
                       (MaxRateTitleDigits >= 1 || CleanTitle.Length * MaxRateTitleDigits > TextCleaner.CountDigitChars(CleanTitle)));

            case Enums.Language.ENGLISH:
                // English: the title must have enough words, and the share of digit characters must stay under the limit.
                // The word-count test is parenthesized explicitly: '&&' binds tighter than '||', so without
                // the parentheses a disabled word-count limit (<= 0) would also skip the digit-ratio check.
                // NOTE(review): the word count uses '>' while the Chinese length check uses '>='; kept as-is,
                // confirm whether a title with exactly MinWordCountTitle words should be accepted.
                return((MinWordCountTitle <= 0 || CleanTitle.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Length > MinWordCountTitle) &&
                       (MaxRateTitleDigits >= 1 || CleanTitle.Length * MaxRateTitleDigits > TextCleaner.CountDigitChars(CleanTitle)));
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the feature set for a group of item nodes, filling in only those features
        /// whose entry in <paramref name="stencilfeature"/> equals 1.
        /// </summary>
        /// <param name="Nodes">Candidate item nodes. The sequence is enumerated once and buffered.</param>
        /// <param name="ItemCount">Expected item count; used as the denominator of the rate features.</param>
        /// <param name="stencilfeature">
        /// Template feature that selects which values are computed. It is dereferenced without a
        /// null check and must contain every key read below.
        /// </param>
        /// <returns>
        /// The populated feature, or <c>null</c> when <paramref name="Nodes"/> is null or
        /// <paramref name="ItemCount"/> is 0.
        /// </returns>
        public Feature GetFeature_ItemPage(IEnumerable <HtmlNode> Nodes, int ItemCount, Feature stencilfeature)
        {
            if (Nodes == null || ItemCount == 0)
            {
                return(null);
            }
            Feature feature = new Feature(0);

            // Buffer the sequence once: repeated Count() calls would re-enumerate a lazy source
            // (twice per loop iteration in the date check) and could disagree with the array sizes.
            HtmlNode[] items     = Nodes.ToArray();
            int        nodeCount = items.Length;

            feature.FigureFeatures["ItemCount"] = nodeCount;
            int[]    TextLen = new int[nodeCount];
            int[]    DigiLen = new int[nodeCount];
            double[] Diff    = new double[nodeCount];
            int[]    intone  = new int[nodeCount];

            bool havetwonums  = true;
            int  DigitCount   = 0;
            int  textLenTotal = 0;   // running sum of TextLen, so the loop does not re-sum the array each pass

            // Date parsing is only attempted when the node count is within 20% of the expected item count.
            bool countNearExpected = nodeCount >= ItemCount * 0.8 && nodeCount <= ItemCount * 1.2;

            for (int i = 0; i < nodeCount; i++)
            {
                HtmlNode node = items[i];

                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node));
                TextLen[i]    = Text.Length;
                textLenTotal += Text.Length;
                DigiLen[i]    = TextCleaner.CountDigitChars(Text);

                if (countNearExpected &&
                    (stencilfeature.FigureFeatures["AvgDateDistance"] == 1 || stencilfeature.FigureFeatures["DateParseCount"] == 1 || stencilfeature.FigureFeatures["DateCountRate"] == 1) &&
                    Text.Length > 1)
                {
                    // Short texts containing "秒前" ("seconds ago") are replaced by "昨日" ("yesterday")
                    // before parsing. NOTE(review): presumably a parser workaround; confirm the intent.
                    if (Text.Contains("秒前") && Text.Length < 5)
                    {
                        Text = "昨日";
                    }
                    DateTime? parsed = DateTimeParser.Parser(Text);
                    if (parsed != null)
                    {
                        double diff = Math.Abs((DateTime.Now - parsed.Value).TotalDays);
                        if (diff < 4096 && Text.Length < Threshold.MaxDateLength)
                        {
                            Diff[i] = diff;
                            feature.FigureFeatures["DateParseCount"] += 1;
                        }
                    }
                }

                string          Textfordigit = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node), true, true, true, false, true, false);
                MatchCollection digiText     = Regex.Matches(Textfordigit, @"\d{1,9}");
                switch (digiText.Count)
                {
                case 1:
                    // Exactly one number: record it as-is.
                    DigitCount++;
                    intone[i]   = int.Parse(digiText[0].Value);
                    havetwonums = false;
                    break;

                case 2:
                    // Exactly two numbers: record first minus second.
                    DigitCount++;
                    intone[i] = int.Parse(digiText[0].Value) - int.Parse(digiText[1].Value);
                    break;

                default:
                    havetwonums = false;
                    break;
                }

                // Character-based features are only gathered while the accumulated text is still short.
                if (textLenTotal < 50)
                {
                    feature = CheckforChars(feature, Text, stencilfeature, false);
                }
            }

            // Recognition of ID and CLASS NAME.
            feature = CheckforIdorClassName(feature, items, stencilfeature, false);
            if (stencilfeature.FigureFeatures["DigitCountRate"] == 1)
            {
                // NOTE(review): both operands are int, so this division truncates, unlike the
                // DateCountRate computation below. Kept as-is; confirm whether that is intended.
                feature.FigureFeatures["DigitCountRate"] = 10 * DigitCount / ItemCount;
            }
            if (stencilfeature.FigureFeatures["AvgTextLen"] == 1)
            {
                // Average() throws on an empty sequence; report 0 when there are no nodes.
                feature.FigureFeatures["AvgTextLen"] = nodeCount == 0 ? 0 : TextLen.Average();
            }
            if (stencilfeature.FigureFeatures["AllTextLen"] == 1)
            {
                feature.FigureFeatures["AllTextLen"] = textLenTotal;
            }
            if (stencilfeature.FigureFeatures["AvgDateDistance"] == 1)
            {
                feature.FigureFeatures["AvgDateDistance"] = nodeCount == 0 ? 0 : Diff.Average();
            }

            // Only positive values feed the AvgNumber feature. The unfiltered array is kept so that
            // the sign check further down can still see negative differences.
            int[] positiveNumbers = intone.Where(value => value > 0).ToArray();
            if (stencilfeature.FigureFeatures["AvgNumber"] == 1)
            {
                feature.FigureFeatures["AvgNumber"] = positiveNumbers.Length == 0 ? 0 : Math.Log(positiveNumbers.Average(), 2);
            }
            if (stencilfeature.FigureFeatures["DateCountRate"] == 1 && feature.FigureFeatures["ItemCount"] != 0)
            {
                feature.FigureFeatures["DateCountRate"] = 10 * feature.FigureFeatures["DateParseCount"] / ItemCount;
            }
            if (stencilfeature.FigureFeatures["RateTitleDigits"] == 1)
            {
                int digiLenTotal = DigiLen.Sum();
                int charTotal    = textLenTotal + digiLenTotal;
                feature.FigureFeatures["RateTitleDigits"] = charTotal == 0 ? 0 : 10 * (double)digiLenTotal / (double)charTotal;
            }
            if (stencilfeature.BoolFeatures["twonuminregularshape"] == 1)
            {
                // The value is 0 when every node held exactly two numbers and their differences never
                // mix signs, otherwise 1. The check runs on the unfiltered differences: testing the
                // positive-only array could never observe a negative value.
                bool hasPositive = intone.Any(value => value > 0);
                bool hasNegative = intone.Any(value => value < 0);
                feature.BoolFeatures["twonuminregularshape"] = (havetwonums && (!hasPositive || !hasNegative)) ? 0 : 1;
            }

            // Open question from the original author: would the variance of the numeric features,
            // or the ratio of standard deviation to mean, be a useful additional feature?
            return(feature);
        }