TextCleaner.FullClean C# (CSharp) Exemples de code

Exemple #1

0

Afficher le fichier

Fichier : CrawlSina.cs Projet : Ailsa0910026911/Palas

        /// <summary>
        /// Extracts tweets from <paramref name="content"/> with <c>RegexContent</c>, appends each one to
        /// <c>result.Tweets</c> and returns the same tweets as an array.
        /// </summary>
        /// <param name="result">Receives the parsed tweets; <c>result.Url</c> is passed to
        /// <c>RegexParser.AbsoluteUrl</c> together with each tweet's "Url" group.</param>
        /// <param name="content">Page text to scan. Null or empty yields an empty array.</param>
        /// <returns>The tweets parsed before any failure occurred; never null.</returns>
        private Tweet[] FillUserTweet(UserTweet result, string content)
        {
            List<Tweet> tweetList = new List<Tweet>();

            // Regex.Matches throws on null input, and an empty input cannot match anything.
            if (string.IsNullOrEmpty(content))
            {
                return tweetList.ToArray();
            }

            MatchCollection matches = Regex.Matches(content, RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase);

            try
            {
                foreach (Match match in matches)
                {
                    // TryParse leaves 0 in the out variable when the group is missing or not numeric.
                    int comment;
                    int.TryParse(match.Groups["Reply"].Value, out comment);
                    int forward;
                    int.TryParse(match.Groups["Forward"].Value, out forward);

                    Tweet tweet = new Tweet();
                    tweet.Comment = comment;
                    tweet.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
                    tweet.Mid     = match.Groups["Mid"].Value;
                    tweet.Forward = forward;
                    tweet.Source  = match.Groups["Source"].Value;
                    // An unparsable date is stored as DateTime.MinValue.
                    tweet.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                    tweet.Url     = RegexParser.AbsoluteUrl(match.Groups["Url"].Value, result.Url, true);

                    result.Tweets.Add(tweet);
                    tweetList.Add(tweet);
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep the tweets parsed so far, but record why parsing stopped
                // instead of discarding the error silently.
                System.Diagnostics.Trace.TraceWarning("FillUserTweet: tweet parsing aborted: " + ex);
            }

            return tweetList.ToArray();
        }

Exemple #2

0

Afficher le fichier

        /// <summary>
        /// Downloads every URL of one category, extracts the hits with <c>baiduRegex</c> and writes one
        /// worksheet row per hit whose "PubDate" group contains "前"; finally saves the workbook.
        /// </summary>
        /// <param name="dailyWorksheet">Worksheet that receives the rows.</param>
        /// <param name="dailybook">Workbook saved after all URLs have been processed.</param>
        /// <param name="dailyStartRow">Index of the next row to fill; incremented once per written row.</param>
        /// <param name="categoryName">Written to column 2 of the first row produced by this call.</param>
        /// <param name="categoryUrls">Pages to download and scan.</param>
        private void CrawlDailyReport(Worksheet dailyWorksheet, Workbook dailybook, ref int dailyStartRow, string categoryName,
                                      string[] categoryUrls)
        {
            const RegexOptions hitOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline;
            bool categoryWritten = false;

            foreach (string pageUrl in categoryUrls)
            {
                var pageHtml = WebRequestProcessor.DownloadHTTPString(pageUrl);
                // Fixed two-second pause after every download.
                Thread.Sleep(2000);

                foreach (Match hit in Regex.Matches(pageHtml, baiduRegex, hitOptions))
                {
                    // Only hits whose date text contains "前" are reported
                    // (presumably relative "... ago" timestamps - confirm against baiduRegex).
                    if (hit.Groups["PubDate"].Value.Contains("前"))
                    {
                        int row = dailyStartRow;

                        // The category name goes on the first reported row only.
                        if (!categoryWritten)
                        {
                            dailyWorksheet.Cells[row, 2].PutValue(categoryName);
                            categoryWritten = true;
                        }

                        string hitUrl = hit.Groups["Url"].Value;
                        try
                        {
                            // Column 1: domain derived from the hit URL (used to match the media name).
                            var hostDomain = GetUrlDomain(new Uri(hitUrl).Host);
                            dailyWorksheet.Cells[row, 1].PutValue(hostDomain);
                        }
                        catch (Exception)
                        {
                            // Best effort: a URL that cannot be parsed simply leaves the domain cell empty.
                        }

                        // Column 6 text: cleaned title, a line break, then the cleaned snippet.
                        string caption = string.Concat(
                            TextCleaner.FullClean(hit.Groups["Title"].Value),
                            Environment.NewLine,
                            TextCleaner.FullClean(hit.Groups["Text"].Value));

                        var linkStyle = dailyWorksheet.Cells[row, 6].GetDisplayStyle();
                        linkStyle.Font.Color = Color.Blue;

                        // The formula refers to column B by a row number that is one greater than the cell index.
                        int formulaRow = row + 1;
                        dailyWorksheet.Cells[row, 0].PutValue(hitUrl);
                        dailyWorksheet.Cells[row, 5].Formula = string.Format("=VLOOKUP(B{0},Sheet2!A:B,2,FALSE)", formulaRow);

                        dailyWorksheet.Cells[row, 6].SetStyle(linkStyle);
                        dailyWorksheet.Cells[row, 6].PutValue(caption);
                        dailyWorksheet.Hyperlinks.Add(row, 6, 1, 1, hitUrl);

                        dailyWorksheet.Cells[row, 7].PutValue(DateTime.Now.ToString("yyyy-MM-dd"));
                        dailyWorksheet.Cells[row, 8].PutValue("负面舆情");

                        dailyStartRow = row + 1;
                    }
                }
            }

            dailybook.Save(@"D:\dailyreport\日报.xlsx");
        }

Exemple #3

0

Afficher le fichier

        /// <summary>
        /// Extracts the inner text of one field of an item: the node selected by the relative XPath under
        /// <paramref name="BaseNode"/>, or <paramref name="BaseNode"/> itself when no XPath is given.
        /// </summary>
        /// <param name="BaseNode">Root node of a single item.</param>
        /// <param name="RelXPath">XPath relative to <paramref name="BaseNode"/>; null or blank means "use the base node".</param>
        /// <param name="postion">Not used by this implementation.</param>
        /// <param name="CleanConnectionMark">True to call <c>TextCleaner.FullClean</c> with its default flags;
        /// false to call it with the explicit flags (true, true, true, false, true, false).</param>
        /// <returns>The cleaned text, or null when <paramref name="BaseNode"/> is null or the XPath selects
        /// no node with non-empty text.</returns>
        internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
        {
            if (BaseNode == null)
            {
                return null;
            }

            string innerTextValue;

            if (string.IsNullOrWhiteSpace(RelXPath))
            {
                innerTextValue = XPathUtility.InnerTextNonDescendants(BaseNode);
            }
            else
            {
                innerTextValue = "";

                // First attempt: evaluate the XPath through a navigator and take the selected value.
                try
                {
                    HtmlNodeNavigator navigator = (HtmlNodeNavigator)BaseNode.CreateNavigator();
                    var node = navigator.SelectSingleNode(RelXPath);
                    if (node != null)
                    {
                        innerTextValue = node.Value;
                    }
                }
                catch (Exception)
                {
                    // Deliberate fallback: any navigator failure leaves innerTextValue empty so that
                    // the node-based lookup below gets its chance.
                }

                // Second attempt: first selected node whose own (non-descendant) text is not empty.
                if (string.IsNullOrWhiteSpace(innerTextValue))
                {
                    IEnumerable<HtmlNode> MatchNodes = BaseNode.SelectNodes(RelXPath);
                    HtmlNode firstWithText = MatchNodes == null
                        ? null
                        : MatchNodes.FirstOrDefault(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)));
                    if (firstWithText == null)
                    {
                        return null;
                    }

                    innerTextValue = XPathUtility.InnerTextNonDescendants(firstWithText);
                }
            }

            if (CleanConnectionMark)
            {
                return TextCleaner.FullClean(innerTextValue);
            }

            return TextCleaner.FullClean(innerTextValue, true, true, true, false, true, false);
        }

Exemple #4

0

Afficher le fichier

Fichier : RegexParser.cs Projet : mlzboy/list_discovery

        /// <summary>
        /// Copies the named groups of a regex match ("Url", "Title", "Text", "AuthorName", "Source",
        /// "MediaName", "PubDate") into an article. For the string fields the article's current value is
        /// passed to <c>RegexUtility.TryGetString</c> as its third argument (presumably the fallback
        /// used when the group is absent - confirm against RegexUtility).
        /// </summary>
        /// <param name="m">Match produced by the item regex.</param>
        /// <param name="Item">Article to fill; only its members are modified, the reference is never reassigned.</param>
        /// <param name="BaseUrl">Absolute URL against which the extracted item URL is resolved.</param>
        /// <param name="ItemUrlCaseSensitive">Not used by this method.</param>
        public static void Match2Item(Match m, ref Article Item, string BaseUrl, bool ItemUrlCaseSensitive = false)
        {
            // URL: resolve the extracted (possibly relative) link against the page URL.
            Item.Url = new Uri(new Uri(BaseUrl), RegexUtility.TryGetString(m, "Url", Item.Url, false)).AbsoluteUri;

            // Title
            Item.Title = RegexUtility.TryGetString(m, "Title", Item.Title);
            // Lower the cleaning level: when the regular extraction left nothing, fall back to an
            // HTML-only clean of the raw captured title. The previous code re-cleaned the already
            // empty Item.Title, which could never produce a value.
            if (string.IsNullOrEmpty(Item.Title) && m.Groups["Title"].Success)
            {
                Item.Title = HTMLCleaner.CleanHTML(m.Groups["Title"].Value, true);
            }

            // Text
            Item.HtmlContent = RegexUtility.TryGetString(m, "Text", Item.HtmlContent, false);

            // Author info
            Item.Author = RegexUtility.TryGetString(m, "AuthorName", Item.Author);
            Item.Source = RegexUtility.TryGetString(m, "Source", Item.Source);

            if (!String.IsNullOrWhiteSpace(Item.Source))
            {
                Item.Source = TextCleaner.FullClean(Item.Source);
            }

            // Media info
            Item.MediaName = RegexUtility.TryGetString(m, "MediaName", Item.MediaName);

            // Publication time
            if (m.Groups["PubDate"].Success)
            {
                Item.PubDate = DateTimeParser.Parser(HTMLCleaner.CleanHTML(m.Groups["PubDate"].Value, true));
            }

            // An unset date (DateTime.MinValue) is replaced by the current time.
            // NOTE(review): if Item.PubDate is a nullable DateTime, a failed parse leaves it null and this
            // comparison is false, so null is NOT replaced - confirm whether that is intended.
            if (Item.PubDate <= DateTime.MinValue)
            {
                Item.PubDate = DateTime.Now;
            }

            Match2ItemCount(m, Item.ViewDataList);
        }

Exemple #5

0

Afficher le fichier

        /// <summary>
        /// Checks whether a title is acceptable for the configured <c>Language</c>.
        /// </summary>
        /// <param name="Title">Raw title text.</param>
        /// <returns>
        /// False for a null or blank title. Otherwise true when the cleaned title is long enough
        /// (characters for Chinese and every other language, space-separated words for English) and
        /// its share of digit characters stays below <c>MaxRateTitleDigits</c>.
        /// </returns>
        public bool ValidateTitle(string Title)
        {
            if (string.IsNullOrWhiteSpace(Title))
            {
                return false;
            }

            string CleanTitle = TextCleaner.FullClean(Title);

            // Digit-ratio rule shared by both languages; a rate of 1 or more disables it.
            bool DigitRateOk = MaxRateTitleDigits >= 1 ||
                               CleanTitle.Length * MaxRateTitleDigits > TextCleaner.CountDigitChars(CleanTitle);

            switch (Language)
            {
            case Enums.Language.ENGLISH:
                // English: enough words AND an acceptable digit ratio. The length rule is parenthesised
                // so the digit rule still applies when MinWordCountTitle <= 0; without the parentheses
                // "a || b && c" skipped the digit check in that case.
                int WordCount = CleanTitle.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Length;
                return (MinWordCountTitle <= 0 || WordCount > MinWordCountTitle) && DigitRateOk;

            case Enums.Language.CHINESE:
            default:
                // Chinese (and any other language): enough characters AND an acceptable digit ratio.
                return (MinLenTitle <= 0 || CleanTitle.Length >= MinLenTitle) && DigitRateOk;
            }
        }

Exemple #6

0

Afficher le fichier

Fichier : XpathParser.cs Projet : mlzboy/list_discovery

        /// <summary>
        /// Extracts the inner text of one field of an item: the <paramref name="postion"/>-th node (counting
        /// only nodes with non-empty own text) selected by the relative XPath under
        /// <paramref name="BaseNode"/>, or <paramref name="BaseNode"/> itself when no XPath is given.
        /// </summary>
        /// <param name="BaseNode">Root node of a single item.</param>
        /// <param name="RelXPath">XPath relative to <paramref name="BaseNode"/>; null or blank means "use the base node".</param>
        /// <param name="postion">Zero-based index into the selected nodes that have non-empty text.</param>
        /// <param name="CleanConnectionMark">True to call <c>TextCleaner.FullClean</c> with its default flags;
        /// false to call it with the explicit flags (true, true, true, false, true, false).</param>
        /// <returns>The cleaned text, or null when <paramref name="BaseNode"/> is null, the index is negative,
        /// or no node exists at that index.</returns>
        internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
        {
            // A negative index used to reach ElementAt and throw; it can never address a node.
            if (BaseNode == null || postion < 0)
            {
                return null;
            }

            string rawText;

            if (string.IsNullOrWhiteSpace(RelXPath))
            {
                // Without an XPath the base node is the only candidate, so only index 0 exists.
                // Previously a non-zero index fell through to SelectNodes with a blank XPath.
                if (postion != 0)
                {
                    return null;
                }

                rawText = XPathUtility.InnerTextNonDescendants(BaseNode);
            }
            else
            {
                IEnumerable<HtmlNode> MatchNodes = BaseNode.SelectNodes(RelXPath);
                if (MatchNodes == null)
                {
                    return null;
                }

                // Single pass: skip to the requested index among the nodes that carry text.
                HtmlNode target = MatchNodes
                    .Where(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)))
                    .Skip(postion)
                    .FirstOrDefault();
                if (target == null)
                {
                    return null;
                }

                rawText = XPathUtility.InnerTextNonDescendants(target);
            }

            if (CleanConnectionMark)
            {
                return TextCleaner.FullClean(rawText);
            }

            return TextCleaner.FullClean(rawText, true, true, true, false, true, false);
        }

Exemple #7

0

Afficher le fichier

        /// <summary>
        /// Scoring function for candidate "author" nodes (higher is better).
        /// </summary>
        /// <param name="Nodes">Candidate nodes to score.</param>
        /// <param name="Strategy">Supplies MaxLenAuthor, MediaType, List_BestAvgAuthorLen and Language.</param>
        /// <param name="BaseItemCount">Not used by this method.</param>
        /// <param name="PathLevel">Level of the path; each level above 1 removes 10% of the score.</param>
        /// <param name="PathUsingName">True when the path uses id/class instead of an index; adds 20%.</param>
        /// <returns>Sum of the per-node scores after both path adjustments, divided by the number of nodes;
        /// 0 when <paramref name="Nodes"/> is empty.</returns>
        internal static double AuthorNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            double AvgPossibility = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
                if (Text.Length > Strategy.MaxLenAuthor)
                {
                    // Too long: contributes nothing, but the node still counts in the final average.
                    continue;
                }

                // Forum lists start from a small prior; other media types start from zero.
                double Posibility = Strategy.MediaType == Enums.MediaType.Forum ? 0.2 : 0;

                // Characters typical of dates or view counts were hit. They cannot rule the node out
                // (unusual names exist), so the score is set to 0.2.
                // NOTE(review): the original comment said "score 0 and continue"; the code assigns 0.2.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 0)
                {
                    Posibility = 0.2;
                }

                // Mostly date/time characters: strong reduction.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
                {
                    Posibility *= 0.3;
                }

                // An explicit label (author / selected by / editor / reporter) makes the node certain.
                Match PrefixMatch = Regex.Match(Text, @"作\s*者|选\s*稿|编\s*辑|记\s*者");
                if (PrefixMatch.Success)
                {
                    Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', '：', ']', '】', ' ');

                    // Cut at the first space so only the name directly after the label remains.
                    int SpaceIndex = Text.IndexOf(' ');
                    if (SpaceIndex > 0)
                    {
                        Text = Text.Substring(0, SpaceIndex);
                    }

                    Posibility = 1;
                }

                // Remove stop words before the remaining checks.
                Text = TextCleaner.RemoveStopWords(Text, StopWords);

                // One of the ten surnames listed in the pattern occurs in the text.
                if (Regex.IsMatch(Text, @"[李王张刘陈杨赵黄周吴]"))
                {
                    if (Posibility > 0)
                    {
                        Posibility *= 1.1;
                    }
                    else
                    {
                        Posibility = 0.6;
                    }
                }

                // Weight by characteristic words in the node's class/id.
                bool   IDClassNameMatched;
                double IDClassScore = IDClassNameScore(Node, AuthorClassNames_Must, AuthorClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
                }

                // A "reply" id/class zeroes the score: only the author of the main post is wanted.
                IDClassNameScore(Node, ReplyClassNames_Must, null, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    Posibility = 0;
                }

                // Length check against the expected average author length.
                if (Text.Length * 3 < Strategy.List_BestAvgAuthorLen || Text.Length > Strategy.List_BestAvgAuthorLen * 3)
                {
                    Posibility /= 3; // more than a factor of 3 away: reduce to one third
                }
                else
                {
                    // Rate lies between 1 and 3: the longer length divided by the shorter one.
                    double Rate = Text.Length >= Strategy.List_BestAvgAuthorLen ? Text.Length / (double)Strategy.List_BestAvgAuthorLen
                        : Strategy.List_BestAvgAuthorLen / (double)Text.Length;
                    // The factor falls from 1.1 (Rate 1) to 0.4 (Rate 3).
                    Posibility *= (0.4 + 0.35 * (3 - Rate));
                }

                // Forum names containing Latin letters, digits or underscore: +50%.
                // The previous pattern "[a-z,0-9,_]+" also matched a literal comma.
                if (Strategy.MediaType == Enums.MediaType.Forum && Regex.IsMatch(Text, @"[a-z0-9_]", RegexOptions.IgnoreCase))
                {
                    Posibility *= 1.5;
                }

                // A separator that does not belong in a name (including space) occurs: factor 0.6.
                if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, Strategy.Language == Language.CHINESE) > 0)
                {
                    Posibility *= 0.6;
                }

                // Accumulate this node's score.
                AvgPossibility += Posibility;
            }

            // Adjustment 1: lower levels are better; each level above 1 costs 10%.
            if (PathLevel > 1)
            {
                AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
            }

            // Adjustment 2: paths using id/class beat index-based paths; add 20%.
            if (PathUsingName)
            {
                AvgPossibility *= 1.2;
            }

            // An empty node set used to yield 0.0 / 0 = NaN; report "no evidence" as 0 instead.
            int NodeCount = Nodes.Count();
            return NodeCount == 0 ? 0 : AvgPossibility / NodeCount;
        }

Exemple #8

0

Afficher le fichier

        /// <summary>
        /// Scoring function for candidate "media name" nodes (higher is better).
        /// </summary>
        /// <param name="Nodes">Candidate nodes to score.</param>
        /// <param name="Strategy">Supplies MaxLenMedia, MediaType and List_BestAvgMediaLen.</param>
        /// <param name="BaseItemCount">Not used by this method.</param>
        /// <param name="PathLevel">Level of the path; each level above 1 removes 10% of the score.</param>
        /// <param name="PathUsingName">True when the path uses id/class instead of an index; adds 20%.</param>
        /// <returns>Sum of the per-node scores after all adjustments, divided by the number of nodes;
        /// 0 when <paramref name="Nodes"/> is empty.</returns>
        internal static double MediaNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            double AvgPossibility = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
                if (Text.Length > Strategy.MaxLenMedia)
                {
                    // Too long: contributes nothing, but the node still counts in the final average.
                    continue;
                }

                // Web news lists start from a small prior; other media types start from zero.
                double Posibility = Strategy.MediaType == MediaType.WebNews ? 0.1 : 0;

                // More than one character typical of dates or view counts. The node cannot be ruled
                // out (unusual newspaper names exist), so the score is set to 0.2.
                // NOTE(review): the original comment said "score 0 and continue"; the code assigns 0.2.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 1)
                {
                    Posibility = 0.2;
                }

                // Mostly date/time characters: strong reduction.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
                {
                    Posibility *= 0.3;
                }

                // A media prefix match makes the node certain; keep only the text after it.
                Match PrefixMatch = Regex.Match(Text, MediaPrefixRegex);
                if (PrefixMatch.Success)
                {
                    Text       = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', '：', ']', '】', ' ');
                    Posibility = 1;
                }

                // Remove stop words before the remaining checks.
                Text = TextCleaner.RemoveStopWords(Text, StopWords);

                // Characters typical of media names.
                // NOTE(review): the original comment said "check the ending", but the pattern matches
                // anywhere in the text - confirm the intent.
                if (Regex.IsMatch(Text, @"[报台网刊]"))
                {
                    if (Posibility > 0)
                    {
                        Posibility = Math.Max(1, Posibility * 1.5);
                    }
                    else
                    {
                        Posibility = 0.8;
                    }
                }

                // Weight by characteristic words in the node's class/id.
                bool   IDClassNameMatched;
                double IDClassScore = IDClassNameScore(Node, MediaClassNames_Must, MediaClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
                }

                // Length check against the expected average media-name length.
                if (Text.Length * 3 < Strategy.List_BestAvgMediaLen || Text.Length > Strategy.List_BestAvgMediaLen * 3)
                {
                    Posibility /= 3; // more than a factor of 3 away: reduce to one third
                }
                else
                {
                    // Rate lies between 1 and 3: the longer length divided by the shorter one.
                    double Rate = Text.Length >= Strategy.List_BestAvgMediaLen ? Text.Length / (double)Strategy.List_BestAvgMediaLen
                        : Strategy.List_BestAvgMediaLen / (double)Text.Length;
                    // The factor falls from 1.1 (Rate 1) to 0.4 (Rate 3).
                    Posibility *= (0.4 + 0.35 * (3 - Rate));
                }

                // A separator that does not belong in a name (including space) occurs: factor 0.6.
                if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, true) > 0)
                {
                    Posibility *= 0.6;
                }

                // English text: another factor 0.6.
                if (LanguageUtility.DetectedLanguage(Text) == Enums.Language.ENGLISH)
                {
                    Posibility *= 0.6;
                }

                // Accumulate this node's score.
                AvgPossibility += Posibility;
            }

            // Adjustment 1: lower levels are better; each level above 1 costs 10%.
            if (PathLevel > 1)
            {
                AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
            }

            // Adjustment 2: paths using id/class beat index-based paths; add 20%.
            if (PathUsingName)
            {
                AvgPossibility *= 1.2;
            }

            // Forum media are halved.
            if (Strategy.MediaType == Enums.MediaType.Forum)
            {
                AvgPossibility /= 2;
            }

            // An empty node set used to yield 0.0 / 0 = NaN; report "no evidence" as 0 instead.
            int NodeCount = Nodes.Count();
            return NodeCount == 0 ? 0 : AvgPossibility / NodeCount;
        }

Exemple #9

0

Afficher le fichier

        /// <summary>
        /// Scoring function for candidate view/reply counter nodes (higher is better).
        /// </summary>
        /// <param name="Nodes">Candidate nodes to score.</param>
        /// <param name="Strategy">Supplies MaxLenView.</param>
        /// <param name="BaseItemCount">Used only as the initial capacity of the number lists.</param>
        /// <param name="PathLevel">Level of the path; each level above 1 removes 10% of the score.</param>
        /// <param name="PathUsingName">True when the path uses id/class instead of an index; adds 20%.</param>
        /// <param name="Contain2Numbers">Set to true when every parsed element contains two numbers.</param>
        /// <param name="MustBeReply">Set to true when the class/id or the text explicitly indicates a reply counter.</param>
        /// <param name="AvgNumber">Average of the parsed numbers, used to tell views from replies. With two
        /// numbers per element it has length 2 and the larger average comes first; null when nothing was parsed.</param>
        /// <returns>The score; 0 when the nodes cannot be a view/reply counter or <paramref name="Nodes"/> is empty.</returns>
        internal static double ViewNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName, out bool Contain2Numbers, out bool MustBeReply, out double[] AvgNumber)
        {
            Contain2Numbers = false;
            MustBeReply     = false;
            AvgNumber       = null;

            // Pass 1: average density of view/reply characters over all nodes.
            double AvgCharFreq = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true, false, true, false);
                // Over-long texts are ignored. Empty texts are skipped too: their density used to be
                // computed as 0 / 0 = NaN, which silently disabled the range check below.
                if (Text.Length == 0 || Text.Length > Strategy.MaxLenView)
                {
                    continue;
                }

                // Any character that can only belong to a date rules the whole pattern out.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustDate, false) > 0)
                {
                    return 0;
                }

                int CharCount = TextCleaner.CountCharInSet(Text, FrequentCharSet_ViewReply);
                AvgCharFreq += CharCount / (double)Text.Length;
            }

            // No nodes: nothing to score (previously this path ended in Max() on an empty list).
            int NodeCount = Nodes.Count();
            if (NodeCount == 0)
            {
                return 0;
            }

            // A density outside the accepted range means the pattern is rejected.
            AvgCharFreq = AvgCharFreq / NodeCount;
            if (AvgCharFreq < FrequentChars_Min || AvgCharFreq > FrequentChars_Max)
            {
                return 0;
            }

            // Pass 2: extract the digit runs of every text, count them and record their values.
            int       TotalCount = 0;
            int       ParsedNode = 0;
            List<int> Ints0      = new List<int>(BaseItemCount);
            List<int> Ints1      = new List<int>(BaseItemCount);

            int    ViewNameMatched = 0;
            double IDClassScore    = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true, false, true, false);
                if (Text.Length > Strategy.MaxLenView)
                {
                    continue;
                }

                // At most 9 digits per run, so int.Parse cannot overflow.
                MatchCollection digiText = Regex.Matches(Text, @"\d{1,9}");

                if (digiText.Count == 0)
                {
                    continue;
                }
                // More than two numbers in one element: cannot be a view/reply counter.
                if (digiText.Count > 2)
                {
                    return 0;
                }

                TotalCount += digiText.Count;
                // Record every parsed number.
                Ints0.Add(int.Parse(digiText[0].Value));
                if (digiText.Count > 1)
                {
                    Ints1.Add(int.Parse(digiText[1].Value));
                }

                // Weight by characteristic words in the node's class/id.
                bool   IDClassNameMatched;
                double s = IDClassNameScore(Node, ViewClassNames_Must, ViewClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    IDClassScore += s;
                    ViewNameMatched++;

                    // Only a positive hit is examined for "reply". The reply result is kept in its own
                    // flag: previously a non-positive hit left the view-match flag set and marked the
                    // pattern as reply by mistake.
                    if (s > 0)
                    {
                        bool ReplyNameMatched;
                        IDClassNameScore(Node, ReplyClassNames_Must, null, out ReplyNameMatched);
                        if (ReplyNameMatched)
                        {
                            MustBeReply = true;
                        }
                    }
                }

                // A reply keyword in the text also marks the pattern as reply.
                foreach (string ReplyKeyword in ReplyWords)
                {
                    if (Text.Contains(ReplyKeyword))
                    {
                        MustBeReply = true;
                    }
                }

                ParsedNode++;
            }

            // Averages are summed as double: the Int32 Sum() overload throws on overflow, and
            // dividing two ints truncated the single-number average.
            if (Ints1.Count > 0 && Ints1.Count == Ints0.Count)
            {
                // Every parsed element contains two numbers; the larger average goes first.
                double Sum0 = Ints0.Sum(v => (double)v);
                double Sum1 = Ints1.Sum(v => (double)v);
                AvgNumber    = new double[2];
                AvgNumber[0] = (Sum0 > Sum1 ? Sum0 : Sum1) / ParsedNode;
                AvgNumber[1] = (Sum0 > Sum1 ? Sum1 : Sum0) / ParsedNode;

                Contain2Numbers = true;
            }
            else if (Ints0.Count > 0 && ParsedNode > 0)
            {
                AvgNumber    = new double[1];
                AvgNumber[0] = Ints0.Sum(v => (double)v) / ParsedNode;
            }
            else
            {
                AvgCharFreq = 0;
            }

            if (TotalCount == NodeCount)
            {
                // As many numbers as nodes (one number per text): +20% when all values are at most 31, else +50%.
                AvgCharFreq *= (Ints0.Max() <= 31) ? 1.2 : 1.5;
            }
            else if (Ints0.Count > 0)
            {
                // Extreme-value check: a (<=12, <=31) pair in either order looks like month/day.
                // Second numbers may be missing altogether; Max() on the empty list used to throw here.
                // NOTE(review): without second numbers the date hypothesis cannot be confirmed, so the
                // non-date factor is applied, as the code already did whenever the first maximum
                // exceeded 31 - confirm this is the desired weighting.
                int  Max0 = Ints0.Max();
                bool LooksLikeMonthDay = Ints1.Count > 0 &&
                                         (Max0 <= 12 && Ints1.Max() <= 31 || Max0 <= 31 && Ints1.Max() <= 12);
                if (LooksLikeMonthDay)
                {
                    AvgCharFreq *= 0.8;
                }
                else // date ruled out: the score doubles
                {
                    AvgCharFreq *= 2;
                }
            }

            // Apply the average class/id score of the nodes that matched.
            if (ViewNameMatched > 0)
            {
                IDClassScore /= ViewNameMatched;
                AvgCharFreq   = (AvgCharFreq == 0 ? 1 : AvgCharFreq) * IDClassScore;
            }

            // Adjustment 1: lower levels are better; each level above 1 costs 10%.
            if (PathLevel > 1)
            {
                AvgCharFreq *= (1 - 0.1 * (PathLevel - 1));
            }

            // Adjustment 2: paths using id/class beat index-based paths; add 20%.
            if (PathUsingName)
            {
                AvgCharFreq *= 1.2;
            }

            return AvgCharFreq;
        }

Exemple #10

0

Afficher le fichier

        /// <summary>
        /// Evaluates a candidate "date" node pattern from the number of parsable dates and their distance
        /// from now (higher is better).
        /// </summary>
        /// <param name="Nodes">Candidate nodes to score.</param>
        /// <param name="Strategy">Supplies MaxLenDate.</param>
        /// <param name="BaseItemCount">Expected number of items; the number of parsed dates must lie
        /// within one third and three times this value.</param>
        /// <param name="PathLevel">Level of the path; each level above 1 removes 20% of the score.</param>
        /// <param name="PathUsingName">True when the path uses id/class instead of an index; adds 20%.</param>
        /// <returns>The score; 0 when no date could be parsed or the count is out of range.</returns>
        internal static double DateNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            // DateTimeParser.Parser is used directly, so no character-density check is needed.
            double SumDiff    = 0;
            int    ParseCount = 0;

            // Accumulate the distance between every parsed date and now.
            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true);
                if (Text.Length > Strategy.MaxLenDate)
                {
                    continue;
                }

                DateTime? Val = DateTimeParser.Parser(Text);
                if (Val == null)
                {
                    continue;
                }

                ParseCount++;
                DateTime d = Val.Value;

                // A class/id hit divides the distance by its score.
                bool   IDClassNameMatched;
                double IDClassScore = IDClassNameScore(Node, DateClassNames_Must, DateClassNames_MustNot, out IDClassNameMatched);
                double Weight       = IDClassNameMatched ? IDClassScore : 1;

                double Days = (DateTime.Now - d).TotalDays;
                // A value with a time of day counts only half of its distance; a date-only value counts fully.
                bool DateOnly = d.Hour == 0 && d.Minute == 0 && d.Second == 0;
                SumDiff += Math.Abs((DateOnly ? Days : Days / 2) / Weight);
            }

            // Nothing parsed: no evidence. This also removes the division by zero that occurred when
            // both counts were 0.
            if (ParseCount == 0)
            {
                return 0;
            }

            // The count must lie within 1/3 and 3 times the expected item count.
            if (ParseCount * 3 < BaseItemCount || BaseItemCount * 3 < ParseCount)
            {
                return 0;
            }

            // Count score: smaller count divided by larger count. The original integer arithmetic
            // (1 / (a / b)) could only produce 0 or 1.
            double Possibility = ParseCount >= BaseItemCount
                ? (double)BaseItemCount / ParseCount
                : (double)ParseCount / BaseItemCount;

            // Scale by the average distance in days.
            double AvgDiff = SumDiff / ParseCount;

            if (AvgDiff > 180) // beyond 180 days: flat 30%
            {
                Possibility *= 0.3;
            }
            else if (AvgDiff > 7) // one week to 180 days: falls linearly from 100% to 30%
            {
                Possibility *= (0.3 + 0.004046 * (180 - AvgDiff));
            }
            else if (AvgDiff > 1) // within one week: 100% to 150%
            {
                Possibility *= (1 + 0.08333 * (7 - AvgDiff));
            }
            else // within one day: 150% to 200%, which helps to tell post time from reply time
            {
                Possibility *= (1.5 + 0.5 * (1 - AvgDiff));
            }

            // Adjustment 1: lower levels are better; each level above 1 costs 20%.
            if (PathLevel > 1)
            {
                Possibility *= (1 - 0.2 * (PathLevel - 1));
            }

            // Adjustment 2: paths using id/class beat index-based paths; add 20%.
            if (PathUsingName)
            {
                Possibility *= 1.2;
            }

            return Possibility;
        }

Exemple #11

0

Afficher le fichier

Fichier : CrawlSina.cs Projet : Ailsa0910026911/Palas

        /// <summary>
        /// Downloads the comment pages of a tweet one by one and appends every parsed comment to
        /// <c>tweet.Comments</c>. Stops when a page cannot be fetched, when a page after the first does not
        /// report the requested page number, or when any step fails.
        /// </summary>
        /// <param name="tweet">Tweet whose comments are collected; nothing is done when its <c>Comment</c> value is 0.</param>
        /// <param name="site">Site settings passed through to <c>GeckoRequestProcessor.DoRequest</c>.</param>
        private void FillTweetComment(Tweet tweet, SiteEntity site)
        {
            if (tweet.Comment == 0)
            {
                return;
            }

            const int MaxAttempts = 5;
            int       currentPage = 1;
            string    mid         = tweet.Mid;

            try
            {
                while (true)
                {
                    string url     = string.Format(CommentUrlFormat, mid, currentPage);
                    var    request = BuildRequest(url);

                    // Try the request up to MaxAttempts times.
                    CrawlResponse response = null;
                    for (int i = 0; i < MaxAttempts; i++)
                    {
                        response = null;
                        try
                        {
                            response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                            AggrSum();
                        }
                        catch (Exception ex)
                        {
                            // A failed attempt is logged and retried; previously the exception was
                            // swallowed and the null response was dereferenced right afterwards.
                            Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
                        }

                        if (response != null && response.Status == Enums.CrawlResult.Succ)
                        {
                            break;
                        }
                        if (response != null)
                        {
                            Logger.Info("访问页面错误:Url = " + response.Url);
                        }
                    }

                    // Every attempt failed: stop instead of trying to parse a failed response.
                    if (response == null || response.Status != Enums.CrawlResult.Succ)
                    {
                        return;
                    }

                    // Trim(char[]) strips any of the characters of "</pre>" from both ends of the content,
                    // which removes a surrounding <pre>...</pre> wrapper around the JSON.
                    CommentJsonResponse tmpResult =
                        JsonConvert.DeserializeObject<CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
                    response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);

                    // Every page after the first must report the page number that was requested;
                    // otherwise the last page has been passed.
                    var pageMatch = Regex.Match(response.Content, RegexCommentPage,
                                                RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    if (currentPage != 1 &&
                        (!pageMatch.Success ||
                         pageMatch.Groups["CurrentPageNum"].Value != currentPage.ToString(CultureInfo.InvariantCulture)))
                    {
                        return;
                    }

                    // Fill the tweet with the comments of this page.
                    var matches = Regex.Matches(response.Content, RegexComment,
                                                RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    foreach (Match match in matches)
                    {
                        Comment comment = new Comment();
                        comment.Author    = match.Groups["Author"].Value;
                        comment.AuthorUrl = RegexParser.AbsoluteUrl(match.Groups["AuthorUrl"].Value, tweet.Url, true);
                        comment.Content   = TextCleaner.FullClean(match.Groups["Content"].Value);
                        // An unparsable date is stored as DateTime.MinValue.
                        comment.PubDate   = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                        tweet.Comments.Add(comment);
                    }

                    currentPage++;
                }
            }
            catch (Exception ex)
            {
                // Best effort: comments collected so far are kept, but the reason for stopping is logged.
                Logger.Info("FillTweetComment aborted: Mid = " + mid + ", page = " + currentPage + ", " + ex.Message);
            }
        }

Exemple #12

0

Afficher le fichier

Fichier : SoftStrategy.cs Projet : mlzboy/list_discovery

        /// <summary>
        /// Builds a <see cref="Feature"/> holding text-length, digit, number and date statistics for
        /// a set of nodes. Apart from "ItemCount" and the running "DateParseCount", a figure is only
        /// written when the matching entry of <paramref name="stencilfeature"/> equals 1.
        /// </summary>
        /// <param name="Nodes">Nodes to measure. The sequence is enumerated once and buffered.</param>
        /// <param name="ItemCount">Item count used as the denominator of the rate figures and as the
        /// reference for the 80%-120% node-count window that enables date parsing.</param>
        /// <param name="stencilfeature">Feature whose entries equal to 1 select the figures to compute.</param>
        /// <returns>The populated feature, or null when <paramref name="Nodes"/> is null or
        /// <paramref name="ItemCount"/> is 0.</returns>
        public Feature GetFeature_ItemPage(IEnumerable <HtmlNode> Nodes, int ItemCount, Feature stencilfeature)
        {
            if (Nodes == null || ItemCount == 0)
            {
                return(null);
            }

            // Buffer the sequence once: the previous code called Nodes.Count() several times per
            // iteration, re-enumerating the source each time.
            List <HtmlNode> nodeList  = Nodes.ToList();
            int             nodeCount = nodeList.Count;

            Feature feature = new Feature(0);

            feature.FigureFeatures["ItemCount"] = nodeCount;
            int[]    TextLen = new int[nodeCount];
            int[]    DigiLen = new int[nodeCount];
            double[] Diff    = new double[nodeCount];
            int[]    intone  = new int[nodeCount];
            int      i       = 0;

            bool havetwonums  = true; // stays true only while every node yields exactly two numbers
            int  DigitCount   = 0;    // nodes that yielded one or two numbers
            int  textLenSoFar = 0;    // running sum of TextLen[0..i]

            // Loop-invariant: is the node count within 80%-120% of the expected item count?
            bool countNearExpected = nodeCount >= ItemCount * 0.8 && nodeCount <= ItemCount * 1.2;

            foreach (HtmlNode node in nodeList)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node));
                TextLen[i]    = Text.Length;
                DigiLen[i]    = TextCleaner.CountDigitChars(Text);
                textLenSoFar += Text.Length;

                if (countNearExpected && (stencilfeature.FigureFeatures["AvgDateDistance"] == 1 || stencilfeature.FigureFeatures["DateParseCount"] == 1 || stencilfeature.FigureFeatures["DateCountRate"] == 1) && Text.Length > 1)
                {
                    // Short texts containing "秒前" ("seconds ago") are replaced by "昨日" ("yesterday")
                    // before parsing; the replaced text is also what CheckforChars receives below.
                    if (Text.Contains("秒前") && Text.Length < 5)
                    {
                        Text = "昨日";
                    }
                    DateTime?Val = DateTimeParser.Parser(Text);
                    if (Val != null)
                    {
                        double diff = Math.Abs((DateTime.Now - Val.Value).TotalDays);
                        if (diff < 4096 && Text.Length < Threshold.MaxDateLength)
                        {
                            Diff[i] = diff;
                            feature.FigureFeatures["DateParseCount"] += 1;
                        }
                    }
                }

                string          Textfordigit = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node), true, true, true, false, true, false);
                MatchCollection digiText     = Regex.Matches(Textfordigit, @"\d{1,9}");
                switch (digiText.Count)
                {
                case 1:
                    DigitCount++;
                    intone[i]   = int.Parse(digiText[0].Value);
                    havetwonums = false;
                    break;

                case 2:
                    // Two numbers: store their signed difference.
                    DigitCount++;
                    intone[i] = int.Parse(digiText[0].Value) - int.Parse(digiText[1].Value);
                    break;

                default:
                    havetwonums = false;
                    break;
                }

                if (textLenSoFar < 50)
                {
                    feature = CheckforChars(feature, Text, stencilfeature, false);
                }
                i += 1;
            }

            // Recognition of ID and CLASS NAME
            feature = CheckforIdorClassName(feature, Nodes, stencilfeature, false);
            if (stencilfeature.FigureFeatures["DigitCountRate"] == 1)
            {
                // Integer arithmetic: the quotient is truncated before it is stored.
                feature.FigureFeatures["DigitCountRate"] = 10 * DigitCount / ItemCount;
            }
            if (stencilfeature.FigureFeatures["AvgTextLen"] == 1)
            {
                // Average() throws on an empty sequence, so an empty node set reports 0.
                feature.FigureFeatures["AvgTextLen"] = nodeCount == 0 ? 0 : TextLen.Average();
            }
            if (stencilfeature.FigureFeatures["AllTextLen"] == 1)
            {
                feature.FigureFeatures["AllTextLen"] = TextLen.Sum();
            }
            if (stencilfeature.FigureFeatures["AvgDateDistance"] == 1)
            {
                feature.FigureFeatures["AvgDateDistance"] = nodeCount == 0 ? 0 : Diff.Average();
            }

            int[] positiveNumbers = intone.Where(inton => inton > 0).ToArray();
            if (stencilfeature.FigureFeatures["AvgNumber"] == 1)
            {
                feature.FigureFeatures["AvgNumber"] = positiveNumbers.Length == 0 ? 0 : Math.Log(positiveNumbers.Average(), 2);
            }
            if (stencilfeature.FigureFeatures["DateCountRate"] == 1 && feature.FigureFeatures["ItemCount"] != 0)
            {
                feature.FigureFeatures["DateCountRate"] = 10 * feature.FigureFeatures["DateParseCount"] / ItemCount;
            }
            if (stencilfeature.FigureFeatures["RateTitleDigits"] == 1)
            {
                int textTotal  = TextLen.Sum();
                int digitTotal = DigiLen.Sum();
                feature.FigureFeatures["RateTitleDigits"] = textTotal + digitTotal == 0 ? 0 : 10 * (double)digitTotal / (double)(textTotal + digitTotal);
            }
            if (stencilfeature.BoolFeatures["twonuminregularshape"] == 1)
            {
                // The sign test must look at the unfiltered differences. It used to run after the
                // array had been reduced to positive values, so the negative check could never fail.
                bool hasPositive = intone.Any(k => k > 0);
                bool hasNegative = intone.Any(k => k < 0);
                feature.BoolFeatures["twonuminregularshape"] = (havetwonums && (!hasPositive || !hasNegative)) ? 0 : 1;
            }

            // Once considered also collecting the variance of the numeric features, or the ratio of
            // standard deviation to mean. Would that be useful?
            return(feature);
        }

Exemple #13

0

Afficher le fichier

        /// <summary>
        /// Click handler. Downloads the page at DsgUrl, writes every match whose PubDate contains "前"
        /// into the DSG workbook (one inserted row per match, starting at row index 7), saves it, then
        /// runs CrawlDailyReport for each car category against the daily report workbook.
        /// </summary>
        private void CrawlBtn_Click(object sender, EventArgs e)
        {
            //ImportMedia();
            //return;

            // ---- DSG report ----
            var dsgPage = WebRequestProcessor.DownloadHTTPString(DsgUrl);
            var hits    = Regex.Matches(dsgPage, baiduRegex, RegexOptions.Multiline | RegexOptions.IgnoreCase);

            Workbook dsgBook = new Workbook();
            dsgBook.Open(@"D:\dailyreport\DSG.xlsx");
            var dsgSheet = dsgBook.Worksheets[0];
            int rowIndex = 7;

            // First pass: make room, one fresh row per qualifying match.
            foreach (Match hit in hits)
            {
                if (!hit.Groups["PubDate"].Value.Contains("前"))
                {
                    continue;
                }
                dsgSheet.Cells.InsertRow(rowIndex);
            }

            // Second pass: fill the inserted rows in match order.
            foreach (Match hit in hits)
            {
                if (hit.Groups["PubDate"].Value.Contains("前"))
                {
                    var link = hit.Groups["Url"].Value;
                    try
                    {
                        // Match the media name from the host of the result URL.
                        Uri parsedLink = new Uri(link);
                        dsgSheet.Cells[rowIndex, 1].PutValue(GetUrlDomain(parsedLink.Host));
                    }
                    catch (Exception)
                    {
                        // Best effort: the domain cell stays empty when the URL cannot be handled.
                    }

                    var headline = TextCleaner.FullClean(hit.Groups["Title"].Value);
                    var summary  = TextCleaner.FullClean(hit.Groups["Text"].Value);
                    var cellText = headline + Environment.NewLine + summary;

                    // Formulas address rows 1-based, the cell API is 0-based.
                    var formulaRow = rowIndex + 1;

                    dsgSheet.Cells[rowIndex, 0].PutValue(link);
                    dsgSheet.Cells[rowIndex, 5].Formula = "=VLOOKUP(B" + formulaRow + ",Sheet2!A:B,2,FALSE)";
                    dsgSheet.Cells[rowIndex, 6].PutValue(cellText);

                    dsgSheet.Hyperlinks.Add(rowIndex, 6, 1, 1, link);
                    dsgSheet.Cells[rowIndex, 7].PutValue(DateTime.Now.ToString("yyyy-MM-dd"));
                    dsgSheet.Cells[rowIndex, 8].PutValue("负面舆情");
                    rowIndex++;
                }
            }

            dsgBook.Save(@"D:\dailyreport\DSG.xlsx");

            // ---- Daily report: one CrawlDailyReport run per category, sharing the row cursor ----
            Workbook dailybook = new Workbook();
            dailybook.Open(@"D:\dailyreport\日报.xlsx");
            var dailyWorksheet = dailybook.Worksheets[0];
            int dailyStartRow  = 6;

            CrawlDailyReport(dailyWorksheet, dailybook, ref dailyStartRow, "大众-POLO", poloUrls);
            CrawlDailyReport(dailyWorksheet, dailybook, ref dailyStartRow, "大众-朗逸", langyiUrls);
            CrawlDailyReport(dailyWorksheet, dailybook, ref dailyStartRow, "大众-途安", turanUrls);
            CrawlDailyReport(dailyWorksheet, dailybook, ref dailyStartRow, "大众-帕萨特", pasateUrls);
            CrawlDailyReport(dailyWorksheet, dailybook, ref dailyStartRow, "大众-桑塔纳", santanaUrls);
            CrawlDailyReport(dailyWorksheet, dailybook, ref dailyStartRow, "大众-途观", tuguanUrls);

            MessageBox.Show("抓取完成");
        }

Exemple #14

0

Afficher le fichier

        /// <summary>
        /// Fills missing fields of <paramref name="BaseArticle"/> (content, title, publish date, media
        /// name, author) from an item page, using the XPath expressions carried by a JSON-serialized
        /// <see cref="XpathPattern"/>.
        /// </summary>
        /// <param name="Html">Raw HTML of the item page.</param>
        /// <param name="Pattern">JSON text that deserializes to an <see cref="XpathPattern"/>.</param>
        /// <param name="Url">Page URL; used when cleaning content and in log messages.</param>
        /// <param name="BaseArticle">Article to complete. Content, title, media name and author are
        /// only written when currently empty; the publish date is replaced under the rules below.</param>
        /// <returns>false when <paramref name="Html"/> or <paramref name="Pattern"/> is blank or the
        /// pattern cannot be deserialized; true otherwise, even if individual fields failed to parse
        /// (those failures are logged).</returns>
        public static bool ParseItem(string Html, string Pattern, string Url, ref Article BaseArticle)
        {
            // Input check
            if (string.IsNullOrWhiteSpace(Html) || string.IsNullOrWhiteSpace(Pattern))
            {
                return(false);
            }

            // Check the format of Pattern and make sure it is usable
            XpathPattern xpathPattern = null;

            try
            {
                xpathPattern = JsonConvert.DeserializeObject <XpathPattern>(Pattern);
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("Pattern 的格式不符合 Xpath Parser 的定义，请检查！Url:{0}, Pattern:{1}.", Url, Pattern), ex);
                // Without a pattern nothing below can run; previously execution continued and
                // dereferenced the null pattern.
                return(false);
            }

            // Deserialization can also yield null without throwing (e.g. the JSON literal null).
            if (xpathPattern == null)
            {
                return(false);
            }

            HtmlNode itempagenode = HtmlUtility.getSafeHtmlRootNode(Html, true, true);

            // Extract the article body
            if (string.IsNullOrEmpty(BaseArticle.HtmlContent) && !string.IsNullOrWhiteSpace(xpathPattern.ItemContentXPath))
            {
                try
                {
                    BaseArticle.HtmlContent = HTMLCleaner.CleanContent(itempagenode.SelectNodes(xpathPattern.ItemContentXPath), Url, true);
                    BaseArticle.Content     = HTMLCleaner.CleanHTML(BaseArticle.HtmlContent, false);
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析正文出错，Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemContentXPath), ex);
                }
            }

            // Determine the title
            if (string.IsNullOrEmpty(BaseArticle.Title) && !string.IsNullOrWhiteSpace(xpathPattern.ItemTitleXPath))
            {
                try
                {
                    BaseArticle.Title = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemTitleXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析标题出错，Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemTitleXPath), ex);
                }
            }

            // Determine the publish time
            if (!string.IsNullOrWhiteSpace(xpathPattern.ItemDateXPath))
            {
                try
                {
                    DateTime Pubdate = DateTimeParser.Parser(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemDateXPath)));

                    if (BaseArticle.PubDate <= DateTime.MinValue.AddYears(1) && Pubdate.Year > 2000) // existing publish time is too old (effectively unset)
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                    else if (BaseArticle.PubDate.Hour == 0 && BaseArticle.PubDate.Minute == 0 && (Pubdate.Hour != 0 || Pubdate.Minute != 0) && Pubdate.Year > 2000) // existing publish time has no hour and minute
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                    else if (Pubdate.Year > 2000 && (Pubdate.Hour != 0 || Pubdate.Minute != 0) && (BaseArticle.PubDate - Pubdate) > new TimeSpan(0, 1, 59) && BaseArticle.PubDate >= DateTime.Now.AddMinutes(-10)) // existing publish time is within 10 minutes of now yet differs considerably from the page
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                }
                catch (Exception ex)
                {
                    // The message used to say "title" and report the content XPath (copy-paste slip).
                    Logger.Error(string.Format("从详情页解析时间出错，Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemDateXPath), ex);
                }
            }

            // Determine the media name
            if (string.IsNullOrEmpty(BaseArticle.MediaName) && !string.IsNullOrWhiteSpace(xpathPattern.ItemMediaNameXPath))
            {
                try
                {
                    BaseArticle.MediaName = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemMediaNameXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析媒体出错，Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemMediaNameXPath), ex);
                }
            }

            // Determine the author
            if (string.IsNullOrEmpty(BaseArticle.Author) && !string.IsNullOrWhiteSpace(xpathPattern.ItemAuthorXPath))
            {
                try
                {
                    BaseArticle.Author = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemAuthorXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析作者出错，Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemAuthorXPath), ex);
                }
            }

            return(true);
        }

C# (CSharp) TextCleaner.FullClean Exemples