/// <summary>
/// Extracts every tweet matched by <c>RegexContent</c> in <paramref name="content"/>, appends each
/// one to <c>result.Tweets</c>, and returns the tweets parsed by this call.
/// </summary>
/// <remarks>
/// Parsing is best-effort: the first exception stops the scan and the tweets collected so far are
/// still returned. The failure is reported through <c>System.Diagnostics.Trace</c> instead of
/// being dropped silently.
/// </remarks>
/// <param name="result">User record that receives the tweets; its <c>Url</c> is the base used to resolve tweet links.</param>
/// <param name="content">Raw page text to scan.</param>
/// <returns>The tweets parsed by this call, in match order (possibly empty).</returns>
private Tweet[] FillUserTweet(UserTweet result, string content)
{
    var matches = Regex.Matches(content, RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase);
    List<Tweet> tweetList = new List<Tweet>();

    try
    {
        foreach (Match match in matches)
        {
            // Missing or non-numeric counters leave the value at 0 (TryParse result is ignored on purpose).
            int comment;
            int.TryParse(match.Groups["Reply"].Value, out comment);
            int forward;
            int.TryParse(match.Groups["Forward"].Value, out forward);

            Tweet tweet = new Tweet();
            tweet.Comment = comment;
            tweet.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
            tweet.Mid = match.Groups["Mid"].Value;
            tweet.Forward = forward;
            tweet.Source = match.Groups["Source"].Value;
            tweet.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
            tweet.Url = RegexParser.AbsoluteUrl(match.Groups["Url"].Value, result.Url, true);

            // A tweet is only published once every field has been filled in.
            result.Tweets.Add(tweet);
            tweetList.Add(tweet);
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract (return what was parsed), but leave a trace of the failure.
        System.Diagnostics.Trace.TraceWarning("FillUserTweet stopped after {0} tweet(s): {1}", tweetList.Count, ex);
    }

    return tweetList.ToArray();
}
/// <summary>
/// Downloads each search-result page of one category, writes every result whose publish-date text
/// contains "前" as a new row of the daily worksheet, then saves the workbook.
/// </summary>
/// <param name="dailyWorksheet">Worksheet that receives the rows.</param>
/// <param name="dailybook">Workbook saved after all URLs have been processed.</param>
/// <param name="dailyStartRow">Zero-based row to write next; advanced by one per written result.</param>
/// <param name="categoryName">Label written to column 2 of the first row produced for this category.</param>
/// <param name="categoryUrls">Result-page URLs to download, in order.</param>
private void CrawlDailyReport(Worksheet dailyWorksheet, Workbook dailybook, ref int dailyStartRow, string categoryName, string[] categoryUrls)
{
    const RegexOptions searchOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;
    bool categoryLabelPending = true;

    for (int urlIndex = 0; urlIndex < categoryUrls.Length; urlIndex++)
    {
        var pageHtml = WebRequestProcessor.DownloadHTTPString(categoryUrls[urlIndex]);
        // Fixed two-second pause after every download.
        Thread.Sleep(2000);

        var hits = Regex.Matches(pageHtml, baiduRegex, searchOptions);
        foreach (Match hit in hits)
        {
            bool isRecent = hit.Groups["PubDate"].Value.Contains("前");
            if (!isRecent)
            {
                continue;
            }

            int row = dailyStartRow;
            var cells = dailyWorksheet.Cells;

            // The category label goes only on the first row written for this category.
            if (categoryLabelPending)
            {
                cells[row, 2].PutValue(categoryName);
                categoryLabelPending = false;
            }

            string link = hit.Groups["Url"].Value;
            try
            {
                // Column 1: media domain derived from the result host.
                string host = new Uri(link).Host;
                cells[row, 1].PutValue(GetUrlDomain(host));
            }
            catch (Exception)
            {
                // Best effort: a link that cannot be resolved to a domain leaves column 1 empty.
            }

            string headline = TextCleaner.FullClean(hit.Groups["Title"].Value);
            string summary = TextCleaner.FullClean(hit.Groups["Text"].Value);
            string cellText = headline + Environment.NewLine + summary;

            var linkStyle = cells[row, 6].GetDisplayStyle();
            linkStyle.Font.Color = Color.Blue;

            int excelRowNumber = row + 1;
            cells[row, 0].PutValue(link);
            cells[row, 5].Formula = "=VLOOKUP(B" + excelRowNumber + ",Sheet2!A:B,2,FALSE)";
            cells[row, 6].SetStyle(linkStyle);
            cells[row, 6].PutValue(cellText);
            dailyWorksheet.Hyperlinks.Add(row, 6, 1, 1, link);
            cells[row, 7].PutValue(DateTime.Now.ToString("yyyy-MM-dd"));
            cells[row, 8].PutValue("负面舆情");

            dailyStartRow++;
        }
    }

    dailybook.Save(@"D:\dailyreport\日报.xlsx");
}
/// <summary>
/// Extracts the cleaned inner text of one field of an item, addressed by an XPath relative to the
/// item's base node.
/// </summary>
/// <remarks>
/// With a blank <paramref name="RelXPath"/> the base node's own text is used. Otherwise the value of
/// the first node selected through an XPath navigator is tried first; when that yields nothing, the
/// first node returned by <c>SelectNodes</c> whose own (non-descendant) text is non-empty is used.
/// </remarks>
/// <param name="BaseNode">Root node of a single item.</param>
/// <param name="RelXPath">XPath relative to <paramref name="BaseNode"/>; blank selects the base node itself.</param>
/// <param name="postion">Not used by this variant; kept so the signature stays unchanged.</param>
/// <param name="CleanConnectionMark">
/// True to call <c>TextCleaner.FullClean</c> with its default options; false to call it with the
/// explicit flag set <c>(true, true, true, false, true, false)</c>.
/// </param>
/// <returns>The cleaned text, or null when <paramref name="BaseNode"/> is null or nothing matches.</returns>
internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
{
    if (BaseNode == null)
    {
        return null;
    }

    string rawText;
    if (string.IsNullOrWhiteSpace(RelXPath))
    {
        rawText = XPathUtility.InnerTextNonDescendants(BaseNode);
    }
    else
    {
        rawText = "";
        try
        {
            HtmlNodeNavigator navigator = (HtmlNodeNavigator)BaseNode.CreateNavigator();
            var node = navigator.SelectSingleNode(RelXPath);
            // "No match" is an expected outcome, so test for it instead of relying on a NullReferenceException.
            if (node != null)
            {
                rawText = node.Value;
            }
        }
        catch (Exception)
        {
            // Deliberate: any navigator failure falls through to the SelectNodes lookup below.
        }

        if (string.IsNullOrWhiteSpace(rawText))
        {
            IEnumerable<HtmlNode> matchNodes = BaseNode.SelectNodes(RelXPath);
            if (matchNodes == null)
            {
                return null;
            }

            // Single pass: first candidate that carries text of its own.
            HtmlNode firstWithText = matchNodes.FirstOrDefault(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)));
            if (firstWithText == null)
            {
                return null;
            }

            rawText = XPathUtility.InnerTextNonDescendants(firstWithText);
        }
    }

    if (CleanConnectionMark)
    {
        return TextCleaner.FullClean(rawText);
    }

    return TextCleaner.FullClean(rawText, true, true, true, false, true, false);
}
/// <summary>
/// Copies the named groups of a regex match into an article. Groups that are absent keep the
/// article's current value (that value is passed to <c>RegexUtility.TryGetString</c> as the default).
/// </summary>
/// <param name="m">Match that may carry the groups Url, Title, Text, AuthorName, Source, MediaName and PubDate.</param>
/// <param name="Item">Article to fill in.</param>
/// <param name="BaseUrl">Absolute URL against which the matched item URL is resolved.</param>
/// <param name="ItemUrlCaseSensitive">Not used by this method; kept so the signature stays unchanged.</param>
public static void Match2Item(Match m, ref Article Item, string BaseUrl, bool ItemUrlCaseSensitive = false)
{
    // Url: resolve the (possibly relative) matched URL against the page URL.
    Item.Url = new Uri(new Uri(BaseUrl), RegexUtility.TryGetString(m, "Url", Item.Url, false)).AbsoluteUri;

    // Title
    Item.Title = RegexUtility.TryGetString(m, "Title", Item.Title);
    if (string.IsNullOrEmpty(Item.Title) && m.Groups["Title"].Success)
    {
        // Lower-level clean: the extraction above produced nothing, so fall back to the raw
        // matched text with plain HTML cleaning. (The original re-cleaned the already-empty
        // Item.Title, which could never recover a title.)
        Item.Title = HTMLCleaner.CleanHTML(m.Groups["Title"].Value, true);
    }

    // Body HTML
    Item.HtmlContent = RegexUtility.TryGetString(m, "Text", Item.HtmlContent, false);

    // Author info
    Item.Author = RegexUtility.TryGetString(m, "AuthorName", Item.Author);
    Item.Source = RegexUtility.TryGetString(m, "Source", Item.Source);
    if (!String.IsNullOrWhiteSpace(Item.Source))
    {
        Item.Source = TextCleaner.FullClean(Item.Source);
    }

    // Media info
    Item.MediaName = RegexUtility.TryGetString(m, "MediaName", Item.MediaName);

    // Publish time: parsed from the matched text; when no usable date results, use the current time.
    if (m.Groups["PubDate"].Success)
    {
        Item.PubDate = DateTimeParser.Parser(HTMLCleaner.CleanHTML(m.Groups["PubDate"].Value, true));
    }
    if (Item.PubDate <= DateTime.MinValue)
    {
        Item.PubDate = DateTime.Now;
    }

    Match2ItemCount(m, Item.ViewDataList);
}
/// <summary>
/// Checks whether a title is acceptable: long enough for the configured language and not
/// dominated by digit characters.
/// </summary>
/// <remarks>
/// The length rule depends on <c>Language</c>: Chinese (and any other language) compares the
/// cleaned character count with <c>MinLenTitle</c>; English compares the space-separated word
/// count with <c>MinWordCountTitle</c>. The digit-ratio rule applies to every language. The
/// original English branch lacked parentheses, so <c>MinWordCountTitle &lt;= 0</c> skipped the
/// digit-ratio rule altogether.
/// </remarks>
/// <param name="Title">Raw title text.</param>
/// <returns>True when the title passes both rules; false for null/blank titles or when a rule fails.</returns>
public bool ValidateTitle(string Title)
{
    if (string.IsNullOrWhiteSpace(Title))
    {
        return false;
    }

    string CleanTitle = TextCleaner.FullClean(Title);

    bool longEnough;
    switch (Language)
    {
        default:
        case Enums.Language.CHINESE:
            // Chinese: enough characters (rule disabled when MinLenTitle <= 0).
            longEnough = MinLenTitle <= 0 || CleanTitle.Length >= MinLenTitle;
            break;

        case Enums.Language.ENGLISH:
            // English: enough words (rule disabled when MinWordCountTitle <= 0).
            // NOTE(review): this is a strict '>' while the Chinese rule uses '>=' - confirm which is intended.
            int wordCount = CleanTitle.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Length;
            longEnough = MinWordCountTitle <= 0 || wordCount > MinWordCountTitle;
            break;
    }

    if (!longEnough)
    {
        return false;
    }

    // Digit ratio: a limit of 1 or more disables the rule; otherwise the digit count must stay
    // strictly below Length * MaxRateTitleDigits.
    return MaxRateTitleDigits >= 1 || CleanTitle.Length * MaxRateTitleDigits > TextCleaner.CountDigitChars(CleanTitle);
}
/// <summary>
/// Extracts the cleaned inner text of the <paramref name="postion"/>-th node (zero-based) selected
/// by an XPath relative to an item's base node. Only nodes whose own (non-descendant) text is
/// non-empty are counted.
/// </summary>
/// <param name="BaseNode">Root node of a single item.</param>
/// <param name="RelXPath">XPath relative to <paramref name="BaseNode"/>; blank addresses the base node itself (position 0 only).</param>
/// <param name="postion">Zero-based index into the matching nodes that carry text.</param>
/// <param name="CleanConnectionMark">
/// True to call <c>TextCleaner.FullClean</c> with its default options; false to call it with the
/// explicit flag set <c>(true, true, true, false, true, false)</c>.
/// </param>
/// <returns>
/// The cleaned text, or null when <paramref name="BaseNode"/> is null, nothing matches, or
/// <paramref name="postion"/> is outside the matched range (including negative values).
/// </returns>
internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
{
    if (BaseNode == null)
    {
        return null;
    }

    HtmlNode target;
    if (string.IsNullOrWhiteSpace(RelXPath))
    {
        // A blank path addresses exactly one node, the base node, so only index 0 exists.
        // (The original passed the blank path on to SelectNodes for any other index.)
        if (postion != 0)
        {
            return null;
        }
        target = BaseNode;
    }
    else
    {
        IEnumerable<HtmlNode> matchNodes = BaseNode.SelectNodes(RelXPath);
        if (matchNodes == null)
        {
            return null;
        }

        // Single pass; ElementAtOrDefault yields null for negative or too-large indexes.
        target = matchNodes
            .Where(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)))
            .ElementAtOrDefault(postion);
        if (target == null)
        {
            return null;
        }
    }

    string rawText = XPathUtility.InnerTextNonDescendants(target);
    if (CleanConnectionMark)
    {
        return TextCleaner.FullClean(rawText);
    }

    return TextCleaner.FullClean(rawText, true, true, true, false, true, false);
}
/// <summary>
/// Scores how likely the nodes selected by one relative XPath pattern are the author field of
/// list items (higher is better).
/// </summary>
/// <param name="Nodes">Candidate nodes selected by the pattern.</param>
/// <param name="Strategy">List strategy supplying the media type, language and author-length settings.</param>
/// <param name="BaseItemCount">Not used by this scorer.</param>
/// <param name="PathLevel">Depth of the relative path; every level beyond the first costs 10%.</param>
/// <param name="PathUsingName">True when the path addresses nodes by id/class rather than by index; adds 20%.</param>
/// <returns>
/// The summed per-node possibility (after the path adjustments) divided by the node count, or 0
/// when <paramref name="Nodes"/> is null or empty (the original produced NaN for an empty sequence).
/// </returns>
internal static double AuthorNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    if (Nodes == null)
    {
        return 0;
    }

    // Materialize once: the sequence is walked and then counted.
    IList<HtmlNode> nodeList = (Nodes as IList<HtmlNode>) ?? Nodes.ToList();
    if (nodeList.Count == 0)
    {
        return 0;
    }

    double AvgPossibility = 0;
    foreach (HtmlNode Node in nodeList)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
        if (Text.Length > Strategy.MaxLenAuthor)
        {
            continue;
        }

        // Forum pages start with a small prior; every other media type starts at 0.
        double Posibility = Strategy.MediaType == Enums.MediaType.Forum ? 0.2 : 0;

        // Characters typical of dates / click counters: not conclusive (unusual names exist),
        // so the node is pinned to 0.2 rather than discarded.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 0)
        {
            Posibility = 0.2;
        }

        // Mostly date characters: very likely a date/time.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
        {
            Posibility *= 0.3;
        }

        // Explicit author/editor/reporter label: keep only what follows it, up to the first space.
        Match PrefixMatch = Regex.Match(Text, @"作\s*者|选\s*稿|编\s*辑|记\s*者");
        if (PrefixMatch.Success)
        {
            Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');
            int SpaceIndex = Text.IndexOf(' ');
            if (SpaceIndex > 0)
            {
                Text = Text.Substring(0, SpaceIndex);
            }
            Posibility = 1;
        }

        // Strip stop words before the remaining checks.
        Text = TextCleaner.RemoveStopWords(Text, StopWords);

        // One of the ten listed surnames appears in the text.
        if (Regex.IsMatch(Text, @"[李王张刘陈杨赵黄周吴]"))
        {
            if (Posibility > 0)
            {
                Posibility *= 1.1;
            }
            else
            {
                Posibility = 0.6;
            }
        }

        // Weight by author-related id/class names.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, AuthorClassNames_Must, AuthorClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
        }

        // Reply-related id/class names disqualify the node (only the main post's author is wanted).
        IDClassNameScore(Node, ReplyClassNames_Must, null, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Posibility = 0;
        }

        // Length check against the strategy's best average author length.
        if (Text.Length * 3 < Strategy.List_BestAvgAuthorLen || Text.Length > Strategy.List_BestAvgAuthorLen * 3)
        {
            // More than a factor of 3 away: cut to one third.
            Posibility /= 3;
        }
        else
        {
            // Rate is the length ratio, between 1 and 3; the weight shrinks as it grows.
            double Rate;
            if (Text.Length >= Strategy.List_BestAvgAuthorLen)
            {
                // Guard 0/0 (empty text and a best length of 0), which would poison the score with NaN.
                Rate = Strategy.List_BestAvgAuthorLen == 0 ? 1 : Text.Length / (double)Strategy.List_BestAvgAuthorLen;
            }
            else
            {
                Rate = Strategy.List_BestAvgAuthorLen / (double)Text.Length;
            }
            Posibility *= (0.4 + 0.35 * (3 - Rate));
        }

        // Forum user names often contain ASCII letters, digits or underscores: +50%.
        // (The original class "[a-z,0-9,_]" also matched commas.)
        if (Strategy.MediaType == Enums.MediaType.Forum && Regex.IsMatch(Text, @"[a-z0-9_]+", RegexOptions.IgnoreCase))
        {
            Posibility *= 1.5;
        }

        // Separators that do not belong in a name (including spaces): 40% off.
        if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, Strategy.Language == Language.CHINESE) > 0)
        {
            Posibility *= 0.6;
        }

        AvgPossibility += Posibility;
    }

    // Adjustment 1: shallower paths are better; each level beyond the first costs 10%.
    if (PathLevel > 1)
    {
        AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
    }

    // Adjustment 2: paths using id/class beat positional paths; +20%.
    if (PathUsingName)
    {
        AvgPossibility *= 1.2;
    }

    return AvgPossibility / nodeList.Count;
}
/// <summary>
/// Scores how likely the nodes selected by one relative XPath pattern are the media-name field of
/// list items (higher is better).
/// </summary>
/// <param name="Nodes">Candidate nodes selected by the pattern.</param>
/// <param name="Strategy">List strategy supplying the media type and media-name length settings.</param>
/// <param name="BaseItemCount">Not used by this scorer.</param>
/// <param name="PathLevel">Depth of the relative path; every level beyond the first costs 10%.</param>
/// <param name="PathUsingName">True when the path addresses nodes by id/class rather than by index; adds 20%.</param>
/// <returns>
/// The summed per-node possibility (after the path and forum adjustments) divided by the node
/// count, or 0 when <paramref name="Nodes"/> is null or empty (the original produced NaN for an
/// empty sequence).
/// </returns>
internal static double MediaNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    if (Nodes == null)
    {
        return 0;
    }

    // Materialize once: the sequence is walked and then counted.
    IList<HtmlNode> nodeList = (Nodes as IList<HtmlNode>) ?? Nodes.ToList();
    if (nodeList.Count == 0)
    {
        return 0;
    }

    double AvgPossibility = 0;
    foreach (HtmlNode Node in nodeList)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
        if (Text.Length > Strategy.MaxLenMedia)
        {
            continue;
        }

        // Web-news pages start with a small prior; every other media type starts at 0.
        double Posibility = Strategy.MediaType == MediaType.WebNews ? 0.1 : 0;

        // More than one character typical of dates / click counters: not conclusive (unusual
        // newspaper names exist), so the node is pinned to 0.2 rather than discarded.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 1)
        {
            Posibility = 0.2;
        }

        // Mostly date characters: very likely a date/time.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
        {
            Posibility *= 0.3;
        }

        // Explicit media label: keep only what follows it.
        Match PrefixMatch = Regex.Match(Text, MediaPrefixRegex);
        if (PrefixMatch.Success)
        {
            Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');
            Posibility = 1;
        }

        // Strip stop words before the remaining checks.
        Text = TextCleaner.RemoveStopWords(Text, StopWords);

        // One of the characters 报 / 台 / 网 / 刊 appears in the text.
        if (Regex.IsMatch(Text, @"[报台网刊]"))
        {
            if (Posibility > 0)
            {
                Posibility = Math.Max(1, Posibility * 1.5);
            }
            else
            {
                Posibility = 0.8;
            }
        }

        // Weight by media-related id/class names.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, MediaClassNames_Must, MediaClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
        }

        // Length check against the strategy's best average media-name length.
        if (Text.Length * 3 < Strategy.List_BestAvgMediaLen || Text.Length > Strategy.List_BestAvgMediaLen * 3)
        {
            // More than a factor of 3 away: cut to one third.
            Posibility /= 3;
        }
        else
        {
            // Rate is the length ratio, between 1 and 3; the weight shrinks as it grows.
            double Rate;
            if (Text.Length >= Strategy.List_BestAvgMediaLen)
            {
                // Guard 0/0 (empty text and a best length of 0), which would poison the score with NaN.
                Rate = Strategy.List_BestAvgMediaLen == 0 ? 1 : Text.Length / (double)Strategy.List_BestAvgMediaLen;
            }
            else
            {
                Rate = Strategy.List_BestAvgMediaLen / (double)Text.Length;
            }
            Posibility *= (0.4 + 0.35 * (3 - Rate));
        }

        // Separators that do not belong in a name (including spaces): 40% off.
        if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, true) > 0)
        {
            Posibility *= 0.6;
        }

        // English text: another 40% off.
        if (LanguageUtility.DetectedLanguage(Text) == Enums.Language.ENGLISH)
        {
            Posibility *= 0.6;
        }

        AvgPossibility += Posibility;
    }

    // Adjustment 1: shallower paths are better; each level beyond the first costs 10%.
    if (PathLevel > 1)
    {
        AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
    }

    // Adjustment 2: paths using id/class beat positional paths; +20%.
    if (PathUsingName)
    {
        AvgPossibility *= 1.2;
    }

    // Forum pages: halve the score.
    if (Strategy.MediaType == Enums.MediaType.Forum)
    {
        AvgPossibility /= 2;
    }

    return AvgPossibility / nodeList.Count;
}
/// <summary>
/// Scores how likely the nodes selected by one relative XPath pattern are the view/reply counter
/// of list items (higher is better).
/// </summary>
/// <remarks>
/// Changes against the original implementation:
/// an empty node text no longer injects NaN into the character-frequency average (NaN slipped
/// past the min/max check); a null or empty <paramref name="Nodes"/> returns 0; the extreme-value
/// checks no longer call <c>Max()</c> on an empty list; the single-number average is no longer
/// truncated by integer division; sums are accumulated as doubles so large counters cannot
/// overflow; and <paramref name="MustBeReply"/> is only set by an actual reply-name hit.
/// </remarks>
/// <param name="Nodes">Candidate nodes selected by the pattern.</param>
/// <param name="Strategy">List strategy supplying <c>MaxLenView</c>.</param>
/// <param name="BaseItemCount">Expected item count; only used as the initial capacity of the number lists.</param>
/// <param name="PathLevel">Depth of the relative path; every level beyond the first costs 10%.</param>
/// <param name="PathUsingName">True when the path addresses nodes by id/class rather than by index; adds 20%.</param>
/// <param name="Contain2Numbers">Set to true when every parsed node contained two numbers.</param>
/// <param name="MustBeReply">Set to true when a reply id/class name or a reply keyword in the text marks the field as a reply counter.</param>
/// <param name="AvgNumber">
/// Average parsed number(s), used by the caller to tell view from reply: two entries (larger
/// average first) when every parsed node held two numbers, one entry when only first numbers are
/// available, null when nothing was parsed.
/// </param>
/// <returns>The score; 0 when the pattern is ruled out.</returns>
internal static double ViewNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName, out bool Contain2Numbers, out bool MustBeReply, out double[] AvgNumber)
{
    Contain2Numbers = false;
    MustBeReply = false;
    AvgNumber = null;

    if (Nodes == null)
    {
        return 0;
    }

    // Materialize once: the sequence is walked twice and counted.
    IList<HtmlNode> nodeList = (Nodes as IList<HtmlNode>) ?? Nodes.ToList();
    if (nodeList.Count == 0)
    {
        return 0;
    }

    // Pass 1: average density of view/reply characters.
    double AvgCharFreq = 0;
    foreach (HtmlNode Node in nodeList)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true, false, true, false);

        // Empty text contributes a density of 0 (dividing by its length would yield NaN).
        if (Text.Length == 0 || Text.Length > Strategy.MaxLenView)
        {
            continue;
        }

        // Any character that must belong to a date rules the pattern out.
        if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustDate, false) > 0)
        {
            return 0;
        }

        int CharCount = TextCleaner.CountCharInSet(Text, FrequentCharSet_ViewReply);
        AvgCharFreq += CharCount / (double)Text.Length;
    }

    AvgCharFreq = AvgCharFreq / nodeList.Count;
    if (AvgCharFreq < FrequentChars_Min || AvgCharFreq > FrequentChars_Max)
    {
        return 0;
    }

    // Pass 2: pull the digit runs out of each text and record their distribution.
    int TotalCount = 0;
    int ParsedNode = 0;
    List<int> Ints0 = new List<int>(Math.Max(0, BaseItemCount));
    List<int> Ints1 = new List<int>(Math.Max(0, BaseItemCount));
    int ViewNameMatched = 0;
    double IDClassScore = 0;

    foreach (HtmlNode Node in nodeList)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true, false, true, false);
        if (Text.Length > Strategy.MaxLenView)
        {
            continue;
        }

        MatchCollection digiText = Regex.Matches(Text, @"\d{1,9}");
        if (digiText.Count == 0)
        {
            continue;
        }

        // More than two numbers in one element cannot be a view/reply counter.
        if (digiText.Count > 2)
        {
            return 0;
        }

        TotalCount += digiText.Count;

        int Val0 = int.Parse(digiText[0].Captures[0].Value);
        Ints0.Add(Val0);
        if (digiText.Count > 1)
        {
            Ints1.Add(int.Parse(digiText[1].Captures[0].Value));
        }

        // Weight by view-related id/class names.
        bool IDClassNameMatched;
        double s = IDClassNameScore(Node, ViewClassNames_Must, ViewClassNames_MustNot, out IDClassNameMatched);
        if (IDClassNameMatched)
        {
            IDClassScore += s;
            ViewNameMatched++;

            // On a positive hit, check whether the name marks a reply counter. The flag is only
            // raised by this reply check; the original also raised it from the stale view-match
            // flag whenever the check was skipped (s <= 0).
            if (s > 0)
            {
                bool replyNameMatched;
                IDClassNameScore(Node, ReplyClassNames_Must, null, out replyNameMatched);
                if (replyNameMatched)
                {
                    MustBeReply = true;
                }
            }
        }

        // A reply keyword in the text also marks the field as a reply counter.
        foreach (string ReplyKeyword in ReplyWords)
        {
            if (Text.Contains(ReplyKeyword))
            {
                MustBeReply = true;
            }
        }

        ParsedNode++;
    }

    if (Ints1.Count > 0 && Ints1.Count == Ints0.Count)
    {
        // Every parsed element holds two numbers; report the larger average first.
        double Sum0 = Ints0.Sum(v => (double)v);
        double Sum1 = Ints1.Sum(v => (double)v);
        AvgNumber = new double[2];
        AvgNumber[0] = (Sum0 > Sum1 ? Sum0 : Sum1) / ParsedNode;
        AvgNumber[1] = (Sum0 > Sum1 ? Sum1 : Sum0) / ParsedNode;
        Contain2Numbers = true;
    }
    else if (Ints0.Count > 0 && ParsedNode > 0)
    {
        AvgNumber = new double[1];
        AvgNumber[0] = Ints0.Sum(v => (double)v) / ParsedNode;
    }
    else
    {
        AvgCharFreq = 0;
    }

    if (Ints0.Count > 0)
    {
        if (TotalCount == nodeList.Count)
        {
            // As many numbers as nodes: +20% when all first numbers are <= 31, otherwise +50%.
            AvgCharFreq *= (Ints0.Max() <= 31) ? 1.2 : 1.5;
        }
        else
        {
            // Extreme-value check for the two-number case: maxima within 12/31 look like month/day.
            // Without any second number the check cannot apply and the pattern is treated as
            // "not a date", which is what the original did whenever it did not throw.
            bool looksLikeMonthDay = Ints1.Count > 0
                && (Ints0.Max() <= 12 && Ints1.Max() <= 31 || Ints0.Max() <= 31 && Ints1.Max() <= 12);
            if (looksLikeMonthDay)
            {
                AvgCharFreq *= 0.8;
            }
            else
            {
                // Date ruled out: double the score.
                AvgCharFreq *= 2;
            }
        }
    }

    // Weight by the average id/class score of the nodes whose names matched.
    if (ViewNameMatched > 0)
    {
        IDClassScore /= ViewNameMatched;
        AvgCharFreq = (AvgCharFreq == 0 ? 1 : AvgCharFreq) * IDClassScore;
    }

    // Adjustment 1: shallower paths are better; each level beyond the first costs 10%.
    if (PathLevel > 1)
    {
        AvgCharFreq *= (1 - 0.1 * (PathLevel - 1));
    }

    // Adjustment 2: paths using id/class beat positional paths; +20%.
    if (PathUsingName)
    {
        AvgCharFreq *= 1.2;
    }

    return AvgCharFreq;
}
/// <summary>
/// Scores how likely the nodes selected by one relative XPath pattern are the publish-date field
/// of list items (higher is better). The score combines how well the number of parsable dates
/// matches the expected item count with how close the dates are to the current time.
/// </summary>
/// <remarks>
/// The count factor is now computed in floating point. The original expression
/// <c>1 / (ParseCount / BaseItemCount)</c> used integer division throughout, so it could only be
/// 0 or 1, and it threw <c>DivideByZeroException</c> when both counts were 0.
/// </remarks>
/// <param name="Nodes">Candidate nodes selected by the pattern.</param>
/// <param name="Strategy">List strategy supplying <c>MaxLenDate</c>.</param>
/// <param name="BaseItemCount">Expected number of list items.</param>
/// <param name="PathLevel">Depth of the relative path; every level beyond the first costs 20%.</param>
/// <param name="PathUsingName">True when the path addresses nodes by id/class rather than by index; adds 20%.</param>
/// <returns>
/// The score; 0 when <paramref name="Nodes"/> is null, no date could be parsed, or the number of
/// parsed dates is off from <paramref name="BaseItemCount"/> by more than a factor of 3.
/// </returns>
internal static double DateNodeRelPatternScore(IEnumerable<HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
{
    if (Nodes == null)
    {
        return 0;
    }

    // DateTimeParser.Parser does the recognition, so no character-density check is needed here.
    double SumDiff = 0;   // accumulated (weighted) distance in days between each date and now
    int ParseCount = 0;
    DateTime now = DateTime.Now;

    foreach (HtmlNode Node in Nodes)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true);
        if (Text.Length > Strategy.MaxLenDate)
        {
            continue;
        }

        DateTime? Val = DateTimeParser.Parser(Text);
        if (Val == null)
        {
            continue;
        }

        ParseCount++;
        DateTime d = (DateTime)Val;

        // A matching date-related id/class name divides the distance by its score.
        bool IDClassNameMatched;
        double IDClassScore = IDClassNameScore(Node, DateClassNames_Must, DateClassNames_MustNot, out IDClassNameMatched);
        double divisor = IDClassNameMatched ? IDClassScore : 1;

        double days = (now - d).TotalDays;
        if (d.Hour == 0 && d.Minute == 0 && d.Second == 0)
        {
            // Date only: the full distance counts.
            SumDiff += Math.Abs(days / divisor);
        }
        else
        {
            // Exact time present: only half of the distance counts.
            SumDiff += Math.Abs(days / 2 / divisor);
        }
    }

    // Nothing parsed: no evidence at all (also avoids 0/0 below).
    if (ParseCount == 0)
    {
        return 0;
    }

    // Count factor: acceptable between 1/3x and 3x of the expected item count.
    if (ParseCount * 3 < BaseItemCount || BaseItemCount * 3 < ParseCount)
    {
        return 0;
    }

    // Reciprocal of the count ratio, in (0, 1]. BaseItemCount is positive here because
    // ParseCount > 0 passed the range check above.
    double Possibility = ParseCount >= BaseItemCount
        ? (double)BaseItemCount / ParseCount
        : (double)ParseCount / BaseItemCount;

    // Recency factor from the average distance.
    double AvgDiff = SumDiff / ParseCount;
    if (AvgDiff > 180)
    {
        // Older than 180 days: flat 30%.
        Possibility *= 0.3;
    }
    else if (AvgDiff > 7)
    {
        // One week to 180 days: 100% down to 30%.
        Possibility *= (0.3 + 0.004046 * (180 - AvgDiff));
    }
    else if (AvgDiff > 1)
    {
        // Within a week: 100% to 150%.
        Possibility *= (1 + 0.08333 * (7 - AvgDiff));
    }
    else
    {
        // Within a day: 150% to 200%, which helps separate post time from comment time.
        Possibility *= (1.5 + 0.5 * (1 - AvgDiff));
    }

    // Adjustment 1: shallower paths are better; each level beyond the first costs 20%.
    if (PathLevel > 1)
    {
        Possibility *= (1 - 0.2 * (PathLevel - 1));
    }

    // Adjustment 2: paths using id/class beat positional paths; +20%.
    if (PathUsingName)
    {
        Possibility *= 1.2;
    }

    return Possibility;
}
/// <summary>
/// Downloads the comment pages of a tweet one after another and appends every parsed comment to
/// <c>tweet.Comments</c>. Does nothing when the tweet reports no comments.
/// </summary>
/// <remarks>
/// Each page is requested up to five times. Paging stops when a page cannot be fetched or when,
/// from the second page on, the page marker is missing or does not report the requested page
/// number. Any other failure ends the crawl and is logged; comments collected so far are kept.
/// </remarks>
/// <param name="tweet">Tweet whose comments are collected; its <c>Mid</c> and <c>Url</c> are used to build links.</param>
/// <param name="site">Site entity handed to the request processor.</param>
private void FillTweetComment(Tweet tweet, SiteEntity site)
{
    if (tweet.Comment == 0)
    {
        return;
    }

    int currentPage = 1;
    string mid = tweet.Mid;

    try
    {
        while (true)
        {
            string url = string.Format(CommentUrlFormat, mid, currentPage);
            var request = BuildRequest(url);

            CrawlResponse response = null;
            for (int i = 0; i < 5; i++)
            {
                try
                {
                    response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                    AggrSum();
                }
                catch (Exception ex)
                {
                    // A throwing attempt counts as a failed attempt, not as the end of the retry loop.
                    Logger.Info("Comment request attempt failed: " + ex.Message);
                }

                if (response != null && response.Status == Enums.CrawlResult.Succ)
                {
                    break;
                }

                // The original dereferenced a null response here, which aborted every remaining retry.
                if (response != null)
                {
                    Logger.Info("访问页面错误:Url = " + response.Url);
                }
                else
                {
                    Logger.Info("访问页面错误:Url = " + url);
                }
            }

            // All attempts failed: stop instead of parsing an error response.
            if (response == null || response.Status != Enums.CrawlResult.Succ)
            {
                return;
            }

            // Trim removes any of the characters '<', '/', 'p', 'r', 'e', '>' from both ends
            // (a character set, not the literal tag) before the JSON payload is parsed.
            CommentJsonResponse tmpResult = JsonConvert.DeserializeObject<CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
            response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);

            // From page 2 on, the page marker must confirm the requested page; otherwise we are past the end.
            var pageMatch = Regex.Match(response.Content, RegexCommentPage, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            if (currentPage != 1 && (!pageMatch.Success || pageMatch.Groups["CurrentPageNum"].Value != currentPage.ToString(CultureInfo.InvariantCulture)))
            {
                return;
            }

            // Fill the tweet with the comments on this page.
            var matches = Regex.Matches(response.Content, RegexComment, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            foreach (Match match in matches)
            {
                Comment comment = new Comment();
                comment.Author = match.Groups["Author"].Value;
                comment.AuthorUrl = RegexParser.AbsoluteUrl(match.Groups["AuthorUrl"].Value, tweet.Url, true);
                comment.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
                comment.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                tweet.Comments.Add(comment);
            }

            currentPage++;
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep the comments collected so far, but record why the crawl stopped.
        Logger.Info("Comment crawl stopped at page " + currentPage.ToString(CultureInfo.InvariantCulture) + " (mid = " + mid + "): " + ex.Message);
    }
}
/// <summary>
/// Builds the feature vector of a group of candidate nodes on an item page. Only the features
/// whose entry in <paramref name="stencilfeature"/> equals 1 are filled in (besides "ItemCount"
/// and "DateParseCount", which are always maintained).
/// </summary>
/// <param name="Nodes">Candidate nodes to measure.</param>
/// <param name="ItemCount">Expected item count, used as the denominator of the rate features.</param>
/// <param name="stencilfeature">Template whose entries equal to 1 select the features to compute.</param>
/// <returns>The computed feature, or null when <paramref name="Nodes"/> is null or <paramref name="ItemCount"/> is 0.</returns>
public Feature GetFeature_ItemPage(IEnumerable <HtmlNode> Nodes, int ItemCount, Feature stencilfeature)
{
    if (Nodes == null || ItemCount == 0)
    {
        return(null);
    }
    Feature feature = new Feature(0);
    feature.FigureFeatures["ItemCount"] = Nodes.Count();
    // Per-node measurements, indexed by the node's position in Nodes.
    int[] TextLen = new int[Nodes.Count()];
    int[] DigiLen = new int[Nodes.Count()];
    double[] Diff = new double[Nodes.Count()];
    int i = 0;
    int[] intone = new int[Nodes.Count()];
    bool havetwonums = true;
    int DigitCount = 0;
    foreach (HtmlNode node in Nodes)
    {
        string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node));
        TextLen[i] = Text.Length;
        DigiLen[i] = TextCleaner.CountDigitChars(Text);
        // Dates are only parsed when the node count is within 80%-120% of ItemCount, the stencil
        // asks for at least one date feature, and the text is longer than one character.
        if (Nodes.Count() >= ItemCount * 0.8 && Nodes.Count() <= ItemCount * 1.2 && (stencilfeature.FigureFeatures["AvgDateDistance"] == 1 || stencilfeature.FigureFeatures["DateParseCount"] == 1 || stencilfeature.FigureFeatures["DateCountRate"] == 1) && Text.Length > 1)
        {
            // Short texts containing "秒前" are replaced by "昨日" before parsing.
            // NOTE(review): the reason for this substitution is not visible here - confirm.
            if (Text.Contains("秒前") && Text.Length < 5)
            {
                Text = "昨日";
            }
            DateTime?Val = DateTimeParser.Parser(Text);
            if (Val != null)
            {
                double diff = Math.Abs((DateTime.Now - (DateTime)Val).TotalDays);
                // Only distances under 4096 days from texts shorter than Threshold.MaxDateLength are recorded.
                if (diff < 4096 && Text.Length < Threshold.MaxDateLength)
                {
                    Diff[i] = diff;
                    feature.FigureFeatures["DateParseCount"] += 1;
                }
            }
        }
        string Textfordigit = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node), true, true, true, false, true, false);
        MatchCollection digiText = Regex.Matches(Textfordigit, @"\d{1,9}");
        // One digit run: store it. Two runs: store first minus second. Anything else: nothing stored.
        // havetwonums stays true only while every node has exactly two runs.
        switch (digiText.Count)
        {
        case 1:
            DigitCount++;
            intone[i] = int.Parse(digiText[0].Captures[0].Value);
            havetwonums = false;
            break;

        case 2:
            DigitCount++;
            intone[i] = int.Parse(digiText[0].Captures[0].Value) - int.Parse(digiText[1].Captures[0].Value);
            break;

        default:
            havetwonums = false;
            break;
        }
        // Character features are only collected while the cumulative text length is below 50.
        if (TextLen.Sum() < 50)
        {
            feature = CheckforChars(feature, Text, stencilfeature, false);
        }
        i += 1;
    }
    // Recognition of ID and class names
    feature = CheckforIdorClassName(feature, Nodes, stencilfeature, false);
    if (stencilfeature.FigureFeatures["DigitCountRate"] == 1)
    {
        // NOTE(review): DigitCount and ItemCount are both int, so this is integer division
        // (the result is truncated) - confirm that is intended.
        feature.FigureFeatures["DigitCountRate"] = 10 * DigitCount / ItemCount;
    }
    if (stencilfeature.FigureFeatures["AvgTextLen"] == 1)
    {
        feature.FigureFeatures["AvgTextLen"] = TextLen.Average();
    }
    if (stencilfeature.FigureFeatures["AllTextLen"] == 1)
    {
        feature.FigureFeatures["AllTextLen"] = TextLen.Sum();
    }
    if (stencilfeature.FigureFeatures["AvgDateDistance"] == 1)
    {
        // Averaged over all nodes; nodes without a recorded date contribute 0.
        feature.FigureFeatures["AvgDateDistance"] = Diff.Average();
    }
    // From here on intone holds only the strictly positive values.
    intone = intone.Where(inton => inton > 0).ToArray();
    if (stencilfeature.FigureFeatures["AvgNumber"] == 1)
    {
        // Base-2 logarithm of the average positive number; 0 when there is none.
        feature.FigureFeatures["AvgNumber"] = intone.Count() == 0 ? 0 : Math.Log(intone.Where(inton => inton > 0).Average(), 2);
    }
    if (stencilfeature.FigureFeatures["DateCountRate"] == 1 && feature.FigureFeatures["ItemCount"] != 0)
    {
        feature.FigureFeatures["DateCountRate"] = 10 * feature.FigureFeatures["DateParseCount"] / ItemCount;
    }
    if (stencilfeature.FigureFeatures["RateTitleDigits"] == 1)
    {
        feature.FigureFeatures["RateTitleDigits"] = TextLen.Sum() + DigiLen.Sum() == 0 ? 0 : 10 * (double)(DigiLen.Sum()) / (double)(TextLen.Sum() + DigiLen.Sum());
    }
    if (stencilfeature.BoolFeatures["twonuminregularshape"] == 1)
    {
        // NOTE(review): intone was filtered to positive values above, so the "k < 0" count is
        // always 0 and this reduces to havetwonums ? 0 : 1 - confirm whether the sign check was
        // meant to run on the unfiltered differences.
        feature.BoolFeatures["twonuminregularshape"] = (havetwonums && (intone.Where(k => k > 0).Count() == 0 || intone.Where(k => k < 0).Count() == 0)) ? 0 : 1;
    }
    // Open question from the original author: the variance of the numeric features, or the ratio
    // of standard deviation to mean, was once considered as an additional feature. Would it help?
    return(feature);
}
/// <summary>
/// Click handler that regenerates both Excel reports: first the DSG workbook from the results of
/// <c>DsgUrl</c>, then the daily workbook with one section per vehicle category, and finally shows
/// a completion message box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void CrawlBtn_Click(object sender, EventArgs e)
{
    // ---- DSG report ----
    var searchHtml = WebRequestProcessor.DownloadHTTPString(DsgUrl);
    var hits = Regex.Matches(searchHtml, baiduRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);

    Workbook dsgBook = new Workbook();
    dsgBook.Open(@"D:\dailyreport\DSG.xlsx");
    var dsgSheet = dsgBook.Worksheets[0];
    int nextDsgRow = 7;

    // Make room first: one inserted row per result whose publish-date text contains "前".
    for (int k = 0; k < hits.Count; k++)
    {
        if (hits[k].Groups["PubDate"].Value.Contains("前"))
        {
            dsgSheet.Cells.InsertRow(nextDsgRow);
        }
    }

    // Then fill the rows in match order.
    for (int k = 0; k < hits.Count; k++)
    {
        Match hit = hits[k];
        if (!hit.Groups["PubDate"].Value.Contains("前"))
        {
            continue;
        }

        var cells = dsgSheet.Cells;
        string link = hit.Groups["Url"].Value;
        try
        {
            // Column 1: media domain derived from the result host.
            string host = new Uri(link).Host;
            cells[nextDsgRow, 1].PutValue(GetUrlDomain(host));
        }
        catch (Exception)
        {
            // Best effort: a link that cannot be resolved to a domain leaves column 1 empty.
        }

        string headline = TextCleaner.FullClean(hit.Groups["Title"].Value);
        string summary = TextCleaner.FullClean(hit.Groups["Text"].Value);
        string cellText = headline + Environment.NewLine + summary;

        int excelRowNumber = nextDsgRow + 1;
        cells[nextDsgRow, 0].PutValue(link);
        cells[nextDsgRow, 5].Formula = "=VLOOKUP(B" + excelRowNumber + ",Sheet2!A:B,2,FALSE)";
        cells[nextDsgRow, 6].PutValue(cellText);
        dsgSheet.Hyperlinks.Add(nextDsgRow, 6, 1, 1, link);
        cells[nextDsgRow, 7].PutValue(DateTime.Now.ToString("yyyy-MM-dd"));
        cells[nextDsgRow, 8].PutValue("负面舆情");

        nextDsgRow++;
    }

    dsgBook.Save(@"D:\dailyreport\DSG.xlsx");

    // ---- Daily report: one section per category, sharing a running row counter ----
    Workbook dailyBook = new Workbook();
    dailyBook.Open(@"D:\dailyreport\日报.xlsx");
    var dailySheet = dailyBook.Worksheets[0];
    int nextDailyRow = 6;

    CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-POLO", poloUrls);
    CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-朗逸", langyiUrls);
    CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-途安", turanUrls);
    CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-帕萨特", pasateUrls);
    CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-桑塔纳", santanaUrls);
    CrawlDailyReport(dailySheet, dailyBook, ref nextDailyRow, "大众-途观", tuguanUrls);

    MessageBox.Show("抓取完成");
}
/// <summary>
/// Fills the missing fields of an article (body, title, publish time, media name, author) from
/// the HTML of its detail page, using the XPaths of a JSON-serialized <c>XpathPattern</c>.
/// </summary>
/// <remarks>
/// Body, title, media name and author are only extracted when the article does not have them
/// yet. The publish time is replaced only when the page's time is the better value (see the
/// comments in the body). A failure in one field is logged and does not stop the others.
/// </remarks>
/// <param name="Html">HTML of the detail page.</param>
/// <param name="Pattern">JSON text of an <c>XpathPattern</c>.</param>
/// <param name="Url">URL of the page; used for content cleaning and in log messages.</param>
/// <param name="BaseArticle">Article to complete.</param>
/// <returns>
/// False when <paramref name="Html"/> or <paramref name="Pattern"/> is blank or the pattern cannot
/// be deserialized (the original logged that error and then dereferenced the null pattern);
/// true otherwise.
/// </returns>
public static bool ParseItem(string Html, string Pattern, string Url, ref Article BaseArticle)
{
    // Input check
    if (string.IsNullOrWhiteSpace(Html) || string.IsNullOrWhiteSpace(Pattern))
    {
        return false;
    }

    // The pattern must deserialize into an XpathPattern.
    XpathPattern xpathPattern = null;
    try
    {
        xpathPattern = JsonConvert.DeserializeObject<XpathPattern>(Pattern);
    }
    catch (Exception ex)
    {
        Logger.Error(string.Format("Pattern 的格式不符合 Xpath Parser 的定义,请检查!Url:{0}, Pattern:{1}.", Url, Pattern), ex);
        return false;
    }
    if (xpathPattern == null)
    {
        // Nothing usable was deserialized; there are no XPaths to apply.
        return false;
    }

    HtmlNode itempagenode = HtmlUtility.getSafeHtmlRootNode(Html, true, true);

    // Article body
    if (string.IsNullOrEmpty(BaseArticle.HtmlContent) && !string.IsNullOrWhiteSpace(xpathPattern.ItemContentXPath))
    {
        try
        {
            BaseArticle.HtmlContent = HTMLCleaner.CleanContent(itempagenode.SelectNodes(xpathPattern.ItemContentXPath), Url, true);
            BaseArticle.Content = HTMLCleaner.CleanHTML(BaseArticle.HtmlContent, false);
        }
        catch (Exception ex)
        {
            Logger.Error(string.Format("从详情页解析正文出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemContentXPath), ex);
        }
    }

    // Title
    if (string.IsNullOrEmpty(BaseArticle.Title) && !string.IsNullOrWhiteSpace(xpathPattern.ItemTitleXPath))
    {
        try
        {
            BaseArticle.Title = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemTitleXPath)));
        }
        catch (Exception ex)
        {
            Logger.Error(string.Format("从详情页解析标题出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemTitleXPath), ex);
        }
    }

    // Publish time
    if (!string.IsNullOrWhiteSpace(xpathPattern.ItemDateXPath))
    {
        try
        {
            DateTime Pubdate = DateTimeParser.Parser(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemDateXPath)));
            bool pageDateUsable = Pubdate.Year > 2000;
            bool pageHasTime = Pubdate.Hour != 0 || Pubdate.Minute != 0;

            if (BaseArticle.PubDate <= DateTime.MinValue.AddYears(1) && pageDateUsable)
            {
                // The article has no real publish time yet.
                BaseArticle.PubDate = Pubdate;
            }
            else if (BaseArticle.PubDate.Hour == 0 && BaseArticle.PubDate.Minute == 0 && pageHasTime && pageDateUsable)
            {
                // The article's time lacks hour and minute; the page supplies them.
                BaseArticle.PubDate = Pubdate;
            }
            else if (pageDateUsable && pageHasTime && (BaseArticle.PubDate - Pubdate) > new TimeSpan(0, 1, 59) && BaseArticle.PubDate >= DateTime.Now.AddMinutes(-10))
            {
                // The article's time is within the last ten minutes, yet more than 1m59s later
                // than the page's time.
                BaseArticle.PubDate = Pubdate;
            }
        }
        catch (Exception ex)
        {
            // The original message said "title" and logged the content XPath (copy-paste error).
            Logger.Error(string.Format("从详情页解析时间出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemDateXPath), ex);
        }
    }

    // Media name
    if (string.IsNullOrEmpty(BaseArticle.MediaName) && !string.IsNullOrWhiteSpace(xpathPattern.ItemMediaNameXPath))
    {
        try
        {
            BaseArticle.MediaName = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemMediaNameXPath)));
        }
        catch (Exception ex)
        {
            Logger.Error(string.Format("从详情页解析媒体出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemMediaNameXPath), ex);
        }
    }

    // Author
    if (string.IsNullOrEmpty(BaseArticle.Author) && !string.IsNullOrWhiteSpace(xpathPattern.ItemAuthorXPath))
    {
        try
        {
            BaseArticle.Author = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemAuthorXPath)));
        }
        catch (Exception ex)
        {
            Logger.Error(string.Format("从详情页解析作者出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemAuthorXPath), ex);
        }
    }

    return true;
}