/// <summary>
/// Link-parsing hook: rebuilds <c>args.UrlDictionary</c> for the current page using the
/// detail-page and next-page patterns read from <c>configModel</c>.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. filter the existing dictionary with a pattern that is meant to match nothing;
/// 2. add every detail link found in the HTML via <c>configModel.kDetailPattern</c> (group 0);
/// 3. add the next-page link found via <c>configModel.kNextPagePattern</c> (group 1).
/// The commented-out regions further down are earlier per-site presets kept for reference.
/// </remarks>
/// <param name="args">Event data handed on to the CustomParseLink_* helpers.</param>
private static void Master_CustomParseLinkEvent3(CustomParseLinkEvent3Args args)
{
    #region Active configuration

    #region Can be modified further
    //if (isDetailMode2 == true)
    //{
    //    CustomParseLink_MainList(args, "今天天气好晴朗,又是刮风又是下雨。"); // matches nothing
    //    CustomParseLink_MainListMode2(args, configModel.kDetailPattern, 0);
    //}
    //else
    //{
    //    CustomParseLink_MainList(args, configModel.kDetailPattern);
    //}
    #endregion

    // Filter pattern that is meant to match no URL, so the filtering step keeps nothing.
    // NOTE(review): the source formatting was collapsed; confirm this call is meant to be active.
    const string matchNothingPattern = "什么都不匹配什么都不匹配什么都不匹配什么都不匹配";
    CustomParseLink_MainList(args, matchNothingPattern);

    // Detail pages: the whole match (group 0) is taken as the link.
    string detailPattern = configModel.kDetailPattern;
    CustomParseLink_MainListMode2(args, detailPattern, 0);

    // Next page: capture group 1 is taken as the link.
    string nextPagePattern = configModel.kNextPagePattern;
    CustomParseLink_NextPageSdau(args, nextPagePattern, 1);

    #endregion

    #region SDAU
    //CustomParseLink_MainList(args, "(view).+?([0-9]{5})"); // subtractive step; next step: just compose one big regular expression
    //CustomParseLink_NextPageSdau(args, "<a .+ href='(.+)'>下一页</a>", 1); // additive step; next step: just compose one big regular expression
    #endregion

    #region Beijing Earthquake Administration
    //CustomParseLink_MainList(args, "今天天气好晴朗,又是刮风又是下雨"); // matches nothing
    //CustomParseLink_NextPageSdau(args, "•<A href=\"(/manage/html/[\\d\\w]{32}/_content/\\d{2}_\\d{2}/\\d{2}/\\d+\\.html)\"", 1); // detail page
    //CustomParseLink_NextPageSdau(args, "<a href=\"(index_\\d+.html)\">下一页</a>", 1); // next page
    #endregion

    #region Civil affairs site (shmzj URL path)
    //// subtractive step (keep what matches the regex); next step: just compose one big regular expression
    //CustomParseLink_MainList(args, @"/gb/shmzj/node4/node\d+/n\d{4}/u1ai\d{5}.html");
    //// additive step; next step: just compose one big regular expression
    //CustomParseLink_NextPageSdau(args, "<a HREF=\"(/gb\\shmzj/node4/node\\d+/n\\d{4}/index\\d+\\.html)\" class=next>下一页</a>", 1); //<a href="(List.action\?[\w\d&=]+)">下一页</a>
    #endregion

    #region Shaanxi
    //// subtractive step; next step: just compose one big regular expression
    //CustomParseLink_MainList(args, @"xg-xxgk-gk-[\d|-]+"); //xg-xxgk-gk-[\d|-]+
    //// additive step; next step: just compose one big regular expression
    //CustomParseLink_NextPageSdau(args, "<a href=\"(List.action\\?[\\w\\d&=]+)\">下一页</a>", 1); //<a href="(List.action\?[\w\d&=]+)">下一页</a>
    #endregion

    #region detail1.jsp site
    //// subtractive step; next step: just compose one big regular expression
    //CustomParseLink_MainList(args, @"detail1.jsp.*id=\d*");
    //// additive step; next step: just compose one big regular expression
    //CustomParseLink_NextPageSdau(args, @"<A .*HREF=(.+) class.*>\s*下一页</A>", 1);
    #endregion

    #region Anjuke
    //CustomParseLink_MainList(args, "http://beijing.anjuke.com/prop/view/.*commsearch_p");
    //CustomParseLink_NextPageSdau(args, "<a href='(.+)' class='aNxt'>下一页 ></a>", 1);
    //CustomParseLink_NextPageSdau(args, "http://beijing.anjuke.com/prop/view/.*commsearch_p", 0);
    #endregion
}
/// <summary>
/// Filters <c>args.UrlDictionary</c> (the subtractive step): only entries whose key, the href,
/// matches <paramref name="patternStr"/> are kept; every other entry is dropped.
/// </summary>
/// <param name="args">
/// Event data whose <c>UrlDictionary</c> is replaced by the filtered copy. When <c>args</c> or its
/// dictionary is null the method returns without doing anything.
/// </param>
/// <param name="patternStr">Regular expression applied, case-insensitively, to each href.</param>
private static void CustomParseLink_MainList(CustomParseLinkEvent3Args args, string patternStr)
{
    if (args == null || args.UrlDictionary == null)
    {
        return;
    }

    // The pattern is the same for every entry, so build the regex once instead of per entry.
    Regex hrefFilter = new Regex(patternStr, RegexOptions.IgnoreCase);

    Dictionary<string, string> kept = new Dictionary<string, string>();
    foreach (var entry in args.UrlDictionary)
    {
        string href = entry.Key;
        if (!string.IsNullOrEmpty(href) && hrefFilter.IsMatch(href))
        {
            kept.Add(href, entry.Value);
        }
    }

    args.UrlDictionary = kept;
}
/// <summary>
/// Scans <c>args.Html</c> for the first match of <paramref name="patternStr"/> and adds the captured
/// link to <c>args.UrlDictionary</c> (the additive step).
/// </summary>
/// <remarks>
/// A captured value that does not start with "http" is treated as a relative link and resolved against
/// <c>args.UrlInfo.UrlString</c>. The dictionary value for a newly added link is a fresh GUID string.
/// A link that is already a key in the dictionary is left untouched.
/// </remarks>
/// <param name="args">Event data supplying the HTML, the page URL and the dictionary to extend. Nothing happens when it is null or has no HTML.</param>
/// <param name="patternStr">Regular expression applied, case-insensitively, to the page HTML.</param>
/// <param name="groupIndex">Index of the capture group that holds the link (0 = whole match).</param>
private static void CustomParseLink_NextPageSdau(CustomParseLinkEvent3Args args, string patternStr, int groupIndex)
{
    if (args == null || string.IsNullOrEmpty(args.Html))
    {
        return;
    }

    Match match = Regex.Match(args.Html, patternStr, RegexOptions.IgnoreCase);
    if (!match.Success)
    {
        return;
    }

    string href = match.Groups[groupIndex].Value;
    if (string.IsNullOrEmpty(href))
    {
        // An empty capture carries no link to follow.
        return;
    }

    Uri baseUri = new Uri(args.UrlInfo.UrlString);

    // Without an "http" prefix the link is taken to be relative and is resolved against the page URL.
    Uri resolved = href.StartsWith("http", StringComparison.OrdinalIgnoreCase)
        ? new Uri(href)
        : new Uri(baseUri, href);
    string url = resolved.AbsoluteUri;

    // Add() throws on an existing key; the link may already have been collected for this page.
    if (!args.UrlDictionary.ContainsKey(url))
    {
        args.UrlDictionary.Add(url, Guid.NewGuid().ToString());
    }
}
/// <summary>
/// Scans <c>args.Html</c> for every match of <paramref name="kDetailPattern"/> and adds each captured
/// link to <c>args.UrlDictionary</c>.
/// </summary>
/// <remarks>
/// A captured value that does not start with "http" is treated as a relative link and resolved against
/// <c>args.UrlInfo.UrlString</c>. The dictionary value for a newly added link is a fresh GUID string.
/// Empty captures are skipped, and a link that is already a key in the dictionary is left untouched,
/// so the same link appearing several times in the HTML is added only once.
/// </remarks>
/// <param name="args">Event data supplying the HTML, the page URL and the dictionary to extend. Nothing happens when it is null or has no HTML.</param>
/// <param name="kDetailPattern">Regular expression applied, case-insensitively, to the page HTML.</param>
/// <param name="groupIndex">Index of the capture group that holds the link (0 = whole match).</param>
private static void CustomParseLink_MainListMode2(CustomParseLinkEvent3Args args, string kDetailPattern, int groupIndex)
{
    if (args == null || string.IsNullOrEmpty(args.Html))
    {
        return;
    }

    // Created on the first usable match only, then reused for the remaining matches.
    Uri baseUri = null;

    foreach (Match match in Regex.Matches(args.Html, kDetailPattern, RegexOptions.IgnoreCase))
    {
        string href = match.Groups[groupIndex].Value;
        if (string.IsNullOrEmpty(href))
        {
            // An empty capture carries no link to follow.
            continue;
        }

        if (baseUri == null)
        {
            baseUri = new Uri(args.UrlInfo.UrlString);
        }

        // Without an "http" prefix the link is taken to be relative and is resolved against the page URL.
        Uri resolved = href.StartsWith("http", StringComparison.OrdinalIgnoreCase)
            ? new Uri(href)
            : new Uri(baseUri, href);
        string url = resolved.AbsoluteUri;

        // Add() throws on an existing key; pages commonly link the same target more than once.
        if (!args.UrlDictionary.ContainsKey(url))
        {
            args.UrlDictionary.Add(url, Guid.NewGuid().ToString());
        }
    }
}
/// <summary>
/// Link-parsing hook with hard-coded patterns: first narrows <c>args.UrlDictionary</c> to the hrefs
/// matching the detail-page pattern, then adds the next-page link found in the HTML (capture group 1).
/// </summary>
/// <param name="args">Event data handed on to the CustomParseLink_* helpers.</param>
private static void Master_CustomParseLinkEvent3(CustomParseLinkEvent3Args args)
{
    /* Earlier "sdau" preset, kept for reference:
    CustomParseLink_MainList(args, "(view).+?([0-9]{5})"); // subtractive step; next step: just compose one big regular expression
    CustomParseLink_NextPageSdau(args, "<a .+ href='(.+)'>下一页</a>", 1); // additive step; next step: just compose one big regular expression
    */

    // Subtractive step: keep only the detail-page links.
    // Next step: just compose one big regular expression.
    const string detailPattern = @"detail1.jsp.*id=\d*";
    CustomParseLink_MainList(args, detailPattern);

    // Additive step: pick up the next-page anchor from the HTML.
    // Next step: just compose one big regular expression.
    const string nextPagePattern = @"<A .*HREF=(.+) class.*>\s*下一页</A>";
    CustomParseLink_NextPageSdau(args, nextPagePattern, 1);
}
/// <summary>
/// Processes the page HTML: takes the first match of <paramref name="patternStr"/> in <c>args.Html</c>
/// and adds the captured link to <c>args.UrlDictionary</c>.
/// </summary>
/// <remarks>
/// A captured value that does not start with "http" is treated as a relative link and resolved against
/// <c>args.UrlInfo.UrlString</c>. The dictionary value for a newly added link is a fresh GUID string.
/// A link that is already a key in the dictionary is left untouched.
/// </remarks>
/// <param name="args">Event data supplying the HTML, the page URL and the dictionary to extend. Nothing happens when it is null or has no HTML.</param>
/// <param name="patternStr">Regular expression applied, case-insensitively, to the page HTML.</param>
/// <param name="groupIndex">Index of the capture group that holds the link (0 = whole match).</param>
private static void CustomParseLink_NextPageSdau(CustomParseLinkEvent3Args args, string patternStr, int groupIndex)
{
    if (args == null || string.IsNullOrEmpty(args.Html))
    {
        return;
    }

    Match match = Regex.Match(args.Html, patternStr, RegexOptions.IgnoreCase);
    if (!match.Success)
    {
        return;
    }

    string href = match.Groups[groupIndex].Value;
    if (string.IsNullOrEmpty(href))
    {
        // An empty capture carries no link to follow.
        return;
    }

    Uri baseUri = new Uri(args.UrlInfo.UrlString);

    // Without an "http" prefix the link is taken to be relative and is resolved against the page URL.
    Uri resolved = href.StartsWith("http", StringComparison.OrdinalIgnoreCase)
        ? new Uri(href)
        : new Uri(baseUri, href);
    string url = resolved.AbsoluteUri;

    // Add() throws on an existing key; the link may already have been collected for this page.
    if (!args.UrlDictionary.ContainsKey(url))
    {
        args.UrlDictionary.Add(url, Guid.NewGuid().ToString());
    }
}
/// <summary>
/// Processes <c>args.UrlDictionary</c>: replaces it with a copy that contains only the entries whose
/// key, the href, matches <paramref name="patternStr"/>.
/// </summary>
/// <param name="args">
/// Event data whose <c>UrlDictionary</c> is filtered. When <c>args</c> or its dictionary is null the
/// method returns without doing anything.
/// </param>
/// <param name="patternStr">Regular expression applied, case-insensitively, to each href.</param>
private static void CustomParseLink_MainList(CustomParseLinkEvent3Args args, string patternStr)
{
    if (args == null || args.UrlDictionary == null)
    {
        return;
    }

    // The pattern is the same for every entry, so build the regex once instead of per entry.
    Regex hrefFilter = new Regex(patternStr, RegexOptions.IgnoreCase);

    Dictionary<string, string> kept = new Dictionary<string, string>();
    foreach (var entry in args.UrlDictionary)
    {
        string href = entry.Key;
        if (!string.IsNullOrEmpty(href) && hrefFilter.IsMatch(href))
        {
            kept.Add(href, entry.Value);
        }
    }

    args.UrlDictionary = kept;
}