예제 #1
0
        /// <summary>
        /// Link-parsing hook: discards the links already collected in <paramref name="args"/>, then
        /// re-collects detail-page links and the next-page link from the page HTML using the
        /// patterns held by <c>configModel</c>.
        /// </summary>
        /// <param name="args">Event arguments forwarded unchanged to each helper.</param>
        private static void Master_CustomParseLinkEvent3(CustomParseLinkEvent3Args args)
        {
            // Group indexes handed to the helpers: 0 is the whole match, 1 is the first capture group.
            const int wholeMatch   = 0;
            const int firstCapture = 1;

            // Placeholder pattern meant to match nothing, so the filter step drops every parsed link.
            const string matchNothingPattern = "什么都不匹配什么都不匹配什么都不匹配什么都不匹配";

            #region Active pipeline
            // Disabled variant that switched on a detail-mode flag (kept for further tuning):
            //if (isDetailMode2 == true)
            //{
            //    CustomParseLink_MainList(args, "今天天气好晴朗,又是刮风又是下雨。"); // matches nothing
            //    CustomParseLink_MainListMode2(args, configModel.kDetailPattern, 0);
            //}
            //else
            //{
            //    CustomParseLink_MainList(args, configModel.kDetailPattern);
            //}

            CustomParseLink_MainList(args, matchNothingPattern);
            CustomParseLink_MainListMode2(args, configModel.kDetailPattern, wholeMatch);
            CustomParseLink_NextPageSdau(args, configModel.kNextPagePattern, firstCapture); // next page
            #endregion

            // Everything below is a catalogue of disabled per-site recipes.
            // "filter" = keep only hrefs matching the pattern; "add" = pull a link out of the HTML.

            #region SDAU
            //CustomParseLink_MainList(args, "(view).+?([0-9]{5})");                     // filter
            //CustomParseLink_NextPageSdau(args, "<a .+ href='(.+)'>下一页</a>", 1);      // add next page
            #endregion

            #region Beijing Earthquake Agency
            //CustomParseLink_MainList(args, "今天天气好晴朗,又是刮风又是下雨"); // matches nothing
            //CustomParseLink_NextPageSdau(args, "•<A href=\"(/manage/html/[\\d\\w]{32}/_content/\\d{2}_\\d{2}/\\d{2}/\\d+\\.html)\"", 1); // detail page
            //CustomParseLink_NextPageSdau(args, "<a href=\"(index_\\d+.html)\">下一页</a>", 1); // next page
            #endregion

            #region shmzj site
            //CustomParseLink_MainList(args, @"/gb/shmzj/node4/node\d+/n\d{4}/u1ai\d{5}.html"); // filter (keep matching hrefs)
            //CustomParseLink_NextPageSdau(args, "<a HREF=\"(/gb\\shmzj/node4/node\\d+/n\\d{4}/index\\d+\\.html)\" class=next>下一页</a>", 1); // add next page
            #endregion

            #region Shaanxi
            //CustomParseLink_MainList(args, @"xg-xxgk-gk-[\d|-]+"); // filter
            //CustomParseLink_NextPageSdau(args, "<a href=\"(List.action\\?[\\w\\d&=]+)\">下一页</a>", 1); // add next page; unescaped: <a href="(List.action\?[\w\d&=]+)">下一页</a>
            #endregion

            #region detail1.jsp site
            //CustomParseLink_MainList(args, @"detail1.jsp.*id=\d*"); // filter
            //CustomParseLink_NextPageSdau(args, @"<A .*HREF=(.+) class.*>\s*下一页</A>", 1); // add next page
            #endregion

            #region Anjuke
            //CustomParseLink_MainList(args, "http://beijing.anjuke.com/prop/view/.*commsearch_p");
            //CustomParseLink_NextPageSdau(args, "<a href='(.+)' class='aNxt'>下一页 &gt;</a>", 1);
            //CustomParseLink_NextPageSdau(args, "http://beijing.anjuke.com/prop/view/.*commsearch_p", 0);
            #endregion
        }
예제 #2
0
        /// <summary>
        /// Filters <c>args.UrlDictionary</c> (a subtractive step): the dictionary is replaced by a
        /// new one holding only the entries whose key (the link href) matches
        /// <paramref name="patternStr"/>.
        /// </summary>
        /// <param name="args">
        /// Event arguments whose <c>UrlDictionary</c> is filtered. Nothing happens when this or its
        /// dictionary is null.
        /// </param>
        /// <param name="patternStr">Regular expression tested, ignoring case, against each href.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="patternStr"/> is null or not a valid regular expression.
        /// </exception>
        private static void CustomParseLink_MainList(CustomParseLinkEvent3Args args, string patternStr)
        {
            // Nothing to filter; avoids a NullReferenceException on the enumeration below.
            if (args == null || args.UrlDictionary == null)
            {
                return;
            }

            // The pattern is loop-invariant, so build the regex once rather than per entry.
            Regex regex = new Regex(patternStr, RegexOptions.IgnoreCase);
            Dictionary<string, string> kept = new Dictionary<string, string>();

            foreach (var item in args.UrlDictionary)
            {
                string href = item.Key;

                // Empty hrefs are always dropped, whatever the pattern is.
                if (!string.IsNullOrEmpty(href) && regex.IsMatch(href))
                {
                    kept.Add(href, item.Value);
                }
            }

            args.UrlDictionary = kept;
        }
예제 #3
0
        /// <summary>
        /// Scans <c>args.Html</c> for the first match of <paramref name="patternStr"/> and adds the
        /// captured link to <c>args.UrlDictionary</c> (an additive step). The link is resolved
        /// against the current page URL (<c>args.UrlInfo.UrlString</c>) so relative links become
        /// absolute; the stored value is a freshly generated GUID string.
        /// </summary>
        /// <param name="args">
        /// Event arguments supplying the HTML, the page URL and the target dictionary. Nothing
        /// happens when this is null or its HTML is null/empty.
        /// </param>
        /// <param name="patternStr">Regular expression applied, ignoring case, to the HTML.</param>
        /// <param name="groupIndex">Index of the capture group holding the link (0 is the whole match).</param>
        private static void CustomParseLink_NextPageSdau(CustomParseLinkEvent3Args args, string patternStr, int groupIndex)
        {
            if (args == null || string.IsNullOrEmpty(args.Html))
            {
                return;
            }

            Regex regex = new Regex(patternStr, RegexOptions.IgnoreCase);
            Match mat   = regex.Match(args.Html);
            if (!mat.Success)
            {
                return;
            }

            // A group that did not participate (or an out-of-range index) yields an empty value;
            // resolving "" would just re-add the current page, so skip it.
            string link = mat.Groups[groupIndex].Value;
            if (string.IsNullOrEmpty(link))
            {
                return;
            }

            // Resolving against the base handles both cases: an absolute link is kept as-is and a
            // relative one is combined with the page URL. TryCreate skips a malformed link
            // instead of throwing UriFormatException.
            var baseUri = new Uri(args.UrlInfo.UrlString);
            Uri currentUri;
            if (!Uri.TryCreate(baseUri, link, out currentUri))
            {
                return;
            }

            // Dictionary.Add throws on an existing key, and the link may already have been
            // collected; keep the existing entry in that case.
            string url = currentUri.AbsoluteUri;
            if (!args.UrlDictionary.ContainsKey(url))
            {
                args.UrlDictionary.Add(url, Guid.NewGuid().ToString());
            }
        }
예제 #4
0
        /// <summary>
        /// Scans <c>args.Html</c> for every match of <paramref name="kDetailPattern"/> and adds each
        /// captured link to <c>args.UrlDictionary</c>. Links are resolved against the current page
        /// URL (<c>args.UrlInfo.UrlString</c>) so relative links become absolute; each stored value
        /// is a freshly generated GUID string.
        /// </summary>
        /// <param name="args">
        /// Event arguments supplying the HTML, the page URL and the target dictionary. Nothing
        /// happens when this is null or its HTML is null/empty.
        /// </param>
        /// <param name="kDetailPattern">Regular expression applied, ignoring case, to the HTML.</param>
        /// <param name="groupIndex">Index of the capture group holding the link (0 is the whole match).</param>
        private static void CustomParseLink_MainListMode2(CustomParseLinkEvent3Args args, string kDetailPattern, int groupIndex)
        {
            if (args == null || string.IsNullOrEmpty(args.Html))
            {
                return;
            }

            MatchCollection matches = Regex.Matches(args.Html, kDetailPattern, RegexOptions.IgnoreCase);

            // The page URL is the same for every match; it is built on first use so a page
            // without usable matches never parses it.
            Uri baseUri = null;

            // Every element of a MatchCollection is a successful match, so no Success check is needed.
            foreach (Match item in matches)
            {
                // A group that did not participate (or an out-of-range index) yields an empty
                // value; resolving "" would just add the current page, so skip it.
                string link = item.Groups[groupIndex].Value;
                if (string.IsNullOrEmpty(link))
                {
                    continue;
                }

                if (baseUri == null)
                {
                    baseUri = new Uri(args.UrlInfo.UrlString);
                }

                // Resolving against the base keeps absolute links as-is and makes relative ones
                // absolute. One malformed link is skipped rather than aborting the whole page.
                Uri currentUri;
                if (!Uri.TryCreate(baseUri, link, out currentUri))
                {
                    continue;
                }

                // The same link often appears several times on a page and Dictionary.Add throws
                // on an existing key, so only the first occurrence is added.
                string url = currentUri.AbsoluteUri;
                if (!args.UrlDictionary.ContainsKey(url))
                {
                    args.UrlDictionary.Add(url, Guid.NewGuid().ToString());
                }
            }
        }
예제 #5
0
 /// <summary>
 /// Link-parsing hook: keeps only the already-parsed links whose href matches the detail
 /// pattern, then adds the next-page link found in the page HTML.
 /// </summary>
 /// <param name="args">Event arguments forwarded unchanged to each helper.</param>
 private static void Master_CustomParseLinkEvent3(CustomParseLinkEvent3Args args)
 {
     // Capture group 1 of the next-page pattern holds the link target.
     const int firstCapture = 1;

     // Pattern an href must match to survive the filter step.
     const string keepHrefPattern = @"detail1.jsp.*id=\d*";

     // Pattern locating the next-page anchor in the HTML.
     const string nextPagePattern = @"<A .*HREF=(.+) class.*>\s*下一页</A>";

     /* Disabled SDAU recipe:
     CustomParseLink_MainList(args, "(view).+?([0-9]{5})");                  // filter
     CustomParseLink_NextPageSdau(args, "<a .+ href='(.+)'>下一页</a>", 1);   // add next page
     */

     CustomParseLink_MainList(args, keepHrefPattern);                        // filter
     CustomParseLink_NextPageSdau(args, nextPagePattern, firstCapture);      // add next page
 }
예제 #6
0
 /// <summary>
 /// Scans <c>args.Html</c> for the first match of <paramref name="patternStr"/> and adds the
 /// captured link to <c>args.UrlDictionary</c>. The link is resolved against the current page
 /// URL (<c>args.UrlInfo.UrlString</c>) so relative links become absolute; the stored value is
 /// a freshly generated GUID string.
 /// </summary>
 /// <param name="args">
 /// Event arguments supplying the HTML, the page URL and the target dictionary. Nothing
 /// happens when this is null or its HTML is null/empty.
 /// </param>
 /// <param name="patternStr">Regular expression applied, ignoring case, to the HTML.</param>
 /// <param name="groupIndex">Index of the capture group holding the link (0 is the whole match).</param>
 private static void CustomParseLink_NextPageSdau(CustomParseLinkEvent3Args args, string patternStr, int groupIndex)
 {
     if (args == null || string.IsNullOrEmpty(args.Html))
     {
         return;
     }

     Regex regex = new Regex(patternStr, RegexOptions.IgnoreCase);
     Match mat = regex.Match(args.Html);
     if (!mat.Success)
     {
         return;
     }

     // A group that did not participate (or an out-of-range index) yields an empty value;
     // resolving "" would just re-add the current page, so skip it.
     string link = mat.Groups[groupIndex].Value;
     if (string.IsNullOrEmpty(link))
     {
         return;
     }

     // Resolving against the base handles both cases: an absolute link is kept as-is and a
     // relative one is combined with the page URL. TryCreate skips a malformed link instead
     // of throwing UriFormatException.
     var baseUri = new Uri(args.UrlInfo.UrlString);
     Uri currentUri;
     if (!Uri.TryCreate(baseUri, link, out currentUri))
     {
         return;
     }

     // Dictionary.Add throws on an existing key, and the link may already have been
     // collected; keep the existing entry in that case.
     string url = currentUri.AbsoluteUri;
     if (!args.UrlDictionary.ContainsKey(url))
     {
         args.UrlDictionary.Add(url, Guid.NewGuid().ToString());
     }
 }
예제 #7
0
        /// <summary>
        /// Filters <c>args.UrlDictionary</c>: the dictionary is replaced by a new one holding only
        /// the entries whose key (the link href) matches <paramref name="patternStr"/>.
        /// </summary>
        /// <param name="args">
        /// Event arguments whose <c>UrlDictionary</c> is filtered. Nothing happens when this or its
        /// dictionary is null.
        /// </param>
        /// <param name="patternStr">Regular expression tested, ignoring case, against each href.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="patternStr"/> is null or not a valid regular expression.
        /// </exception>
        private static void CustomParseLink_MainList(CustomParseLinkEvent3Args args, string patternStr)
        {
            // Nothing to filter; avoids a NullReferenceException on the enumeration below.
            if (args == null || args.UrlDictionary == null)
            {
                return;
            }

            // The pattern is loop-invariant, so build the regex once rather than per entry.
            Regex regex = new Regex(patternStr, RegexOptions.IgnoreCase);
            Dictionary<string, string> kept = new Dictionary<string, string>();

            foreach (var item in args.UrlDictionary)
            {
                string href = item.Key;

                // Empty hrefs are always dropped, whatever the pattern is.
                if (!string.IsNullOrEmpty(href) && regex.IsMatch(href))
                {
                    kept.Add(href, item.Value);
                }
            }

            args.UrlDictionary = kept;
        }