Example #1
        private void RunNewTask(DataGridViewCellEventArgs e)
        {
            // Start a new task
            SetWorkingState(e);
            //SetCrawler();
            kiwiConsole.ClearOutput();
            fileId = 0;
            //tempGridview = dgvTaskCapture;
            master           = SetCrawler();
            kiwiThreadStatus = master.ThreadStatus;
            strExit          = "";
            timer.Start(); // 20151204: temporarily commented out
            //isKillTask = false;
            isWriteTaskOver = false;
            // Build one "true" flag in strExit per crawler thread
            for (int i = 0; i < kiwiThreadStatus.Count(); i++)
            {
                strExit += "true";
            }
            //if (ckbDetail2Mode.Checked)
            //{
            //    isDetailMode2 = true;
            //}
            //else
            //{
            //    isDetailMode2 = false;
            //}

            // Start the crawl, then a background thread that writes captured results to the database
            master.Crawl();
            writeThread = new Thread(WriteToDB);
            writeThread.Start();
        }
Example #2
        private static CrawlMaster SetCrawler()
        {
            //SettingDefaultValues();
            //SettingCustomValues();
            var master = new CrawlMaster(Settings);

            master.AddUrlEvent       += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            // master.CustomParseLinkEvent2 += Master_CustomParseLinkEvent2;
            master.CustomParseLinkEvent3 += Master_CustomParseLinkEvent3;
            //master.CustomParseLinkEvent3 += Master_Over;

            return master;
        }
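Every example on this page wires up MasterAddUrlEvent and MasterDataReceivedEvent but never shows their bodies. The sketch below is a minimal, hedged version of what such handlers typically look like in SimpleCrawler demos; it assumes the library's AddUrlEventArgs exposes a Url property and DataReceivedEventArgs exposes Url, Depth and Html, and it reuses the same BloomFilter-based de-duplication seen in the later examples.

        // Minimal sketch (assumptions noted above): de-duplicate URLs with a Bloom filter
        // and log each downloaded page. Replace the bodies with real parsing/persistence logic.
        private static BloomFilter<string> filter = new BloomFilter<string>(200000);

        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            if (!filter.Contains(args.Url))
            {
                filter.Add(args.Url);
                Console.WriteLine(args.Url);
                return true;  // returning true enqueues the URL for crawling
            }

            return false;     // already seen, skip it
        }

        private static void MasterDataReceivedEvent(DataReceivedEventArgs args)
        {
            // args.Html holds the raw page source; parse it here (regex, HtmlAgilityPack, ...)
            Console.WriteLine("{0} (depth {1}): {2} characters received", args.Url, args.Depth, args.Html.Length);
        }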
Example #3
        /// <summary>
        /// The main.
        /// </summary>
        /// <param name="args">
        /// The args.
        /// </param>
        private static void Main(string[] args)
        {
            filter = new BloomFilter<string>(200000);
            const string CityName = "2";

            // Set the seed address
            //Settings.SeedsAddress.Add(string.Format("http://www.cnblogs.com/#p{0}", CityName));
            Settings.SeedsAddress.Add("http://www.cnblogs.com");

            // Set the URL keywords
            //Settings.HrefKeywords.Add(string.Format("/{0}/bj", CityName));
            //Settings.HrefKeywords.Add(string.Format("/{0}/sj", CityName));
            Settings.HrefKeywords.Add("/pick");
            Settings.HrefKeywords.Add("/news");

            // Set the number of crawl threads
            Settings.ThreadCount = 1;

            // Set the crawl depth
            Settings.Depth = 7;

            // Set links (by file extension) to ignore while crawling; multiple entries can be added
            Settings.EscapeLinks.Add(".jpg");

            // Enable automatic speed limiting (a random 1-5 second delay between requests)
            Settings.AutoSpeedLimit = false;

            // Lock the host: strip the subdomain, then treat hosts that compare equal as the same site,
            // e.g. mail.pzcast.com and www.pzcast.com
            Settings.LockHost = false;

            // Set the User-Agent HTTP header for requests
            // Settings.UserAgent has a sensible default; override it only if needed

            // Set the page request timeout (default 15000 ms)
            // Settings.Timeout can be adjusted to suit the target site

            // Set regular expressions used for URL filtering
            // Settings.RegularFilterExpressions.Add("");
            var master = new CrawlMaster(Settings);

            master.AddUrlEvent       += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            master.Crawl();

            Console.ReadKey();
        }
Example #4
 public SimpleCrawlerTest(string url, ILinkStorage linkStorage)
 {
     // Fall back to console output when no link storage is supplied
     _linkStorage = linkStorage ?? new ConsoleLinkStorage();
     Settings.SeedsAddress.Add(url);
     //Settings.ThreadCount = 20;
     Settings.Depth = 5;
     Settings.EscapeLinks.Add(".jpg");
     Settings.EscapeLinks.Add(".gif");
     Settings.EscapeLinks.Add(".png");
     Settings.EscapeLinks.Add(".pdf");
     Settings.EscapeLinks.Add(".doc");
     Settings.EscapeLinks.Add(".xls");
     Settings.AutoSpeedLimit = true;
     _master                    = new CrawlMaster(Settings);
     _master.AddUrlEvent       += MasterAddUrlEvent;
     _master.DataReceivedEvent += MasterDataReceivedEvent;
 }
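Only the constructor of SimpleCrawlerTest is shown above, so the crawl itself is presumably started by another member of the class. A hedged usage sketch of just the part that is shown, where passing null for the link storage falls back to ConsoleLinkStorage (DatabaseLinkStorage below is a hypothetical ILinkStorage implementation):

     // Null storage argument: the constructor falls back to writing discovered links to the console.
     var consoleBacked = new SimpleCrawlerTest("http://www.cnblogs.com", null);

     // Any ILinkStorage implementation can be injected instead, e.g. one that persists links to a database.
     var dbBacked = new SimpleCrawlerTest("http://www.cnblogs.com", new DatabaseLinkStorage()); // DatabaseLinkStorage is hypothetical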
Example #5
        /// <summary>
        /// The main.
        /// </summary>
        /// <param name="args">
        /// The args.
        /// </param>
        private static void Main(string[] args)
        {
            filter = new BloomFilter<string>(200000);
            const string CityName = "2";

            // Set the seed address
            //Settings.SeedsAddress.Add(string.Format("http://www.cnblogs.com/#p{0}", CityName));
            Settings.SeedsAddress.Add("http://www.cnblogs.com");

            // Set the URL keywords
            //Settings.HrefKeywords.Add(string.Format("/{0}/bj", CityName));
            //Settings.HrefKeywords.Add(string.Format("/{0}/sj", CityName));
            Settings.HrefKeywords.Add("/pick");
            Settings.HrefKeywords.Add("/news");

            // Set the number of crawl threads
            Settings.ThreadCount = 1;

            // Set the crawl depth
            Settings.Depth = 7;

            // Set links (by file extension) to ignore while crawling; multiple entries can be added
            Settings.EscapeLinks.Add(".jpg");

            // Enable automatic speed limiting (a random 1-5 second delay between requests)
            Settings.AutoSpeedLimit = false;

            // Lock the host: strip the subdomain, then treat hosts that compare equal as the same site,
            // e.g. mail.pzcast.com and www.pzcast.com
            Settings.LockHost = false;

            // Set the User-Agent HTTP header for requests
            // Settings.UserAgent has a sensible default; override it only if needed

            // Set the page request timeout (default 15000 ms)
            // Settings.Timeout can be adjusted to suit the target site

            // Set regular expressions used for URL filtering
            // Settings.RegularFilterExpressions.Add("");
            var master = new CrawlMaster(Settings);
            master.AddUrlEvent += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            master.Crawl();

            Console.ReadKey();
        }
Example #6
        /// <summary>
        /// The main.
        /// </summary>
        /// <param name="args">
        /// The args.
        /// </param>
        private static void Main(string[] args)
        {
            filter = new BloomFilter<string>(200000);
            //const string CityName = "beijing";

            // Set the seed address
            //Settings.SeedsAddress.Add(string.Format("http://jobs.zhaopin.com/{0}", CityName));//
            //Settings.SeedsAddress.Add("http://news.sdau.edu.cn/list.php?pid=3"); sdau
            Settings.SeedsAddress.Add("http://www.shdrc.gov.cn/gcxm/sub1.jsp?lb=001001");
            //Settings.SeedsAddress.Add("   ");
            // Set the URL keywords
            //Settings.HrefKeywords.Add(string.Format("/{0}/bj", CityName));
            //Settings.HrefKeywords.Add(string.Format("/{0}/sj", CityName));

            // Set the number of crawl threads
            Settings.ThreadCount = 1;

            // Set the crawl depth
            Settings.Depth = 62; // number of listing pages + 1

            // Set links (by file extension) to ignore while crawling; multiple entries can be added
            Settings.EscapeLinks.Add(".jpg");

            // Enable automatic speed limiting (a random 1-5 second delay between requests)
            Settings.AutoSpeedLimit = false;

            // Lock the host: strip the subdomain, then treat hosts that compare equal as the same site,
            // e.g. mail.pzcast.com and www.pzcast.com
            Settings.LockHost = false;

            // Set the User-Agent HTTP header for requests
            // Settings.UserAgent has a sensible default; override it only if needed

            // Set the page request timeout (default 15000 ms)
            // Settings.Timeout can be adjusted to suit the target site

            // Set regular expressions used for URL filtering ("下一页" = "next page")
            //Settings.RegularFilterExpressions.Add("<a .+ href='(.+)'>下一页</a>");//  string strReg = "<a .+ href='(.+)'>下一页</a>";

            var master = new CrawlMaster(Settings);
            master.AddUrlEvent += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            // master.CustomParseLinkEvent2 += Master_CustomParseLinkEvent2;
            master.CustomParseLinkEvent3 += Master_CustomParseLinkEvent3;
            master.Crawl();

            Console.ReadKey();
        }
Example #7
        /// <summary>
        /// The main.
        /// </summary>
        /// <param name="args">
        /// The args.
        /// </param>
        private static void Main(string[] args)
        {
            filter = new BloomFilter<string>(200000);

            const string CityName = "beijing";

            // Set the seed address
            //Settings.SeedsAddress.Add(string.Format("http://jobs.zhaopin.com/{0}", CityName));
            //Settings.SeedsAddress.Add(string.Format("http://www.fzhouse.com.cn:7002/result_new.asp"));

            // Set the URL keywords
            //Settings.HrefKeywords.Add(string.Format("/{0}/bj", CityName));
            //Settings.HrefKeywords.Add(string.Format("/{0}/sj", CityName));

            //Settings.HrefKeywords.Add(string.Format("building.asp?ProjectID="));
            //Settings.HrefKeywords.Add(string.Format("result_new"));
            // Set the number of crawl threads
            Settings.ThreadCount = 5;
            //Settings.ThreadCount = 1;
            // Set the crawl depth
            Settings.Depth = 27;

            // Set links (by file extension) to ignore while crawling; multiple entries can be added
            Settings.EscapeLinks.Add(".jpg");

            // Enable automatic speed limiting (a random 1-5 second delay between requests)
            Settings.AutoSpeedLimit = false;

            // Lock the host: strip the subdomain, then treat hosts that compare equal as the same site,
            // e.g. mail.pzcast.com and www.pzcast.com
            Settings.LockHost = false;

            // Set the User-Agent HTTP header for requests
            // Settings.UserAgent has a sensible default; override it only if needed

            // Set the page request timeout (default 15000 ms)
            // Settings.Timeout can be adjusted to suit the target site

            // Set regular expressions used for URL filtering
            // Settings.RegularFilterExpressions.Add("");

            // YunFeng blog initialization
            //YunFengBlogInit();
            JGZFBlogInit();
            var master = new CrawlMaster(Settings);
            master.AddUrlEvent += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            master.Crawl();
            //Console.WriteLine("遍历结束");
            Console.ReadKey();
        }
Example #8
        /// <summary>
        /// The main.
        /// </summary>
        /// <param name="args">
        /// The args.
        /// </param>
        private static void Main(string[] args)
        {
            if (args.Count() > 0)
            {
                crawlerClassName = args[0];
            }
            if (string.IsNullOrEmpty(crawlerClassName))
            {
                Console.WriteLine("请-classname 设置对应的爬取类");
                crawlerClassName = Console.ReadLine();
                if (string.IsNullOrEmpty(crawlerClassName))
                {
                    return;
                }
            }

            var factoryClassName = string.Format("SimpleCrawler.Demo.{0}", crawlerClassName);

            filter = new BloomFilter<string>(5000000);
            //LandFangUserUpdateCrawler, LandFangCrawler
            //SimpleCrawler.Demo.LandFangUserUpdateCrawler: updates the masked (*) data via a simulated login
            //LandFangCityRegionCrawler: fetches the guidCode mapping for each city/district
            //LandFangCityRegionUpdateCrawler: updates the transaction status and district
            //QiXinEnterpriseCrawler: crawls QiXin for the enterprise-to-guid mapping
            Console.WriteLine(connStr);
            Console.WriteLine(crawlerClassName);
            Console.WriteLine("确认数据库连接后继续进行");
            simpleCrawler = SimpleCrawlerFactory.Instance.Create(factoryClassName, Settings, filter, dataop);
            //Console.ReadLine();
            //const string CityName = "beijing";
            // Set the seed address; also add it to the Bloom filter so the seed is not fetched a second time
            //Settings.SeedsAddress.Add(string.Format("http://jobs.zhaopin.com/{0}", CityName));
            // Settings.SeedsAddress.Add(string.Format("http://www.fzhouse.com.cn:7002/result_new.asp"));
            // Set the URL keywords
            //Settings.HrefKeywords.Add(string.Format("/{0}/bj", CityName));
            //Settings.HrefKeywords.Add(string.Format("/{0}/sj", CityName));
            //Settings.HrefKeywords.Add(string.Format("building.asp?ProjectID="));
            //Settings.HrefKeywords.Add(string.Format("result_new"));
            // Set the number of crawl threads
            //Settings.ThreadCount = 5;
            //Settings.ThreadCount = 1;
            // Set the crawl depth
            Settings.Depth = 27;

            // Set links (by file extension) to ignore while crawling; multiple entries can be added
            Settings.EscapeLinks.Add(".jpg");

            // Enable automatic speed limiting (a random 1-5 second delay between requests)
            Settings.AutoSpeedLimit = false;

            // Lock the host: strip the subdomain, then treat hosts that compare equal as the same site,
            // e.g. mail.pzcast.com and www.pzcast.com
            Settings.LockHost = false;

            // Whether to enable a web proxy
            Settings.CurWebProxy       = GetWebProxy();
            Settings.CurWebProxyString = GetWebProxyString();


            // Set the User-Agent HTTP header for requests
            // Settings.UserAgent has a sensible default; override it only if needed

            // Set the page request timeout (default 15000 ms)
            // Settings.Timeout can be adjusted to suit the target site

            // Set regular expressions used for URL filtering
            // Settings.RegularFilterExpressions.Add("http://land.fang.com/market/a0a95a6f-43d4-4b59-a948-d48f21a4e468.html");

            // Proxy IP mode
            //Settings.IPProxyList = new List<IPProxy>();
            //var ipProxyList = dataop.FindAllByQuery("IPProxy", Query.NE("status", "1")).ToList();
            //Settings.IPProxyList.AddRange(ipProxyList.Select(c => new IPProxy(c.Text("ip"))));
            //Settings.IPProxyList.Add(new IPProxy("31.168.236.236:8080"));

            // YunFeng blog initialization
            //fang99Init();
            //JGZFBlogInit();
            simpleCrawler.SettingInit();
            StartDBChangeProcess();
            var master = new CrawlMaster(Settings);

            master.AddUrlEvent       += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            master.CrawlErrorEvent   += CrawlErrorEvent;
            master.Crawl();
            // Console.WriteLine("遍历结束");
            if (UrlQueue.Instance.Count > 0)
            {
                Console.ReadKey();
            }
        }
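GetWebProxy and GetWebProxyString are called in this example but not shown. Below is a minimal sketch under the assumption that CurWebProxy accepts a System.Net.WebProxy and CurWebProxyString is the same endpoint as a plain "host:port" string; the endpoint value and both helper bodies are hypothetical, and the real code presumably reads them from configuration or a proxy pool.

        // Hypothetical proxy helpers; the endpoint below is a placeholder value.
        private const string ProxyHost = "127.0.0.1";
        private const int    ProxyPort = 8080;

        private static System.Net.WebProxy GetWebProxy()
        {
            return new System.Net.WebProxy(ProxyHost, ProxyPort);
        }

        private static string GetWebProxyString()
        {
            return string.Format("{0}:{1}", ProxyHost, ProxyPort);
        }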
Example #9
        private static void Main(string[] args)
        {
            Crawler.Sample.BootStrapper.Startup.Configure();

            _IArticlesService = IocContainer.Default.Resolve<IArticlesService>();

            // Start the logging component
            log4net.Config.XmlConfigurator.Configure();

            // Start the index manager
            IndexManager.Instance.Start();

            /* Get the URLs from the IE browser favorites:
             * BrowserCollection browserCollection = new BrowserCollection();
             * List<string> urlList = browserCollection.GetBrowserCollectionsUrl();
             */

            List<string> urlList = GetHtmlUrlLink(ReadFile(sourceFile));

            //urlList.Add("http://www.ithao123.cn/content-4285584.html");
            //urlList.Add("http://www.cnblogs.com/yangecnu/p/Introduce-RabbitMQ.html");
            //urlList.Add("http://www.cnblogs.com/Andon_liu/p/5401961.html");
            //urlList.Add("http://www.cnblogs.com/lsjwq/p/5509096.html");
            //urlList.Add("http://www.cnblogs.com/kid-blog/p/4796355.html");
            //urlList.Add("http://www.cnblogs.com/ants/p/5122068.html");
            //urlList.Add("http://www.cnblogs.com/zery/p/5215572.html");
            //urlList.Add("http://www.cnblogs.com/JamesLi2015/p/4744008.html");
            //urlList.Add("http://www.cnblogs.com/kklldog/p/helios_chat_room.html");

            filter = new BloomFilter<string>(200000);

            // Only seed URLs that the articles service has not already stored
            foreach (var url in urlList)
            {
                var alreadyStored = _IArticlesService.GetByUrl(url);
                if (url.Length > 0 && !alreadyStored)
                {
                    Settings.SeedsAddress.Add(url);
                }
            }

            // Set the URL keywords
            //Settings.HrefKeywords.Add(string.Format("/{0}/bj", CityName));
            //Settings.HrefKeywords.Add(string.Format("/{0}/sj", CityName));

            // Set the number of crawl threads
            Settings.ThreadCount = 5;

            // Set the crawl depth
            Settings.Depth = 1;

            // Set links (by file extension) to ignore while crawling; multiple entries can be added
            //Settings.EscapeLinks.Add(".jpg");

            // Enable automatic speed limiting (a random 1-5 second delay between requests)
            Settings.AutoSpeedLimit = false;

            // Lock the host: strip the subdomain, then treat hosts that compare equal as the same site,
            // e.g. mail.pzcast.com and www.pzcast.com
            Settings.LockHost = false;

            // Set the User-Agent HTTP header for requests
            // Settings.UserAgent has a sensible default; override it only if needed

            // Set the page request timeout (default 15000 ms)
            // Settings.Timeout can be adjusted to suit the target site

            // Set regular expressions used for URL filtering
            // Settings.RegularFilterExpressions.Add("");
            var master = new CrawlMaster(Settings);

            master.AddUrlEvent       += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            master.Crawl();

            Console.ReadKey();
        }
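ReadFile and GetHtmlUrlLink are not part of the listing above. The sketch below is a hedged guess at the shape they suggest, assuming sourceFile points to a saved HTML page (for example an exported bookmarks file) and that absolute http(s) links are pulled out of href attributes with a simple regular expression; both helper bodies are hypothetical.

        // Hypothetical helpers: read the source file and extract every absolute href value.
        private static string ReadFile(string path)
        {
            return System.IO.File.ReadAllText(path, System.Text.Encoding.UTF8);
        }

        private static List<string> GetHtmlUrlLink(string html)
        {
            var urls = new List<string>();
            var matches = System.Text.RegularExpressions.Regex.Matches(
                html, "href=[\"'](https?://[^\"']+)[\"']",
                System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            foreach (System.Text.RegularExpressions.Match m in matches)
            {
                urls.Add(m.Groups[1].Value);
            }

            return urls;
        }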