/// <summary>
/// Starts a new crawl task: resets the per-run state, obtains a crawler from
/// <c>SetCrawler</c>, starts it, and then starts the thread that runs <c>WriteToDB</c>.
/// </summary>
/// <param name="e">Grid cell event data; passed straight to <c>SetWorkingState</c>.</param>
private void RunNewTask(DataGridViewCellEventArgs e)
{
    // Begin a new task.
    SetWorkingState(e);
    kiwiConsole.ClearOutput();
    fileId = 0;

    master = SetCrawler();
    kiwiThreadStatus = master.ThreadStatus;
    strExit = "";

    // The original carries a note dated 2015-12-04 saying "temporarily commented out",
    // but the call itself is active.
    timer.Start();
    isWriteTaskOver = false;

    // Append one "true" marker to strExit for every entry in the thread-status collection.
    int slot = 0;
    while (slot < kiwiThreadStatus.Count())
    {
        strExit += "true";
        slot++;
    }

    // Launch the crawl first, then the writer thread.
    master.Crawl();
    writeThread = new Thread(WriteToDB);
    writeThread.Start();
}
/// <summary>
/// Console entry point: loads the dictionary files and the configured start URL and
/// thread count, wires the crawl events, starts the crawl and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create the Bloom filter.
    filter = new BloomFilter<string>(200000);

    // Crawler exclusion list.
    UrlDebar = ReadTxtContent(path + "\\Dictionary");

    // Crawler dictionary suffixes.
    UrlSuffix = ReadTxtContent(path + "\\Dictionary\\UrlSuffix");

    var configured = ConfigurationManager.ConnectionStrings;

    // Starting URL; the configuration key really is spelled "MianUrl".
    string seedUrl = configured["MianUrl"].ToString();

    // Thread count taken from the configuration file.
    int workerCount = Convert.ToInt32(configured["ThreadCount"].ToString());

    // Register the start address, then apply the thread count.
    master.AddUrl(seedUrl);
    master.ThreadCount = workerCount;

    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;
    master.Crawl();

    Console.ReadKey(true);
}
/// <summary>
/// Console entry point: configures the shared crawler settings for
/// http://www.cnblogs.com, starts the crawl and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
private static void Main(string[] args)
{
    filter = new BloomFilter<string>(200000);

    // Seed address.
    Settings.SeedsAddress.Add("http://www.cnblogs.com");

    // URL keywords.
    Settings.HrefKeywords.Add("/pick");
    Settings.HrefKeywords.Add("/news");

    // Number of crawl threads.
    Settings.ThreadCount = 1;

    // Crawl depth.
    Settings.Depth = 7;

    // Links ignored while crawling, matched by suffix; several may be added.
    Settings.EscapeLinks.Add(".jpg");

    // Automatic rate limiting (described in the original notes as a random
    // 1-5 second interval) is switched off.
    Settings.AutoSpeedLimit = false;

    // Host locking is switched off. Per the original notes, when enabled the
    // domains are compared with the sub-domain removed, e.g. mail.pzcast.com
    // and www.pzcast.com count as the same site.
    Settings.LockHost = false;

    // Left at their defaults according to the original notes: UserAgent,
    // Timeout (15000 ms) and RegularFilterExpressions.

    var master = new CrawlMaster(Settings);
    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;
    master.Crawl();

    Console.ReadKey();
}
/// <summary>
/// Console entry point. Fills in the shared <c>Settings</c> object for a crawl of
/// http://www.cnblogs.com, runs a <c>CrawlMaster</c> over it and then blocks on a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
private static void Main(string[] args)
{
    filter = new BloomFilter<string>(200000);

    // Where the crawl starts and which href keywords are registered.
    Settings.SeedsAddress.Add("http://www.cnblogs.com");
    Settings.HrefKeywords.Add("/pick");
    Settings.HrefKeywords.Add("/news");

    // Thread count and crawl depth.
    Settings.ThreadCount = 1;
    Settings.Depth = 7;

    // Suffix-based list of links to skip; more suffixes can be added.
    Settings.EscapeLinks.Add(".jpg");

    // Auto speed limit off (original note: random 1-5 second interval when on).
    Settings.AutoSpeedLimit = false;

    // Host lock off (original note: when on, domains are compared after removing
    // the sub-domain, so mail.pzcast.com and www.pzcast.com are the same site).
    Settings.LockHost = false;

    // UserAgent, Timeout (original note: default 15000 ms) and
    // RegularFilterExpressions are not set here.

    var master = new CrawlMaster(Settings);
    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;
    master.Crawl();

    Console.ReadKey();
}
/// <summary>
/// Console entry point: configures the shared crawler settings for the
/// www.shdrc.gov.cn listing page, registers the custom link parser, starts the
/// crawl and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
private static void Main(string[] args)
{
    filter = new BloomFilter<string>(200000);

    // Seed address.
    Settings.SeedsAddress.Add("http://www.shdrc.gov.cn/gcxm/sub1.jsp?lb=001001");

    // Number of crawl threads.
    Settings.ThreadCount = 1;

    // Crawl depth; the original note says "number of pages + 1".
    Settings.Depth = 62;

    // Links ignored while crawling, matched by suffix; several may be added.
    Settings.EscapeLinks.Add(".jpg");

    // Automatic rate limiting (original note: random 1-5 second interval) is off.
    Settings.AutoSpeedLimit = false;

    // Host locking is off. Per the original notes, when enabled the domains are
    // compared with the sub-domain removed, e.g. mail.pzcast.com and
    // www.pzcast.com count as the same site.
    Settings.LockHost = false;

    // UserAgent and Timeout (original note: default 15000 ms) are left at their defaults.
    // No regular-expression filter is registered. The "next page" pattern that the
    // original kept in an unused local was: <a .+ href='(.+)'>下一页</a>

    var master = new CrawlMaster(Settings);
    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;

    // Only the third custom link-parse event is subscribed.
    master.CustomParseLinkEvent3 += Master_CustomParseLinkEvent3;
    master.Crawl();

    Console.ReadKey();
}
/// <summary>
/// Runs the crawl by calling <c>Crawl</c> on the held master instance.
/// </summary>
public void Execute() => _master.Crawl();
/// <summary>
/// Console entry point: applies the common crawler settings, runs
/// <c>JGZFBlogInit</c>, starts the crawl and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
private static void Main(string[] args)
{
    filter = new BloomFilter<string>(200000);

    // No seed address or URL keyword is added here.
    // NOTE(review): presumably JGZFBlogInit() below supplies them — confirm.

    // Number of crawl threads.
    Settings.ThreadCount = 5;

    // Crawl depth.
    Settings.Depth = 27;

    // Links ignored while crawling, matched by suffix; several may be added.
    Settings.EscapeLinks.Add(".jpg");

    // Automatic rate limiting (original note: random 1-5 second interval) is off.
    Settings.AutoSpeedLimit = false;

    // Host locking is off. Per the original notes, when enabled the domains are
    // compared with the sub-domain removed, e.g. mail.pzcast.com and
    // www.pzcast.com count as the same site.
    Settings.LockHost = false;

    // UserAgent, Timeout (original note: default 15000 ms) and
    // RegularFilterExpressions are left untouched.

    // Site-specific initialisation (the YunFeng blog variant is disabled in the original).
    JGZFBlogInit();

    var master = new CrawlMaster(Settings);
    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;
    master.Crawl();

    Console.ReadKey();
}
/// <summary>
/// Console entry point: resolves the crawler class name (first argument or
/// console input), creates that crawler through <c>SimpleCrawlerFactory</c>,
/// applies the common settings and runs the crawl.
/// </summary>
/// <param name="args">
/// Command-line arguments; when present, the first one is used as the crawler class name.
/// </param>
private static void Main(string[] args)
{
    // Array length check; no LINQ needed.
    if (args.Length > 0)
    {
        crawlerClassName = args[0];
    }

    // No class name yet: ask on the console and give up if nothing is entered.
    if (string.IsNullOrEmpty(crawlerClassName))
    {
        Console.WriteLine("请-classname 设置对应的爬取类");
        crawlerClassName = Console.ReadLine();
        if (string.IsNullOrEmpty(crawlerClassName))
        {
            return;
        }
    }

    var factoryClassName = string.Format("SimpleCrawler.Demo.{0}", crawlerClassName);
    filter = new BloomFilter<string>(5000000);

    // Crawler class names listed in the original notes:
    //   LandFangCrawler
    //   LandFangUserUpdateCrawler       - updates the "*"-masked data via a simulated login
    //   LandFangCityRegionCrawler       - fetches the guidCode mapping for city districts
    //   LandFangCityRegionUpdateCrawler - updates transaction status and district
    //   QiXinEnterpriseCrawler          - QiXin crawl mapping enterprises to guids
    Console.WriteLine(connStr);
    Console.WriteLine(crawlerClassName);
    Console.WriteLine("确认数据库连接后继续进行");
    simpleCrawler = SimpleCrawlerFactory.Instance.Create(factoryClassName, Settings, filter, dataop);

    // Original note on seeds: seed addresses also need to be added to the Bloom
    // filter so that they are not read a second time. No seed is added here.

    // Crawl depth.
    Settings.Depth = 27;

    // Links ignored while crawling, matched by suffix; several may be added.
    Settings.EscapeLinks.Add(".jpg");

    // Automatic rate limiting (original note: random 1-5 second interval) is off.
    Settings.AutoSpeedLimit = false;

    // Host locking is off. Per the original notes, when enabled the domains are
    // compared with the sub-domain removed, e.g. mail.pzcast.com and
    // www.pzcast.com count as the same site.
    Settings.LockHost = false;

    // Proxy configuration.
    Settings.CurWebProxy = GetWebProxy();
    Settings.CurWebProxyString = GetWebProxyString();

    // UserAgent, Timeout (original note: default 15000 ms), RegularFilterExpressions
    // and the IP proxy list are not set here.

    simpleCrawler.SettingInit();
    StartDBChangeProcess();

    var master = new CrawlMaster(Settings);
    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;
    master.CrawlErrorEvent += CrawlErrorEvent;
    master.Crawl();

    // Wait for a key press only if URLs are still queued at this point.
    if (UrlQueue.Instance.Count > 0)
    {
        Console.ReadKey();
    }
}
/// <summary>
/// Console entry point: bootstraps the application services, builds the seed
/// list from the URLs found in <c>sourceFile</c> for which the article service
/// returns false, then runs the crawl and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
private static void Main(string[] args)
{
    Crawler.Sample.BootStrapper.Startup.Configure();
    _IArticlesService = IocContainer.Default.Resolve<IArticlesService>();

    // Start the logging component.
    log4net.Config.XmlConfigurator.Configure();

    // Start the index manager.
    IndexManager.Instance.Start();

    // URLs are taken from the source file. The original also contains a disabled
    // variant that reads them from the IE favourites instead.
    List<string> urlList = GetHtmlUrlLink(ReadFile(sourceFile));

    filter = new BloomFilter<string>(200000);

    foreach (var url in urlList)
    {
        // Skip blank entries before calling the service.
        if (string.IsNullOrEmpty(url))
        {
            continue;
        }

        // Seed only the URLs for which GetByUrl returns false.
        // The URL is added as-is: running it through string.Format would throw
        // FormatException for any URL that contains '{' or '}'.
        if (!_IArticlesService.GetByUrl(url))
        {
            Settings.SeedsAddress.Add(url);
        }
    }

    // Number of crawl threads.
    Settings.ThreadCount = 5;

    // Crawl depth.
    Settings.Depth = 1;

    // No link suffixes are excluded in this configuration.

    // Automatic rate limiting (original note: random 1-5 second interval) is off.
    Settings.AutoSpeedLimit = false;

    // Host locking is off. Per the original notes, when enabled the domains are
    // compared with the sub-domain removed, e.g. mail.pzcast.com and
    // www.pzcast.com count as the same site.
    Settings.LockHost = false;

    // UserAgent, Timeout (original note: default 15000 ms) and
    // RegularFilterExpressions are left untouched.

    var master = new CrawlMaster(Settings);
    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;
    master.Crawl();

    Console.ReadKey();
}