private static void Main(string[] args)
{
    // Parse command-line arguments.
    ICommandLineParser parser = new CommandLineParser(new CommandLineParserSettings(Console.Error));
    if (!parser.ParseArguments(args, _opts))
        Environment.Exit(1);

    _root_node = new node { url = _opts._url_start };
    _dbm = new db_mgr(_opts._db_path, _opts._db_cache);

    // Start the worker threads.
    _thread_msg_dic = new Dictionary<Thread, string>(_opts._thread_cnt);
    for (int i = 0; i != _opts._thread_cnt; ++i)
    {
        Thread t = new Thread(t_work) { IsBackground = true, Name = "t" + i };
        lock (_thread_msg_dic)
            _thread_msg_dic.Add(t, "started");
        t.Start();
    }

    // The main thread keeps refreshing a console status report.
    Console.Title = _heading_info;
    while (true)
    {
        Console.Clear();
        Console.WriteLine(_opts.create_helptext(_heading_info));
        Console.WriteLine();

        Process proc_self = Process.GetCurrentProcess();
        Console.WriteLine("working time: {0}", (DateTime.Now - proc_self.StartTime));
        Console.WriteLine("mem usage: {0} KB", (proc_self.WorkingSet64 / 1024).ToString("###,###"));
        Console.WriteLine("res downloaded:{0}\tres stored:{1}", prog_stat.res_downloaded, prog_stat.res_stored);

        Console.WriteLine("threads:");
        lock (_thread_msg_dic)
        {
            foreach (KeyValuePair<Thread, string> kv in _thread_msg_dic)
                Console.WriteLine(kv.Key.Name + "::" + kv.Value);
        }
        Thread.Sleep(1000);
    }
}
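// Main and the crawl code below rely on a few members that are not shown in this section
// (node, prog_stat, _opts, _dbm, and so on). The sketch below is only an assumption of what
// the url-tree element and the shared counters might look like, inferred from how they are
// used here; the real declarations in the project may differ.
internal class node
{
    public string url;
    public node parent;
    public int depth;
    public bool locked;                               // set by the first thread that claims the node
    public bool valid = true;                         // cleared when the url could not be fetched
    public List<node> children = new List<node>();    // filled in by crawl, traversed by the workers
}

internal static class prog_stat
{
    // Counters incremented by the worker threads and read by the status loop in Main.
    public static int res_downloaded;
    public static int res_stored;
}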
/// <summary>
/// Fetch the resource, then build and walk the url tree.
/// </summary>
/// <param name="start_node">starting node</param>
private static void crawl(ref node start_node)
{
    // Claim the node; if another thread already locked it, back off.
    lock (start_node)
    {
        if (start_node.locked)
            return;
        start_node.locked = true;
    }
    lock (_log)
        _log.InfoFormat("requesting\t{0}", start_node.url);

    {
        string res = get_res_text(start_node.url);
        if (res == null)
        {
            // The response text could not be obtained; mark the node invalid.
            start_node.valid = false;
            return;
        }

        // Extract the <title> content from the html.
        string title = null;
        Match m = _regex_title.Match(res);
        if (m.Success)
            title = m.Groups[1].Value.Replace("\n", "").Replace("\r", "").Trim();

        // Write the page to the database.
        _dbm.write_to_db(start_node.url, title, res, DateTime.Now);
        ++prog_stat.res_stored;

        // The crawl depth limit has been reached; do not add any more children.
        if (start_node.depth >= _opts._crawl_depth)
            return;

        // Match the hrefs in the html and turn those urls into this node's children.
        MatchCollection mc = _regex_href.Matches(res);
        IEnumerable<string> mc_str = mc.Cast<Match>().AsParallel()
            .Select(f => f.Groups[1].Value)
            .Distinct();
        foreach (string href in mc_str)
        {
            string child_url = filter.filter_href(href, start_node.url);
            if (child_url == null)
                continue;
            if (isdup(child_url))   // skip urls that were already visited
                continue;
            _visited_url.Add(child_url.GetHashCode(), null);

            node child = new node { url = child_url, parent = start_node, depth = start_node.depth + 1 };
            // New children start out unlocked, so idle threads can grab and crawl them immediately.
            lock (start_node)
                start_node.children.Add(child);
        }
    } // res goes out of scope here

    // Walk the children.
    for (int i = 0; i != start_node.children.Count; ++i)
    {
        node child = start_node.children[i];
        lock (_thread_msg_dic)
            _thread_msg_dic[Thread.CurrentThread] = "d" + child.depth + "::" + child.url;
        crawl(ref child);
    }
}
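// crawl uses isdup and _visited_url for duplicate-url detection; neither appears in this
// section. A minimal sketch consistent with that usage is given below: _visited_url is
// assumed to be a Hashtable (System.Collections) keyed by the url's hash code, and isdup
// simply checks for the key. Hash codes can collide, so distinct urls may occasionally be
// treated as duplicates; the project's real implementation may handle this differently
// (and may also need its own locking if several threads add keys concurrently).
private static readonly Hashtable _visited_url = new Hashtable();

private static bool isdup(string url)
{
    return _visited_url.ContainsKey(url.GetHashCode());
}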
/// <summary>
/// Find an unlocked url node.
/// </summary>
/// <param name="root">tree root</param>
/// <returns>the first unlocked node found, or null if every node in the subtree is locked</returns>
private static node find_nolock_node(node root)
{
    lock (root)
    {
        if (!root.locked)
            return root;
        return root.children.Count == 0
            ? null
            : root.children.Select(find_nolock_node).FirstOrDefault(f => f != null);
    }
}
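// Main starts its worker threads on t_work, which is not part of this section. A plausible
// sketch, assuming each worker repeatedly asks find_nolock_node for an unclaimed node and
// crawls it, is shown below; the real thread procedure may report status differently or use
// another back-off strategy when the whole tree is locked.
private static void t_work()
{
    while (true)
    {
        node n = find_nolock_node(_root_node);
        if (n == null)
        {
            // Every node is currently locked or finished; wait briefly and try again.
            lock (_thread_msg_dic)
                _thread_msg_dic[Thread.CurrentThread] = "idle";
            Thread.Sleep(100);
            continue;
        }
        crawl(ref n);
    }
}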