Example #1
0
        /// <summary>
        /// Program entry point: parses the command line, creates the root node and the database
        /// manager, starts the worker threads, and then redraws a status screen once per second.
        /// </summary>
        /// <remarks>
        /// The status loop has no exit condition; the workers are background threads, so they do
        /// not keep the process alive on their own.
        /// </remarks>
        /// <param name="args">Command-line arguments, parsed into <c>_opts</c>.</param>
        private static void Main(string[] args)
        {
            // Parse the command-line arguments; exit with code 1 when parsing fails.
            ICommandLineParser parser
                = new CommandLineParser(new CommandLineParserSettings(Console.Error));
            if (!parser.ParseArguments(args, _opts))
                Environment.Exit(1);

            _root_node = new node { url = _opts._url_start };
            _dbm = new db_mgr(_opts._db_path, _opts._db_cache);

            // Start the worker threads. Each one is registered in the status dictionary
            // before it is started.
            _thread_msg_dic = new Dictionary<Thread, string>(_opts._thread_cnt);
            for (int i = 0; i != _opts._thread_cnt; ++i)
            {
                Thread t = new Thread(t_work) { IsBackground = true, Name = "t" + i };
                lock (_thread_msg_dic)
                    _thread_msg_dic.Add(t, "started");
                t.Start();
            }

            Console.Title = _heading_info;
            while (true)
            {
                Console.Clear();
                Console.WriteLine(_opts.create_helptext(_heading_info));
                Console.WriteLine();

                // Process is IDisposable and a new instance is fetched on every refresh,
                // so release each snapshot instead of leaving one per second to the finalizer.
                using (Process proc_self = Process.GetCurrentProcess())
                {
                    Console.WriteLine("working time: {0}", (DateTime.Now - proc_self.StartTime));
                    Console.WriteLine("mem usage: {0} KB", (proc_self.WorkingSet64 / 1024).ToString("###,###"));
                }

                Console.WriteLine("res downloaded:{0}\tres stored:{1}", prog_stat.res_downloaded, prog_stat.res_stored);
                Console.WriteLine("threads:");

                // The dictionary is shared with the workers (crawl updates it under the same lock).
                lock (_thread_msg_dic)
                {
                    foreach (KeyValuePair<Thread, string> kv in _thread_msg_dic)
                        Console.WriteLine(kv.Key.Name + "::" + kv.Value);
                }
                Thread.Sleep(1000);
            }
        }
Example #2
0
        /// <summary>
        /// Fetches the resource behind a node, stores it, and builds and walks the URL tree below it.
        /// </summary>
        /// <remarks>
        /// A node is claimed by setting its <c>locked</c> flag under a monitor; a node that is
        /// already claimed is skipped. Children are only collected while the node's depth is
        /// below <c>_opts._crawl_depth</c>.
        /// </remarks>
        /// <param name="start_node">Node to start from.</param>
        private static void crawl(ref node start_node)
        {
            // Claim the node; give up if it has already been claimed.
            lock (start_node)
            {
                if (start_node.locked)
                    return;
                start_node.locked = true;
            }
            lock (_log)
                _log.InfoFormat("requesting\t{0}", start_node.url);

            {
                string res = get_res_text(start_node.url);

                if (res == null)
                {
                    // No text was obtained for this URL: mark the node as invalid.
                    start_node.valid = false;
                    return;
                }

                string title = null;
                Match m = _regex_title.Match(res); // content of the HTML <title>
                if (m.Success)
                    title = m.Groups[1].Value.Replace("\n", "").Replace("\r", "").Trim();

                // Write to the database.
                _dbm.write_to_db(start_node.url, title, res, DateTime.Now);
                // NOTE(review): this increment is not atomic; confirm whether exact counts
                // matter when several workers store resources at the same time.
                ++prog_stat.res_stored;

                // The crawl depth limit has been reached: do not add children.
                if (start_node.depth >= _opts._crawl_depth) return;

                // Collect the distinct href values of the page; they become this node's children.
                // Cast<Match>() replaces the former AsParallel()/"as" detour, which was bound to
                // the sequential LINQ operators anyway.
                IEnumerable<string> hrefs =
                    _regex_href.Matches(res).Cast<Match>().Select(f => f.Groups[1].Value).Distinct();
                foreach (string href in hrefs)
                {
                    string child_url = filter.filter_href(href, start_node.url);
                    if (child_url == null) continue;

                    // The duplicate check and the registration must be one atomic step;
                    // otherwise two workers can both pass isdup for the same URL and both Add.
                    lock (_visited_url)
                    {
                        if (isdup(child_url)) continue; // duplicate URL check
                        // NOTE(review): the key is the string hash code, so distinct URLs with
                        // equal hash codes share a key - confirm this is acceptable.
                        _visited_url.Add(child_url.GetHashCode(), null);
                    }

                    // Allocate the child only once the URL is known to be new; depth is parent + 1.
                    node child = new node { url = child_url, parent = start_node, depth = start_node.depth + 1 };
                    lock (start_node)
                        start_node.children.Add(child); // new children are unclaimed, so idle threads can pick them up right away
                }
            } // res goes out of scope here

            // Walk the children.
            for (int i = 0; i != start_node.children.Count; ++i)
            {
                node child = start_node.children[i];
                lock (_thread_msg_dic)
                    _thread_msg_dic[Thread.CurrentThread] = "d" + child.depth + "::" + child.url;
                crawl(ref child);
            }
        }
Example #3
0
 /// <summary>
 /// Searches the URL tree, depth-first, for a node whose <c>locked</c> flag is not set.
 /// </summary>
 /// <param name="root">Root of the tree (or subtree) to search.</param>
 /// <returns>The first unlocked node found, or <c>null</c> when there is none.</returns>
 private static node find_nolock_node(node root)
 {
     lock (root)
     {
         if (root.locked)
         {
             // root itself is taken: look through its subtrees in order.
             // The monitor on root stays held for the whole search.
             foreach (node candidate in root.children)
             {
                 node hit = find_nolock_node(candidate);
                 if (hit != null)
                     return hit;
             }
             return null;
         }

         return root;
     }
 }
Example #4
0
        /// <summary>
        /// Fetches the resource behind a node, stores it, and builds and walks the URL tree below it.
        /// </summary>
        /// <remarks>
        /// A node is claimed by setting its <c>locked</c> flag under a monitor; a node that is
        /// already claimed is skipped. Children are only collected while the node's depth is
        /// below <c>_opts._crawl_depth</c>.
        /// </remarks>
        /// <param name="start_node">Node to start from.</param>
        private static void crawl(ref node start_node)
        {
            // Claim the node; give up if it has already been claimed.
            lock (start_node)
            {
                if (start_node.locked)
                {
                    return;
                }
                start_node.locked = true;
            }
            lock (_log)
            {
                _log.InfoFormat("requesting\t{0}", start_node.url);
            }

            {
                string res = get_res_text(start_node.url);

                if (res == null)
                {
                    // No text was obtained for this URL: mark the node as invalid.
                    start_node.valid = false;
                    return;
                }

                string title = null;
                Match m = _regex_title.Match(res); // content of the HTML <title>
                if (m.Success)
                {
                    title = m.Groups[1].Value.Replace("\n", "").Replace("\r", "").Trim();
                }

                // Write to the database.
                _dbm.write_to_db(start_node.url, title, res, DateTime.Now);
                // NOTE(review): this increment is not atomic; confirm whether exact counts
                // matter when several workers store resources at the same time.
                ++prog_stat.res_stored;

                // The crawl depth limit has been reached: do not add children.
                if (start_node.depth >= _opts._crawl_depth)
                {
                    return;
                }

                // Collect the distinct href values of the page; they become this node's children.
                // Cast<Match>() replaces the former AsParallel()/"as" detour, which was bound to
                // the sequential LINQ operators anyway.
                IEnumerable<string> hrefs =
                    _regex_href.Matches(res).Cast<Match>().Select(f => f.Groups[1].Value).Distinct();
                foreach (string href in hrefs)
                {
                    string child_url = filter.filter_href(href, start_node.url);
                    if (child_url == null)
                    {
                        continue;
                    }

                    // The duplicate check and the registration must be one atomic step;
                    // otherwise two workers can both pass isdup for the same URL and both Add.
                    lock (_visited_url)
                    {
                        if (isdup(child_url))
                        {
                            continue; // duplicate URL check
                        }
                        // NOTE(review): the key is the string hash code, so distinct URLs with
                        // equal hash codes share a key - confirm this is acceptable.
                        _visited_url.Add(child_url.GetHashCode(), null);
                    }

                    // Allocate the child only once the URL is known to be new; depth is parent + 1.
                    node child = new node {
                        url = child_url, parent = start_node, depth = start_node.depth + 1
                    };
                    lock (start_node)
                    {
                        // New children are unclaimed, so idle threads can pick them up right away.
                        start_node.children.Add(child);
                    }
                }
            } // res goes out of scope here

            // Walk the children.
            for (int i = 0; i != start_node.children.Count; ++i)
            {
                node child = start_node.children[i];
                lock (_thread_msg_dic)
                {
                    _thread_msg_dic[Thread.CurrentThread] = "d" + child.depth + "::" + child.url;
                }
                crawl(ref child);
            }
        }