Example #1
0
        /// <summary>
        /// Console entry point: parses the command line, builds a <c>CrawlerTask</c>
        /// from the parsed options, runs it through the resolved <c>CrawlerEngine</c>
        /// and saves the result to disk.
        /// </summary>
        /// <param name="args">Command-line arguments, parsed into an <c>Options</c> instance.</param>
        static void Main(string[] args)
        {
            var options = new Options();

            if (!Parser.Default.ParseArguments(args, options))
            {
                // Parsing failed: show usage examples and exit.
                System.Console.WriteLine("Simple use:");
                System.Console.WriteLine(@"Crawler.Console.exe -s http://ya.ru -p c:\1");
                System.Console.WriteLine("All property use:");
                System.Console.WriteLine(@"Crawler.Console.exe -s http://ya.ru -p c:\1 -r -i -d 3");
                return;
            }

            // Reject anything that is not an absolute URI up front instead of
            // letting the Uri constructor throw an unhandled exception.
            Uri startUri;
            if (!Uri.TryCreate(options.Uri, UriKind.Absolute, out startUri))
            {
                System.Console.WriteLine("Invalid start URL: " + options.Uri);
                return;
            }
            //TODO remaining input checks (path, depth)

            var container = Configure();
            var crawler   = container.Resolve <CrawlerEngine>();

            var settings = new CrawlerTaskSettings
            {
                CrawlDepth         = options.Depth,
                IgnoreOtherDomains = options.IgnoreOtherDomains,
                ReplaceUrlToLocal  = options.ReplaceToLocal
            };
            var crawlerTask = new CrawlerTask(startUri, options.Path, settings);

            // Block until crawling and the save continuation have finished so the
            // process does not exit early.
            // NOTE(review): if SavePageToDisk itself returns a Task, Wait() only
            // observes the outer continuation and not the save - confirm.
            crawler.ProcessCrawlerTask(crawlerTask)
            .ContinueWith(t => crawler.SavePageToDisk(t.Result, crawlerTask))
            .Wait();
        }
 /// <summary>
 /// Initializes a restart trigger for the given task. The tasks handler and the
 /// task are forwarded to the base trigger; the internal count starts at zero.
 /// </summary>
 /// <param name="tasks">The tasks handler.</param>
 /// <param name="task">The task.</param>
 /// <param name="interval">The delay interval to a restart after a failure.</param>
 /// <param name="maximum">The maximum number of restarts.</param>
 public CrawlerTriggerTaskRestart(ICrawlerTasks tasks, CrawlerTask task, TimeSpan interval, uint maximum)
     : base(tasks, task)
 {
     // Nothing has been counted yet for a freshly created trigger.
     this.count = 0;

     // Remember the restart limits supplied by the caller.
     this.maximum = maximum;
     this.interval = interval;
 }
Example #3
0
 /// <summary>
 /// Checks that <paramref name="file"/> can be opened and, if so, invokes
 /// <paramref name="task"/> on it.
 /// </summary>
 /// <param name="task">The task to invoke with the file path.</param>
 /// <param name="file">The path of the file to probe and pass to the task.</param>
 /// <returns>
 /// <c>true</c> when the task was invoked, or when the file no longer exists;
 /// <c>false</c> when any other <see cref="IOException"/> was thrown while
 /// opening the file or running the task.
 /// </returns>
 private bool execute(CrawlerTask task, string file)
 {
     try
     {
         // Probe only: open the file to see whether it is accessible, then
         // dispose the stream right away so the handle is neither leaked nor
         // held open while the task runs.
         using (File.Open(file, FileMode.Open))
         {
         }

         task(file);
         return true;
     }
     catch (FileNotFoundException)
     {
         // The file is gone; report success so the caller stops retrying.
         return true;
     }
     catch (IOException)
     {
         // Any other I/O failure: report failure so the caller can retry.
         return false;
     }
 }
Example #4
0
 /// <summary>
 /// Tries to run <paramref name="task"/> for <paramref name="file"/>, retrying any
 /// entries already queued for the same file first, and queues the task when it
 /// cannot be run yet.
 /// </summary>
 /// <remarks>
 /// Entries for the same file (compared case-insensitively) are handled in queue order:
 /// <list type="bullet">
 /// <item>If the same task is already queued for the file, that entry is retried
 /// (and removed on success) and nothing new is queued.</item>
 /// <item>Entries with a different task are retried first; the first one that
 /// still fails blocks the new task from running ahead of it.</item>
 /// </list>
 /// If nothing blocks, the new task is run immediately; when it fails or is
 /// blocked it is added to the queue once and the timer is enabled.
 /// </remarks>
 /// <param name="task">The task to run or queue.</param>
 /// <param name="file">The file the task applies to.</param>
 private void enQueue(CrawlerTask task, String file)
 {
     // Iterate over a snapshot: entries are removed from the queue while scanning,
     // which would invalidate an enumerator over the live collection.
     var pending = new Tuple <CrawlerTask, string>[queue.Count];
     queue.CopyTo(pending, 0);

     var blocked = false;
     foreach (var entry in pending)
     {
         if (!String.Equals(entry.Item2, file, StringComparison.OrdinalIgnoreCase))
         {
             continue;
         }

         if (entry.Item1 == task)
         {
             // Exact duplicate: retry the queued entry instead of queueing it twice.
             // Skip the retry when an earlier entry for this file is still blocked.
             if (!blocked && execute(entry.Item1, entry.Item2))
             {
                 queue.Remove(entry);
             }
             return;
         }

         if (blocked)
         {
             // An earlier entry for this file failed; do not run later ones ahead of it.
             continue;
         }

         if (execute(entry.Item1, entry.Item2))
         {
             queue.Remove(entry);
         }
         else
         {
             blocked = true;
         }
     }

     // NOTE(review): this also runs the task immediately when the queue is empty,
     // where the previous code only queued it - confirm that is acceptable.
     if (!blocked && execute(task, file))
     {
         return;
     }

     // Could not run now: queue exactly once and make sure the timer is running.
     queue.Add(Tuple.Create(task, file));
     if (!timer.Enabled)
     {
         timer.Enabled = true;
     }
 }
Example #5
0
        /// <summary>
        /// Runs the tasks in <c>TaskList</c> one after another: fetches the target
        /// HTML, extracts the results, logs the outcome and, for tasks with
        /// <c>NeedNext</c> set, advances page by page until <c>EndPage</c> is reached.
        /// </summary>
        /// <remarks>
        /// Exceptions are caught and logged here rather than propagated. Pauses
        /// between pages and between tasks use an awaited delay so no thread is
        /// blocked while waiting.
        /// </remarks>
        public async void Start()
        {
            // Pause between pages and between tasks, in milliseconds.
            const int pauseMs = 5000;

            if (TaskList.Count <= 0)
            {
                return;
            }

            try
            {
                var watch = new Stopwatch();
                for (int i = 0; i < TaskList.Count; i++)
                {
                    CrawlerTask task = TaskList[i];
                    try
                    {
                        string taskname = task.TaskName;

                        // Re-entered via goto for each further page of a paged task.
                        restart : watch.Restart();
                        if (task.NeedNext)
                        {
                            taskname = task.TaskName + $"(分页{task.StartPage})";
                        }

                        Console.WriteLine($"----------------------开始任务:{CrawlerType + "--" + taskname}----------------------------");
                        string html = await WebCrawler.GetTargetHtmlString();

                        if (!string.IsNullOrWhiteSpace(html))
                        {
                            int count = await WebCrawler.GetResultContent(html);

                            watch.Stop();
                            if (count <= 0)
                            {
                                Console.WriteLine($"---------------结束任务{CrawlerType + "--" + taskname}:没有匹配的数据---------------");
                                logger.Warn($"结束任务{CrawlerType + "--" + taskname}:没有匹配的数据");
                                // NOTE(review): this break leaves the whole task loop, so the
                                // remaining tasks are skipped - confirm that is intended.
                                break;
                            }
                            var milliseconds = watch.ElapsedMilliseconds;
                            Console.WriteLine($"----------------------{CrawlerType + "--" + taskname},共采集{count}条数据,用时:{milliseconds}----------------------------");
                            logger.Info($"{CrawlerType + "--" + taskname}共采集{count}条数据,用时:{milliseconds}");
                            if (task.NeedNext && task.StartPage < task.EndPage)
                            {
                                // Wait before requesting the next page.
                                await System.Threading.Tasks.Task.Delay(pauseMs);
                                WebCrawler.GetNextPageUrl(++task.StartPage);
                                goto restart;
                            }
                        }
                        else
                        {
                            Console.WriteLine($"---------------结束任务{CrawlerType + "--" + taskname}:目标地址html获取失败---------------");
                            logger.Warn($"结束任务{CrawlerType + "--" + taskname}:目标地址html获取失败");
                            // NOTE(review): same as above - this ends the whole task loop.
                            break;
                        }
                    }
                    catch (Exception ex) {
                        // A failing task is logged and the loop moves on to the next one.
                        Console.WriteLine($"{CrawlerType + "--" + task.TaskName} 异常结束任务   " + ex.ToString());
                        logger.Error($"{CrawlerType + "--" + task.TaskName} 异常结束任务   " + ex.ToString());
                    }

                    // Wait before starting the next task.
                    await System.Threading.Tasks.Task.Delay(pauseMs);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"{CrawlerType} 异常结束任务" + ex.ToString());
                logger.Error(ex.ToString());
            }
        }
 /// <summary>
 /// Initializes a trigger bound to a single task. The tasks handler is passed
 /// on to the base trigger and the task is kept on this instance.
 /// </summary>
 /// <param name="tasks">The tasks handler.</param>
 /// <param name="task">The task.</param>
 public CrawlerTriggerTask(ICrawlerTasks tasks, CrawlerTask task)
     : base(tasks)
 {
     // Keep a reference to the task this trigger belongs to.
     this.task = task;
 }
 /// <summary>
 /// Initializes a stop trigger for the given task. The tasks handler and the
 /// task are forwarded to the base trigger; the state and interval are stored
 /// on this instance.
 /// </summary>
 /// <param name="tasks">The tasks handler.</param>
 /// <param name="task">The task.</param>
 /// <param name="state">The running task state kept by this trigger.</param>
 /// <param name="interval">The time interval kept by this trigger.</param>
 public CrawlerTriggerTaskStop(ICrawlerTasks tasks, CrawlerTask task, CrawlerTask.RunningTaskState state, TimeSpan interval)
     : base(tasks, task)
 {
     // NOTE(review): the interval was previously documented as a restart delay,
     // which looks copied from the restart trigger - confirm its meaning here.
     this.interval = interval;
     this.state = state;
 }