/// <summary>
/// Console entry point: parses the command line into <c>Options</c>, builds a
/// <c>CrawlerTask</c> from them, runs the crawl and saves the result to disk.
/// </summary>
/// <param name="args">Raw command-line arguments.</param>
static void Main(string[] args)
{
    var options = new Options();
    if (!Parser.Default.ParseArguments(args, options))
    {
        // Arguments could not be parsed: show usage examples and exit.
        System.Console.WriteLine("Simple use:");
        System.Console.WriteLine(@"Crawler.Console.exe -s http://ya.ru -p c:\1");
        System.Console.WriteLine("All property use:");
        System.Console.WriteLine(@"Crawler.Console.exe -s http://ya.ru -p c:\1 -r -i -d 3");
        return;
    }

    // Input check: reject a missing or malformed address up front instead of
    // letting the Uri constructor throw an unhandled exception.
    Uri startUri;
    if (!Uri.TryCreate(options.Uri, UriKind.Absolute, out startUri))
    {
        System.Console.WriteLine("Invalid site address: " + options.Uri);
        return;
    }

    var container = Configure();
    var crawler = container.Resolve<CrawlerEngine>();

    var settings = new CrawlerTaskSettings
    {
        CrawlDepth = options.Depth,
        IgnoreOtherDomains = options.IgnoreOtherDomains,
        ReplaceUrlToLocal = options.ReplaceToLocal
    };
    var crawlerTask = new CrawlerTask(startUri, options.Path, settings);

    // NOTE(review): if SavePageToDisk itself returns a Task, this continuation is a
    // Task<Task> and Wait() will not wait for the save to finish - confirm and Unwrap().
    crawler.ProcessCrawlerTask(crawlerTask)
        .ContinueWith(t => crawler.SavePageToDisk(t.Result, crawlerTask))
        .Wait();
}
/// <summary>
/// Initializes a restart trigger for the specified task.
/// </summary>
/// <remarks>
/// The tasks handler and the task are forwarded to the base constructor; the
/// internal counter starts at zero.
/// </remarks>
/// <param name="tasks">The tasks handler.</param>
/// <param name="task">The task.</param>
/// <param name="interval">The delay interval to a restart after a failure.</param>
/// <param name="maximum">The maximum number of restarts.</param>
public CrawlerTriggerTaskRestart(
    ICrawlerTasks tasks,
    CrawlerTask task,
    TimeSpan interval,
    uint maximum)
    : base(tasks, task)
{
    // Start counting from zero, then record the configured limits.
    this.count = 0;
    this.maximum = maximum;
    this.interval = interval;
}
/// <summary>
/// Checks that <paramref name="file"/> can be opened and, if so, invokes
/// <paramref name="task"/> on it.
/// </summary>
/// <param name="task">The delegate to invoke with the file name.</param>
/// <param name="file">The path of the file to probe and hand to the task.</param>
/// <returns>
/// <c>true</c> when the task was invoked, or when the file no longer exists
/// (there is nothing left to do for it); <c>false</c> when any other
/// <see cref="IOException"/> occurred while probing the file or running the task.
/// </returns>
private bool execute(CrawlerTask task, string file)
{
    try
    {
        // Probe only: File.Open throws if the file is missing or cannot be opened.
        // The stream is disposed right away so the handle is not leaked and the
        // file is not kept locked (this overload opens without sharing) while the
        // task runs.
        using (File.Open(file, FileMode.Open))
        {
        }

        task(file);
        return true;
    }
    catch (FileNotFoundException)
    {
        // The file is gone; report success so the caller does not retry it.
        return true;
    }
    catch (IOException)
    {
        // Any other I/O failure (e.g. a sharing violation): report failure.
        return false;
    }
}
/// <summary>
/// Tries to run <paramref name="task"/> for <paramref name="file"/> immediately and
/// queues it for a later retry when that is not possible.
/// </summary>
/// <remarks>
/// Entries already queued for the same file (case-insensitive comparison) are
/// attempted first:
/// <list type="bullet">
/// <item>If the same task is already queued for the file, only that entry is
/// attempted (and removed on success); no duplicate is added.</item>
/// <item>If a different task is queued for the file and still fails, the new task
/// is not run now and is queued instead.</item>
/// </list>
/// When the new task is queued, the timer is enabled if it is not already.
/// </remarks>
/// <param name="task">The task to run.</param>
/// <param name="file">The file the task operates on.</param>
private void enQueue(CrawlerTask task, String file)
{
    var run = true;

    // Scan a snapshot: entries are removed from the live queue during the scan, and
    // modifying a collection while enumerating it throws InvalidOperationException.
    var pending = new System.Collections.Generic.List<Tuple<CrawlerTask, string>>(queue);

    foreach (var tuple in pending)
    {
        if (String.Compare(tuple.Item2, file, true) != 0)
        {
            continue;
        }

        if (tuple.Item1 == task)
        {
            // Exact duplicate: retry the queued entry and never add a second copy.
            if (execute(tuple.Item1, tuple.Item2))
            {
                queue.Remove(tuple);
            }
            return;
        }

        // A different task is waiting on the same file: it goes first.
        if (execute(tuple.Item1, tuple.Item2))
        {
            queue.Remove(tuple);
        }
        else
        {
            run = false;
        }
    }

    // Nothing queued for this file is blocking: try the new task right away.
    if (run && execute(task, file))
    {
        return;
    }

    // Could not run now; keep it for a retry and make sure the timer is running.
    queue.Add(Tuple.Create(task, file));
    if (!timer.Enabled)
    {
        timer.Enabled = true;
    }
}
/// <summary>
/// Runs every task in <c>TaskList</c> in order: fetches the target HTML, extracts
/// the results and, for paged tasks, advances page by page until the end page.
/// </summary>
/// <remarks>
/// <para>
/// The method is <c>async void</c>, so callers cannot await it or observe its
/// completion; all exceptions are caught and logged inside the method.
/// </para>
/// <para>
/// An exception in one task is logged and the next task is started. An empty HTML
/// response or a result count of zero or less stops the whole run, including the
/// remaining tasks. NOTE(review): confirm that stopping all remaining tasks (rather
/// than only the current one) is intended.
/// </para>
/// <para>
/// A five second pause is awaited between pages and between tasks.
/// </para>
/// </remarks>
public async void Start()
{
    if (TaskList.Count <= 0)
    {
        return;
    }

    try
    {
        var watch = new Stopwatch();
        for (int i = 0; i < TaskList.Count; i++)
        {
            CrawlerTask task = TaskList[i];
            bool abortRemaining = false;

            try
            {
                string taskname = task.TaskName;

                // One iteration per page; non-paged tasks run exactly once.
                while (true)
                {
                    watch.Restart();
                    if (task.NeedNext)
                    {
                        taskname = task.TaskName + $"(分页{task.StartPage})";
                    }

                    string label = CrawlerType + "--" + taskname;
                    Console.WriteLine($"----------------------开始任务:{label}----------------------------");

                    string html = await WebCrawler.GetTargetHtmlString();
                    if (string.IsNullOrWhiteSpace(html))
                    {
                        Console.WriteLine($"---------------结束任务{label}:目标地址html获取失败---------------");
                        logger.Warn($"结束任务{label}:目标地址html获取失败");
                        abortRemaining = true;
                        break;
                    }

                    int count = await WebCrawler.GetResultContent(html);
                    watch.Stop();
                    if (count <= 0)
                    {
                        Console.WriteLine($"---------------结束任务{label}:没有匹配的数据---------------");
                        logger.Warn($"结束任务{label}:没有匹配的数据");
                        abortRemaining = true;
                        break;
                    }

                    var milliseconds = watch.ElapsedMilliseconds;
                    Console.WriteLine($"----------------------{label},共采集{count}条数据,用时:{milliseconds}----------------------------");
                    logger.Info($"{label}共采集{count}条数据,用时:{milliseconds}");

                    if (!(task.NeedNext && task.StartPage < task.EndPage))
                    {
                        // No further page to fetch for this task.
                        break;
                    }

                    // Pause without blocking the thread, then move to the next page.
                    await System.Threading.Tasks.Task.Delay(5000);
                    WebCrawler.GetNextPageUrl(++task.StartPage);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"{CrawlerType + "--" + task.TaskName} 异常结束任务 " + ex.ToString());
                logger.Error($"{CrawlerType + "--" + task.TaskName} 异常结束任务 " + ex.ToString());
            }

            if (abortRemaining)
            {
                break;
            }

            // Pause between tasks without blocking the thread.
            await System.Threading.Tasks.Task.Delay(5000);
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($"{CrawlerType} 异常结束任务" + ex.ToString());
        logger.Error(ex.ToString());
    }
}
/// <summary>
/// Initializes a trigger bound to a single task.
/// </summary>
/// <remarks>
/// The tasks handler is forwarded to the base constructor; the task is kept on
/// this instance.
/// </remarks>
/// <param name="tasks">The tasks handler.</param>
/// <param name="task">The task.</param>
public CrawlerTriggerTask(
    ICrawlerTasks tasks,
    CrawlerTask task)
    : base(tasks)
{
    // Remember the task this trigger belongs to.
    this.task = task;
}
/// <summary>
/// Initializes a stop trigger for the specified task.
/// </summary>
/// <remarks>
/// The tasks handler and the task are forwarded to the base constructor; the state
/// and the interval are kept on this instance.
/// </remarks>
/// <param name="tasks">The tasks handler.</param>
/// <param name="task">The task.</param>
/// <param name="state">
/// The running-task state stored by this trigger (presumably the state the trigger
/// reacts to - confirm against its usage).
/// </param>
/// <param name="interval">
/// The time interval stored by this trigger. NOTE(review): it was previously described
/// as "the delay interval to a restart after a failure", which looks copied from the
/// restart trigger - confirm its meaning here.
/// </param>
public CrawlerTriggerTaskStop(
    ICrawlerTasks tasks,
    CrawlerTask task,
    CrawlerTask.RunningTaskState state,
    TimeSpan interval)
    : base(tasks, task)
{
    this.interval = interval;
    this.state = state;
}