Exemple #1
0
        /// <summary>
        /// One-time component setup: registers the Ctrl+C handler, initializes the scheduler,
        /// supplies a default downloader and pipeline when none are configured, and hands the
        /// start requests to the scheduler.
        /// </summary>
        /// <remarks>
        /// The method is guarded by <c>_init</c>; once it has completed, later calls return immediately.
        /// </remarks>
        public void InitComponent()
        {
            if (_init)
            {
                return;
            }

            Console.CancelKeyPress += ConsoleCancelKeyPress;
            Scheduler.Init(this);

            // Fall back to the HTTP client downloader when the caller configured none.
            if (Downloader == null)
            {
                Downloader = new HttpClientDownloader();
            }
            Downloader.ThreadNum = ThreadNum;

            // Fall back to a file pipeline when the caller configured none.
            if (Pipelines.Count == 0)
            {
                Pipelines.Add(new FilePipeline());
            }

            if (StartRequests != null)
            {
                if (StartRequests.Count <= 0)
                {
                    Logger.Info("不需要添加网址到调度中心.", true);
                }
                else
                {
                    Logger.Info($"添加网址到调度中心,数量: {StartRequests.Count}");

                    var pushOptions = new ParallelOptions { MaxDegreeOfParallelism = 4 };
                    bool pushDirectly = Scheduler is QueueDuplicateRemovedScheduler || Scheduler is PriorityScheduler;

                    if (pushDirectly)
                    {
                        // These scheduler types receive every start request directly.
                        Parallel.ForEach(StartRequests, pushOptions, item =>
                        {
                            Scheduler.Push(item, this);
                        });
                    }
                    else
                    {
                        // Other schedulers are bulk-loaded: the requests are first collected in a
                        // temporary queue scheduler (presumably to drop duplicates - confirm), then
                        // loaded in one call, after which the start requests are cleared.
                        var staging = new QueueDuplicateRemovedScheduler();
                        Parallel.ForEach(StartRequests, pushOptions, item =>
                        {
                            staging.PushWithoutRedialManager(item, this);
                        });
                        Scheduler.Load(staging.ToList(this), this);
                        ClearStartRequests();
                    }
                }
            }

            _init = true;
        }
Exemple #2
0
    /// <summary>
    /// Configures the Wyam engine for the documentation site: applies the Docs recipe and the
    /// "Samson" theme, registers the assemblies to document, sets the site settings, and adds
    /// the pipelines that read package descriptions and render one page per package category.
    /// </summary>
    /// <param name="engine">Engine passed to the base configuration and to the configurator.</param>
    /// <param name="build">Build whose <c>PackageSpecs</c> list the assembly globs to document.</param>
    public WyamConfiguration(Engine engine, Build build) : base(engine)
    {
        var configurator = new Configurator(engine);

        configurator.Recipe = new Wyam.Docs.Docs();
        configurator.Theme  = "Samson";
        configurator.Configure("");
        configurator.AssemblyLoader.DirectAssemblies.Add(typeof(HtmlKeys).Assembly);
        configurator.AssemblyLoader.DirectAssemblies.Add(typeof(WebKeys).Assembly);
        configurator.AssemblyLoader.DirectAssemblies.Add(typeof(FeedKeys).Assembly);
        configurator.AssemblyLoader.DirectAssemblies.Add(typeof(CodeAnalysisKeys).Assembly);

        // Expand every package's assembly globs under the temporary "_packages" directory and
        // express the matches relative to the "input" directory.
        // NOTE(review): this is a deferred query, so the globbing runs whenever the setting is enumerated.
        var assemblyFiles = build.PackageSpecs
                            .SelectMany(x => x.Assemblies)
                            .SelectMany(x => GlobFiles(NukeBuild.TemporaryDirectory / "_packages", x.TrimStart('/', '\\')))
                            .Distinct()
                            .Select(x => GetRelativePath(NukeBuild.RootDirectory / "input", x));

        // Logger.Info(string.Join(", ", assemblyFiles));

        Settings[DocsKeys.AssemblyFiles] = assemblyFiles;
        // Settings[DocsKeys.SolutionFiles] = GlobFiles(NukeBuild.TemporaryDirectory, "**/*.sln")
        //     .Select(x => GetRelativePath(NukeBuild.RootDirectory / "input", x));

        Settings[DocsKeys.Title]     = "Rocket Surgeons Guild";
        Settings[Keys.Host]          = "rocketsurgeonsguild.github.io/";
        Settings[Keys.LinksUseHttps] = true;
        // Settings[DocsKeys.SourceFiles] = GetRelativePath(NukeBuild.RootDirectory / "input", NukeBuild.TemporaryDirectory).TrimEnd('/') + "/*/src/**/{!bin,!obj,!packages,!*.Tests,}/**/*.cs";
        Settings[DocsKeys.IncludeDateInPostPath] = true;
        Settings[DocsKeys.BaseEditUrl]           = "https://github.com/RocketSurgeonsGuild/rocketsurgeonsguild.github.io/blob/dev/input/";

        // "Package": read the YAML package descriptions from <root>/packages.
        Pipelines.InsertBefore(Docs.Code, "Package",
                               new ReadFiles(NukeBuild.RootDirectory.ToString() + "/packages/*.yml"),
                               new Yaml()
                               );

        // "PackageCategories": one document per category, written to packages/<slug>/index.html.
        // The slug is lower-cased with the invariant culture so the output path does not depend
        // on the culture of the machine running the build (e.g. the Turkish dotless i).
        Pipelines.InsertAfter("Package", "PackageCategories",
                              new GroupByMany((doc, _) => doc.List <string>("Categories"),
                                              new Documents("Package")
                                              ),
                              new Meta(Keys.WritePath, (doc, _) => new FilePath("packages/" + doc.String(Keys.GroupKey).ToLowerInvariant().Replace(" ", "-") + "/index.html")),
                              new Meta(Keys.RelativeFilePath, (ctx, _) => ctx.FilePath(Keys.WritePath)),
                              new OrderBy((ctx, _) => ctx.String(Keys.GroupKey))
                              );

        // "RenderPackage": render each category document with the package layout and write it out.
        Pipelines.Add("RenderPackage",
                      new Documents("PackageCategories"),
                      new Razor().WithLayout("/_PackageLayout.cshtml"),
                      new WriteFiles()
                      );
    }
Exemple #3
0
        /// <summary>
        /// One-time component setup: registers the Ctrl+C handler, supplies a default downloader
        /// and pipeline when none are configured, initializes every pipeline, and hands the start
        /// requests to the scheduler.
        /// </summary>
        /// <remarks>
        /// The method is guarded by <c>_init</c>; once it has completed, later calls return immediately.
        /// </remarks>
        public void InitComponent()
        {
            if (_init)
            {
                return;
            }

            Console.CancelKeyPress += ConsoleCancelKeyPress;

            // Fall back to the HTTP client downloader when the caller configured none.
            if (Downloader == null)
            {
                Downloader = new HttpClientDownloader();
            }

            // Fall back to a file pipeline when the caller configured none.
            if (Pipelines.Count == 0)
            {
                Pipelines.Add(new FilePipeline());
            }

            foreach (var configuredPipeline in Pipelines)
            {
                configuredPipeline.InitPipeline(this);
            }

            bool nothingToSchedule = StartRequests == null || StartRequests.Count <= 0;
            if (nothingToSchedule)
            {
                Logger.Info("添加链接到调度中心, 数量: 0.");
            }
            else
            {
                Logger.Info($"添加链接到调度中心, 数量: {StartRequests.Count}.");

                bool pushDirectly = Scheduler is QueueDuplicateRemovedScheduler || Scheduler is PriorityScheduler;
                if (pushDirectly)
                {
                    // These scheduler types receive every start request directly.
                    var pushOptions = new ParallelOptions { MaxDegreeOfParallelism = 4 };
                    Parallel.ForEach(StartRequests, pushOptions, item =>
                    {
                        Scheduler.Push(item);
                    });
                }
                else
                {
                    // Other schedulers are bulk-loaded from a set of the start requests,
                    // after which the start requests are cleared.
                    Scheduler.Load(new HashSet<Request>(StartRequests));
                    ClearStartRequests();
                }
            }

            _init = true;
        }
Exemple #4
0
        /// <summary>
        /// Prepares the spider before a run: initializes the scheduler, supplies a default
        /// downloader, pipeline and thread pool where needed, pushes a clone of every start
        /// request to the scheduler, and (once only) registers console handling.
        /// </summary>
        protected void InitComponent()
        {
            Scheduler.Init(this);

            // Fall back to the HTTP client downloader when the caller configured none.
            if (Downloader == null)
            {
                Downloader = new HttpClientDownloader();
            }
            Downloader.SetThreadNum(ThreadNum);

            // Fall back to a file pipeline when the caller configured none.
            if (Pipelines.Count == 0)
            {
                Pipelines.Add(new FilePipeline());
            }

            // Create a new pool when there is none or the existing one was shut down.
            if (ThreadPool == null || ThreadPool.IsShutdown)
            {
                ThreadPool = new CountableThreadPool(ThreadNum);
            }

            if (StartRequests != null)
            {
                var pushOptions = new ParallelOptions { MaxDegreeOfParallelism = 100 };
                Parallel.ForEach(StartRequests, pushOptions, item =>
                {
                    Scheduler.Push((Request)item.Clone(), this);
                });

                ClearStartRequests();
                Logger.InfoFormat("Push Request to Scheduler success.");
            }

            // The console registration below must happen only once.
            if (_registConsoleCtrlHandler)
            {
                return;
            }

            Console.Title           = Identify;
            Console.CancelKeyPress += Console_CancelKeyPress;
            _registConsoleCtrlHandler = true;

            // Locate the console window by the title that was just set.
            int consoleWindow = FindWindow(null, Identify);
            // Get the window's system menu, which holds the close command.
            IntPtr systemMenu = GetSystemMenu((IntPtr)consoleWindow, IntPtr.Zero);
            // SC_CLOSE command identifier.
            int closeCommand = 0xF060;
            // Remove the close command so the close button is disabled.
            RemoveMenu(systemMenu, closeCommand, 0x0);
        }
        /// <summary>
        /// Makes sure at least a default pipeline is present, registers the entity definition of
        /// every entity processor with every entity pipeline, and then initializes all pipelines.
        /// </summary>
        /// <param name="arguments">
        /// Run arguments. When they contain <c>"skip"</c>, the entity processor/pipeline wiring is skipped.
        /// </param>
        /// <exception cref="SpiderException">
        /// Thrown when no page processor is configured, or when entity processors exist but no
        /// entity pipeline does.
        /// </exception>
        protected override void InitPipelines(params string[] arguments)
        {
            // NOTE(review): when Pipelines itself is null the Add below (and the final loop)
            // still dereference it - confirm whether Pipelines can ever be null here.
            if (Pipelines == null || Pipelines.Count == 0)
            {
                var defaultPipeline = GetDefaultPipeline();
                if (defaultPipeline != null)
                {
                    Pipelines.Add(defaultPipeline);
                }
            }

            // Validate before the first use of PageProcessors: previously this check ran after
            // the collection had already been enumerated, so a null collection failed with an
            // ArgumentNullException instead of this SpiderException.
            if (PageProcessors == null || PageProcessors.Count == 0)
            {
                throw new SpiderException("Count of PageProcessor is zero.");
            }

            if (!arguments.Contains("skip"))
            {
                var entityProcessors = PageProcessors.OfType<IEntityProcessor>().ToList();
                var entityPipelines  = Pipelines.OfType<BaseEntityPipeline>().ToList();

                if (entityProcessors.Count != 0 && entityPipelines.Count == 0)
                {
                    throw new SpiderException("You may miss a entity pipeline.");
                }

                // Every entity pipeline gets the entity definition of every entity processor.
                foreach (var entityProcessor in entityProcessors)
                {
                    foreach (var entityPipeline in entityPipelines)
                    {
                        entityPipeline.AddEntity(entityProcessor.EntityDefine);
                    }
                }
            }

            foreach (var pipeline in Pipelines)
            {
                pipeline.InitPipeline(this);
            }
        }
Exemple #6
0
        /// <summary>
        /// Synchronously crawls the given URLs and returns everything collected by the collector pipeline.
        /// </summary>
        /// <remarks>
        /// <c>DestroyWhenExit</c> and <c>SpawnUrl</c> are switched off for the duration of the run
        /// and set back to <c>true</c> afterwards, even when the run throws.
        /// The collector pipeline stays registered in <c>Pipelines</c> after the call.
        /// </remarks>
        /// <typeparam name="T">Type each collected item is cast to.</typeparam>
        /// <param name="urls">URLs to convert into requests and download.</param>
        /// <returns>The collected items, cast to <typeparamref name="T"/>.</returns>
        public IList<T> GetAll<T>(params string[] urls)
        {
            DestroyWhenExit = false;
            SpawnUrl        = false;

            ICollectorPipeline collectorPipeline;
            try
            {
                foreach (Request request in UrlUtils.ConvertToRequests(urls))
                {
                    AddRequest(request);
                }

                collectorPipeline = GetCollectorPipeline();
                Pipelines.Add(collectorPipeline);
                Run();
            }
            finally
            {
                // Set both flags back even if AddRequest/Run failed, so an exception does not
                // leave the spider stuck with them switched off.
                SpawnUrl        = true;
                DestroyWhenExit = true;
            }

            ICollection collection = collectorPipeline.GetCollected();

            return collection.Cast<T>().ToList();
        }
		/// <summary>
		/// Appends a pipeline configuration to <c>Pipelines</c>.
		/// </summary>
		/// <param name="pipeline">Pipeline configuration to add.</param>
		/// <returns>This context, so calls can be chained.</returns>
		public SpiderContext AddPipeline(Configuration.Pipeline pipeline)
		{
			Pipelines.Add(pipeline);
			return this;
		}
Exemple #8
0
 /// <summary>
 /// Registers an additional pipeline on this spider.
 /// </summary>
 /// <remarks>
 /// <c>CheckIfRunning()</c> is invoked first, before the pipeline is added.
 /// </remarks>
 /// <param name="pipeline">Pipeline to append to <c>Pipelines</c>.</param>
 /// <returns>This spider, so calls can be chained.</returns>
 public virtual Spider AddPipeline(IPipeline pipeline)
 {
     CheckIfRunning();

     Pipelines.Add(pipeline);

     return this;
 }
 /// <summary>
 /// Appends a pipeline to <c>Pipelines</c>.
 /// </summary>
 /// <param name="pipeline">Pipeline to add.</param>
 public void Add(Pipeline pipeline) => Pipelines.Add(pipeline);
Exemple #10
0
        /// <summary>
        /// One-time component setup: registers the Ctrl+C handler, initializes the scheduler,
        /// supplies a default pipeline and thread pool when none are configured, pushes a clone
        /// of every start request to the scheduler, and starts a background task that prints
        /// scheduler progress to the console.
        /// </summary>
        /// <remarks>
        /// The method is guarded by <c>_init</c>; once it has completed, later calls only log and return.
        /// </remarks>
        public void InitComponent()
        {
            if (_init)
            {
#if NET_CORE
                Logger.Info("Component already init.", true);
#else
                Logger.Info("Component already init.");
#endif

                return;
            }

            Console.CancelKeyPress += ConsoleCancelKeyPress;

            Scheduler.Init(this);

            // NOTE(review): the default-downloader fallback is disabled in this version
            // (it used to assign a new HttpClientDownloader), so Downloader must be set by
            // the caller; the next line fails with a NullReferenceException otherwise.
            Downloader.ThreadNum = ThreadNum;

            if (Pipelines.Count == 0)
            {
                Pipelines.Add(new FilePipeline());
            }
            if (ThreadPool == null)
            {
                ThreadPool = new CountableThreadPool(ThreadNum);
            }

            if (StartRequests != null)
            {
                if (StartRequests.Count > 0)
                {
                    Parallel.ForEach(StartRequests, new ParallelOptions()
                    {
                        MaxDegreeOfParallelism = 100
                    }, request =>
                    {
                        Scheduler.Push((Request)request.Clone(), this);
                    });

                    ClearStartRequests();

#if NET_CORE
                    Logger.Info("Push Request to Scheduler success.", true);
#else
                    Logger.Info("Push Request to Scheduler success.");
#endif
                }
                else
                {
                    // Both build flavours now report the empty case with the same message; the
                    // non-NET_CORE branch previously logged the "success" text by mistake.
#if NET_CORE
                    Logger.Info("Push Zero Request to Scheduler.", true);
#else
                    Logger.Info("Push Zero Request to Scheduler.");
#endif
                }
            }

            // Background status reporter: every two seconds, while the spider is running and not
            // waiting to exit, print the scheduler's request counts and the thread pool usage.
            // NOTE(review): the loop has no exit condition, so the task lives until the process ends.
            Task.Factory.StartNew(() =>
            {
                if (!ShowConsoleStatus)
                {
                    return;
                }

                IMonitorableScheduler monitor = Scheduler as IMonitorableScheduler;
                if (monitor == null)
                {
                    return;
                }

                while (true)
                {
                    try
                    {
                        if (Stat == Status.Running && !_waitingToExit)
                        {
                            Console.WriteLine(
                                $"Left: {monitor.GetLeftRequestsCount(this)} Total: {monitor.GetTotalRequestsCount(this)} AliveThread: {ThreadPool.ThreadAlive} ThreadNum: {ThreadPool.ThreadNum}");
                        }
                    }
                    catch
                    {
                        // Deliberately ignored: a failed status print must not stop the reporter.
                    }
                    Thread.Sleep(2000);
                }
            });

            _init = true;
        }
        /// <summary>
        /// Builds the entity processor and entity pipelines, prepares the start requests under a
        /// lock taken through <c>Db</c> (when <c>Db</c> is set), runs the spider, and finally
        /// reports completion and verifies the collected data.
        /// </summary>
        /// <param name="arguments">
        /// Run arguments. <c>"rerun"</c> re-initializes and clears the scheduler and forces the
        /// start requests to be prepared again; <c>"running-test"</c> skips the actual crawl.
        /// </param>
        public override void Run(params string[] arguments)
        {
            InitEnvorimentAndVerify();

            try
            {
#if !NET_CORE
                // Without a cookie from the interceptor the spider cannot continue; returning
                // here still runs the finally block below.
                if (CookieInterceptor != null)
                {
                    this.Log("尝试获取 Cookie...", LogLevel.Info);
                    var cookie = CookieInterceptor.GetCookie();
                    if (cookie == null)
                    {
                        this.Log("获取 Cookie 失败, 爬虫无法继续.", LogLevel.Warn);
                        return;
                    }
                    else
                    {
                        Site.CookiesStringPart = cookie.CookiesStringPart;
                        Site.Cookies           = cookie.CookiesDictionary;
                    }
                }
#endif

                this.Log("创建爬虫...", LogLevel.Info);
                EntityProcessor processor = new EntityProcessor(this);

                // A single entity processor handles every configured entity.
                foreach (var entity in Entities)
                {
                    processor.AddEntity(entity);
                }
                PageProcessor = processor;
                // For each entity, clone every configured entity pipeline, initialize the clone
                // for that entity, and group the enabled clones into one EntityPipeline.
                foreach (var entity in Entities)
                {
                    string entiyName = entity.Entity.Name;
                    var    pipelines = new List <BaseEntityPipeline>();
                    foreach (var pipeline in EntityPipelines)
                    {
                        var newPipeline = pipeline.Clone();
                        newPipeline.InitiEntity(entity);
                        if (newPipeline.IsEnabled)
                        {
                            pipelines.Add(newPipeline);
                        }
                    }
                    if (pipelines.Count > 0)
                    {
                        Pipelines.Add(new EntityPipeline(entiyName, pipelines));
                    }
                }

                CheckIfSettingsCorrect();

                bool   needInitStartRequest = true;
                string key = "locker-" + Identity;
                if (Db != null)
                {
                    // Poll once per second until the lock for this identity is taken.
                    // NOTE(review): the 10-minute TimeSpan is presumably the lock expiry - confirm.
                    while (!Db.LockTake(key, "0", TimeSpan.FromMinutes(10)))
                    {
                        Thread.Sleep(1000);
                    }
                    // Start requests are only prepared when the stored status is not "init finished".
                    var lockerValue = Db.HashGet(InitStatusSetName, Identity);
                    needInitStartRequest = lockerValue != "init finished";
                }

                if (arguments.Contains("rerun"))
                {
                    Scheduler.Init(this);
                    Scheduler.Clear();
                    // Delete the verify record.
                    Db?.HashDelete(ValidateStatusName, Identity);
                    needInitStartRequest = true;
                }

                this.Log("构建内部模块、准备爬虫数据...", LogLevel.Info);
                InitComponent();

                if (needInitStartRequest)
                {
                    if (PrepareStartUrls != null)
                    {
                        for (int i = 0; i < PrepareStartUrls.Length; ++i)
                        {
                            var prepareStartUrl = PrepareStartUrls[i];
                            this.Log($"[步骤 {i + 2}] 添加链接到调度中心.", LogLevel.Info);
                            prepareStartUrl.Build(this, null);
                        }
                    }
                }

                MonitorCenter.Register(this);

                // NOTE(review): the release is not in the finally block, so an exception thrown
                // after LockTake and before this line leaves the lock held - confirm that is
                // acceptable. The lock was taken with the value "0" and is released with 0.
                Db?.LockRelease(key, 0);

                RegisterControl(this);

                // "running-test" skips the crawl and only marks the spider as exited.
                if (!arguments.Contains("running-test"))
                {
                    base.Run();
                }
                else
                {
                    IsExited = true;
                }

                TaskFinished();

                HandleVerifyCollectData();
            }
            finally
            {
                Dispose();
                MonitorCenter.Dispose();
            }
        }