예제 #1
0
        /// <summary>
        /// Builds a spider that fetches data from a single website.
        /// Usage examples are available on the project's Github page.
        /// </summary>
        /// <param name="spiderName">Unique name of this spider; a folder with this name will be created</param>
        /// <param name="baseUri">Base Uri of the website. Pages outside this Host will not be fetched</param>
        /// <param name="params">Optional extra initialization parameters (may be null)</param>
        public SimpleSpider(string spiderName, Uri baseUri, InitializationParams @params = null)
        {
            SpiderName = spiderName;
            BaseUri    = baseUri;

            // Caller-supplied components; they may still be null here, defaults are applied further down
            Cacher     = @params?.Cacher;
            Downloader = @params?.Downloader;

            // Configuration must exist before initializeConfiguration fills in directories and logger
            Configuration = @params?.ConfigurationPrototype ?? new Configuration();
            initializeConfiguration(spiderName, @params);

            // When automatic anchor collection is enabled, make sure a link collector is available
            LinkCollector = @params?.LinkCollector;
            if (Configuration.Auto_AnchorsLinks && LinkCollector == null)
            {
                LinkCollector = new LinkProcessors.SimpleProcessor();
            }

            initializeQueues();

            // Fall back to the built-in cacher / downloader when none was supplied
            if (Cacher == null)
            {
                Cacher = new ContentCacher();
            }
            if (Downloader == null)
            {
                Downloader = new HttpClientDownloader();
            }

            initializeFetchers();
            FetchCompleted += fetchCompleted_AutoCollect;
            FetchRewrite   += fetchRewrite_AutoRewrite;

            // Always start with an empty parser list, then append any caller-supplied parsers
            Parsers = new List <IParserBase>();
            var extraParsers = @params?.Parsers;
            if (extraParsers != null)
            {
                Parsers.AddRange(extraParsers);
            }

            // A storage engine is optional; when present it is initialized with the final configuration
            var engine = @params?.StorageEngine;
            if (engine != null)
            {
                Storage = engine;
                Storage.Initialize(Configuration);
            }

            logInitialStatus();
        }
예제 #2
0
        /// <summary>
        /// Prepares the spider's working folders, loads (or creates) its persisted work data
        /// and, when the configuration has no logger yet, sets up a default Serilog logger
        /// writing to the console and to a daily-rolling log file.
        /// </summary>
        /// <param name="spiderName">Spider name; used as the spider folder name and log file name</param>
        /// <param name="init">Optional initialization parameters; may be null</param>
        private void initializeConfiguration(string spiderName, InitializationParams init)
        {
            var dir = init?.SpiderDirectory;

            if (dir == null)
            {
                // GetEntryAssembly() can return null (e.g. unmanaged host) and Location can be
                // empty (e.g. single-file / in-memory assemblies); fall back to the app base directory
                var entryLocation = System.Reflection.Assembly.GetEntryAssembly()?.Location;

                dir = string.IsNullOrEmpty(entryLocation)
                          ? new DirectoryInfo(System.AppContext.BaseDirectory)
                          : new FileInfo(entryLocation).Directory;
            }

            // CreateDirectory does nothing when the folder already exists, so no Exists check is needed
            var spiderPath = Directory.CreateDirectory(Path.Combine(dir.FullName, spiderName));
            Configuration.SpiderDirectory = spiderPath;

            var dataPath = Directory.CreateDirectory(Path.Combine(spiderPath.FullName, "Data"));
            Configuration.SpiderDataDirectory = dataPath;

            spiderWorkDataPath = Path.Combine(dataPath.FullName, "privateData.xml");
            lock (lockDataPathObject)
            {
                // Reuse previously saved work data when present, otherwise start fresh
                SpiderWorkData = File.Exists(spiderWorkDataPath)
                                    ? XmlSerializerHelper.DeserializeFromFile <SpiderData>(spiderWorkDataPath)
                                    : new SpiderData();
            }

            // Only build a default logger when the caller did not provide one
            if (Configuration.Logger == null)
            {
                Configuration.Spider_LogFile = Path.Combine(spiderPath.FullName, $"{ spiderName }.log");

                Configuration.Logger = new LoggerConfiguration()
                                       .MinimumLevel.Debug()
                                       .WriteTo.Console()
                                       .WriteTo.File(Configuration.Spider_LogFile, rollingInterval: RollingInterval.Day)
                                       .CreateLogger();
            }
        }
예제 #3
0
        /// <summary>
        /// Prepares the spider's working folders, loads (or creates) its persisted work data
        /// and, when the configuration has no logger yet, delegates logger setup to
        /// <c>initializeWithSerilog</c>.
        /// </summary>
        /// <param name="spiderName">Spider name; used as the spider folder name</param>
        /// <param name="init">Optional initialization parameters; may be null</param>
        private void initializeConfiguration(string spiderName, InitializationParams init)
        {
            var dir = init?.SpiderDirectory;

            if (dir == null)
            {
                // GetEntryAssembly() can return null (e.g. unmanaged host) and Location can be
                // empty (e.g. single-file / in-memory assemblies); fall back to the app base directory
                var entryLocation = System.Reflection.Assembly.GetEntryAssembly()?.Location;

                dir = string.IsNullOrEmpty(entryLocation)
                          ? new DirectoryInfo(System.AppContext.BaseDirectory)
                          : new FileInfo(entryLocation).Directory;
            }

            // CreateDirectory does nothing when the folder already exists, so no Exists check is needed
            var spiderPath = Directory.CreateDirectory(Path.Combine(dir.FullName, spiderName));
            Configuration.SpiderDirectory = spiderPath;

            var dataPath = Directory.CreateDirectory(Path.Combine(spiderPath.FullName, "Data"));
            Configuration.SpiderDataDirectory = dataPath;

            spiderWorkDataPath = Path.Combine(dataPath.FullName, "privateData.xml");
            lock (lockDataPathObject)
            {
                // Reuse previously saved work data when present, otherwise start fresh
                SpiderWorkData = File.Exists(spiderWorkDataPath)
                                    ? XmlSerializerHelper.DeserializeFromFile <SpiderData>(spiderWorkDataPath)
                                    : new SpiderData();
            }

            // Only set up a default logger when the caller did not provide one
            if (Configuration.Logger == null)
            {
                initializeWithSerilog(spiderName, spiderPath);
            }
        }