/// <summary>
/// Creates a new spider that fetches data from a website.
/// See usage examples on the GitHub page.
/// </summary>
/// <param name="spiderName">A unique name for this spider. A folder will be created with that name</param>
/// <param name="baseUri">The base Uri of the website. Pages outside this Host will not be fetched</param>
/// <param name="params">Additional initialization parameters</param>
public SimpleSpider(string spiderName, Uri baseUri, InitializationParams @params = null)
{
    SpiderName = spiderName;
    BaseUri = baseUri;

    // Take caller-supplied collaborators first; defaults are filled in below
    Cacher = @params?.Cacher;
    Downloader = @params?.Downloader;

    Configuration = @params?.ConfigurationPrototype ?? new Configuration();
    initializeConfiguration(spiderName, @params);

    LinkCollector = @params?.LinkCollector;
    if (Configuration.Auto_AnchorsLinks && LinkCollector == null)
    {
        LinkCollector = new LinkProcessors.SimpleProcessor();
    }

    initializeQueues();

    // initialize read-only: fall back to the default implementations
    // when the caller did not provide any
    Cacher = Cacher ?? new ContentCacher();
    Downloader = Downloader ?? new HttpClientDownloader();

    initializeFetchers();

    FetchCompleted += fetchCompleted_AutoCollect;
    FetchRewrite += fetchRewrite_AutoRewrite;

    Parsers = new List<IParserBase>();
    var extraParsers = @params?.Parsers;
    if (extraParsers != null)
    {
        Parsers.AddRange(extraParsers);
    }

    var storageEngine = @params?.StorageEngine;
    if (storageEngine != null)
    {
        Storage = storageEngine;
        Storage.Initialize(Configuration);
    }

    logInitialStatus();
}
/// <summary>
/// Resolves the spider's working directories, loads (or creates) the persisted
/// work data, and sets up a default Serilog logger when none is configured.
/// </summary>
/// <param name="spiderName">Unique spider name; used as the folder name and the log-file name</param>
/// <param name="init">Optional initialization parameters; may be null</param>
private void initializeConfiguration(string spiderName, InitializationParams init)
{
    // Base directory: explicit override from the caller, otherwise the entry assembly's folder.
    // NOTE(review): Assembly.GetEntryAssembly() can return null in unmanaged hosts — confirm supported hosts
    var dir = init?.SpiderDirectory;
    if (dir == null)
    {
        dir = new FileInfo(System.Reflection.Assembly.GetEntryAssembly().Location).Directory;
    }

    var spiderPath = new DirectoryInfo(Path.Combine(dir.FullName, spiderName));
    if (!spiderPath.Exists)
    {
        spiderPath.Create();
    }
    Configuration.SpiderDirectory = spiderPath;

    var dataPath = new DirectoryInfo(Path.Combine(spiderPath.FullName, "Data"));
    if (!dataPath.Exists)
    {
        dataPath.Create();
    }
    Configuration.SpiderDataDirectory = dataPath;

    spiderWorkDataPath = Path.Combine(dataPath.FullName, "privateData.xml");
    lock (lockDataPathObject)
    {
        // FIX: removed the redundant nested assignments the original had inside both
        // conditional branches (SpiderWorkData = cond ? SpiderWorkData = x : SpiderWorkData = y)
        SpiderWorkData = File.Exists(spiderWorkDataPath)
            ? XmlSerializerHelper.DeserializeFromFile<SpiderData>(spiderWorkDataPath)
            : new SpiderData();
    }

    // Only wire up a default logger when the caller did not supply one
    if (Configuration.Logger == null)
    {
        Configuration.Spider_LogFile = Path.Combine(spiderPath.FullName, $"{spiderName}.log");
        Configuration.Logger = new LoggerConfiguration()
            .MinimumLevel.Debug()
            .WriteTo.Console()
            .WriteTo.File(Configuration.Spider_LogFile, rollingInterval: RollingInterval.Day)
            .CreateLogger();
    }
}
/// <summary>
/// Resolves the spider's working directories, loads (or creates) the persisted
/// work data, and delegates default logger setup when none is configured.
/// </summary>
/// <param name="spiderName">Unique spider name; used as the folder name</param>
/// <param name="init">Optional initialization parameters; may be null</param>
private void initializeConfiguration(string spiderName, InitializationParams init)
{
    // Base directory: explicit override from the caller, otherwise the entry assembly's folder.
    // NOTE(review): Assembly.GetEntryAssembly() can return null in unmanaged hosts — confirm supported hosts
    var dir = init?.SpiderDirectory;
    if (dir == null)
    {
        dir = new FileInfo(System.Reflection.Assembly.GetEntryAssembly().Location).Directory;
    }

    var spiderPath = new DirectoryInfo(Path.Combine(dir.FullName, spiderName));
    if (!spiderPath.Exists)
    {
        spiderPath.Create();
    }
    Configuration.SpiderDirectory = spiderPath;

    var dataPath = new DirectoryInfo(Path.Combine(spiderPath.FullName, "Data"));
    if (!dataPath.Exists)
    {
        dataPath.Create();
    }
    Configuration.SpiderDataDirectory = dataPath;

    spiderWorkDataPath = Path.Combine(dataPath.FullName, "privateData.xml");
    lock (lockDataPathObject)
    {
        // FIX: removed the redundant nested assignments the original had inside both
        // conditional branches (SpiderWorkData = cond ? SpiderWorkData = x : SpiderWorkData = y)
        SpiderWorkData = File.Exists(spiderWorkDataPath)
            ? XmlSerializerHelper.DeserializeFromFile<SpiderData>(spiderWorkDataPath)
            : new SpiderData();
    }

    // Only wire up a default logger when the caller did not supply one
    if (Configuration.Logger == null)
    {
        initializeWithSerilog(spiderName, spiderPath);
    }
}