public IActionResult Crawl([FromQuery] string url)
{
    // Resolve a crawler for the given URL and return the scraped article as JSON.
    ICrawl crawl = CrawlFactory.Create(url);
    ArticleEntity entity = crawl.Execute();
    return Json(entity);
}
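These examples come from different projects, so the exact shape of ICrawl varies between them. For the controller above, a minimal sketch of a contract consistent with the calls might look like this; the members of ArticleEntity are assumptions, since the source never shows them:

// Minimal assumed contract for the controller example only.
public interface ICrawl
{
    ArticleEntity Execute();
}

// Hypothetical shape; only the type name appears in the source.
public class ArticleEntity
{
    public string Title { get; set; }
    public string Url { get; set; }
    public string Content { get; set; }
}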
public bool Run(string crawlPath, CancellationToken? cancellationToken = null)
{
    try
    {
        AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(MyResolveEventHandler);
        crawlPath = FindCrawlDLL<ICrawl>(crawlPath);
        var loader = PluginLoader.CreateFromAssemblyFile(
            crawlPath,
            sharedTypes: new[] { typeof(ICrawl) });
        var pluginType = loader
            .LoadDefaultAssembly()
            .GetTypes()
            .FirstOrDefault(t => typeof(ICrawl).IsAssignableFrom(t) && !t.IsAbstract);
        if (pluginType == null)
        {
            // Guard against assemblies that contain no concrete ICrawl type;
            // Activator.CreateInstance(null) would otherwise throw.
            _logger.Log(LogLevel.Error, "No ICrawl implementation found in {Path}", crawlPath);
            return false;
        }

        // This assumes the implementation of ICrawl has a parameterless constructor.
        ICrawl plugin = (ICrawl)Activator.CreateInstance(pluginType);
        bool result = plugin.ScrapeData(cancellationToken);
        return result;
    }
    catch (Exception ex)
    {
        _logger.Log(LogLevel.Error, ex, "Plugin run failed");
        return false;
    }
}
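FindCrawlDLL&lt;T&gt; is called above but never shown. A plausible sketch, inferred purely from its usage (name, generic parameter, and behavior are all assumptions), would resolve a plugin directory to the assembly file inside it:

// Hypothetical helper: if the path is a directory, pick the first DLL inside
// it; otherwise assume it already points at the plugin assembly.
// Requires System.IO and System.Linq.
private static string FindCrawlDLL<T>(string crawlPath)
{
    if (Directory.Exists(crawlPath))
    {
        // Assumes one plugin assembly per directory.
        return Directory.GetFiles(crawlPath, "*.dll").First();
    }
    return crawlPath;
}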
public Crawler(ICrawl crawl, ILogHandler log, IExaminerFactory examinerFactory, IEnumerable<IInspector> inspectors)
{
    if (crawl == null) throw new ArgumentNullException(nameof(crawl));
    if (log == null) throw new ArgumentNullException(nameof(log));
    if (examinerFactory == null) throw new ArgumentNullException(nameof(examinerFactory));
    if (inspectors == null) throw new ArgumentNullException(nameof(inspectors));

    _crawl = crawl;
    _log = log;
    _examinerFactory = examinerFactory;
    _inspectors = inspectors.ToList();
}
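A guard-clause constructor like this is typically wired up through a DI container. A minimal usage sketch with Microsoft.Extensions.DependencyInjection follows; the concrete implementations (ConsoleLogHandler, ExaminerFactory, LinkInspector) are hypothetical names for illustration, not types from the source:

using Microsoft.Extensions.DependencyInjection;

var services = new ServiceCollection();
services.AddSingleton<ICrawl>(new CnblogCrawl("https://www.cnblogs.com/"));
services.AddSingleton<ILogHandler, ConsoleLogHandler>();      // assumed implementation
services.AddSingleton<IExaminerFactory, ExaminerFactory>();   // assumed implementation
services.AddSingleton<IInspector, LinkInspector>();           // assumed implementation
services.AddSingleton<Crawler>();

// The container supplies all four constructor arguments.
var crawler = services.BuildServiceProvider().GetRequiredService<Crawler>();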
public static ICrawl Create(string url)
{
    Uri uri = new Uri(url);
    ICrawl crawl;
    switch (uri.Host)
    {
        case "www.cnblogs.com":
        case "cnblogs.com":
            crawl = new CnblogCrawl(url);
            break;
        default:
            // Fall back to the cnblogs crawler for unrecognized hosts.
            crawl = new CnblogCrawl(url);
            break;
    }
    return crawl;
}
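A short usage sketch of the factory above; note that because the default branch also returns CnblogCrawl, any URL currently resolves to a crawler:

ICrawl crawl = CrawlFactory.Create("https://www.cnblogs.com/some-post");
ArticleEntity entity = crawl.Execute();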
public void test()
{
    try
    {
        AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(MyResolveEventHandler);
        var loaders = new List<PluginLoader>();

        // Create a plugin loader for every assembly found under ./plugins.
        var pluginsDir = Path.Combine(Directory.GetCurrentDirectory(), "plugins");
        foreach (var dir in DirSearch(pluginsDir))
        {
            var loader = PluginLoader.CreateFromAssemblyFile(
                dir,
                sharedTypes: new[] { typeof(ICrawl) });
            loaders.Add(loader);
        }

        // Create an instance of every concrete ICrawl type in each plugin.
        foreach (var loader in loaders)
        {
            foreach (var pluginType in loader
                .LoadDefaultAssembly()
                .GetTypes()
                .Where(t => typeof(ICrawl).IsAssignableFrom(t) && !t.IsAbstract))
            {
                // This assumes the implementation of ICrawl has a parameterless constructor.
                ICrawl plugin = (ICrawl)Activator.CreateInstance(pluginType);
                Console.WriteLine($"Created plugin instance {pluginType.FullName}.");
                plugin.ScrapeData();
            }
        }
    }
    catch (Exception e)
    {
        // An empty catch would silently hide plugin failures; at least report them.
        Console.WriteLine(e);
    }
}
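DirSearch is used above but not shown. A plausible sketch, assuming the common DotNetCorePlugins layout of one plugin per folder at plugins/&lt;name&gt;/&lt;name&gt;.dll (a convention, not something the source confirms):

// Hypothetical helper: yield the main assembly of each plugin folder.
private static IEnumerable<string> DirSearch(string rootDir)
{
    foreach (var dir in Directory.GetDirectories(rootDir))
    {
        var dirName = Path.GetFileName(dir);
        var dllPath = Path.Combine(dir, dirName + ".dll");
        if (File.Exists(dllPath))
        {
            yield return dllPath;
        }
    }
}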
public async Task<bool> Run(PluginRunningParameters pluginRunningParameters, CancellationToken? cancellationToken)
{
    try
    {
        // Always run the crawler plugin first.
        ICrawl crawlPlugin = CreateInstanceOfPlugin<ICrawl>(pluginRunningParameters.CrawlerPluginPath);
        crawlPlugin.ScrapeData(cancellationToken);

        // Optionally hand the scraped data to a processor plugin.
        bool result = false;
        if (pluginRunningParameters.IsProcessorAssigned())
        {
            var productsData = crawlPlugin.GetData();
            IProcess processPlugin = CreateInstanceOfPlugin<IProcess>(pluginRunningParameters.ProcessorPluginPath);
            result = await processPlugin.ProcessData(
                pluginRunningParameters.MerchantName,
                productsData,
                pluginRunningParameters.ProcessorSaveAction);
        }
        return result;
    }
    catch (Exception ex)
    {
        _logger.Log(LogLevel.Error, ex, "Plugin run failed");
        return false;
    }
}
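The source never shows PluginRunningParameters. A minimal sketch implied by the member accesses above; the property names come from the calls, but their types and the IsProcessorAssigned logic are assumptions:

public class PluginRunningParameters
{
    public string CrawlerPluginPath { get; set; }
    public string ProcessorPluginPath { get; set; }
    public string MerchantName { get; set; }
    public ProcessorSaveAction ProcessorSaveAction { get; set; }

    // Assumed: a processor is "assigned" when its plugin path is set.
    public bool IsProcessorAssigned() =>
        !string.IsNullOrEmpty(ProcessorPluginPath);
}

// Placeholder enum; the real type is not shown in the source.
public enum ProcessorSaveAction { Insert, Upsert }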
static void Main(string[] args)
{
    // Both crawlers implement ICrawl; swap the assignment to crawl The Verge instead.
    TheVergeCrawler vergeCrawler = new TheVergeCrawler();
    ViceCrawler viceCrawler = new ViceCrawler();
    ICrawl crawler = viceCrawler;

    var list = crawler.Crawl();
    MongoCRUD db = new MongoCRUD("NewsCrawl");
    foreach (var title in list)
    {
        db.InsertRecord("News", title);
        Console.WriteLine("Title: " + title.Title + Environment.NewLine +
                          "Url: " + title.Url + Environment.NewLine +
                          "Author: " + title.Author + Environment.NewLine +
                          "Date of publication: " + title.DateOfPublication + Environment.NewLine);
    }
    Console.WriteLine();
}
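The item type returned by Crawl() is not shown, but the property accesses above imply a model along these lines; the class name is an assumption:

public class NewsModel
{
    public string Title { get; set; }
    public string Url { get; set; }
    public string Author { get; set; }
    public DateTime DateOfPublication { get; set; }
}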
public void HandleCollate(
    List<string> datastores,
    bool silent,
    string? outdir,
    IConsole console)
{
    // Abort if no configuration was found.
    if (this.config == null)
    {
        Console.WriteLine("Unable to read 'configuration.json' file. To create a configuration file, use the 'setup' action");
        return;
    }

    bool silentMode = silent;

    // Default the output path if none was specified.
    outdir ??= Path.Combine(
        Directory.GetCurrentDirectory(),
        "catalog.json");
    Console.WriteLine($"Will write output to {outdir}");

    // If no store was specified, crawl them all.
    if (datastores.Count == 0)
    {
        datastores = config.DataStores.Select(ds => ds.Name).ToList();
    }
    Console.Write("Planning to crawl: ");
    foreach (var datastore in datastores)
    {
        Console.Write($"{datastore} ");
    }

    // Confirm the crawl should continue.
    char shouldCont = 'Y';
    if (!silentMode)
    {
        Console.WriteLine();
        Console.WriteLine("Continue? (Y|N)");
        shouldCont = Convert.ToChar(Console.Read());
    }
    if (shouldCont != 'Y')
    {
        Console.WriteLine("Aborting...");
        return;
    }

    // Start crawling all specified data stores.
    List<Metadata> middenMetadatas = new List<Metadata>();
    foreach (string store in datastores)
    {
        var currStore = config
            .DataStores
            .FirstOrDefault(s => s.Name == store);
        if (currStore == null)
        {
            Console.WriteLine($"No data store with name {store} in config file");
            continue;
        }
        Console.WriteLine($"Crawling Data Store: {currStore.Name}");

        ICrawl crawler = null;
        switch (currStore.Type)
        {
            case DataStoreTypes.LocalFileSystem:
                if (currStore.Path is not null)
                {
                    crawler = new LocalFileSystemCrawler(
                        currStore.Path);
                }
                else
                {
                    Console.WriteLine(
                        $"Not enough information provided to crawl {currStore.Name}");
                }
                break;
            case DataStoreTypes.AzureDataLakeGen2:
                if (
                    currStore.AccountName is not null &&
                    currStore.TenantId is not null &&
                    currStore.ClientId is not null &&
                    currStore.ClientSecret is not null &&
                    currStore.AzureFileSystemName is not null)
                {
                    crawler = new AzureDataLakeCrawler(
                        currStore.AccountName,
                        currStore.TenantId,
                        currStore.ClientId,
                        currStore.ClientSecret,
                        currStore.AzureFileSystemName);
                }
                else
                {
                    Console.WriteLine(
                        $"Not enough information provided to crawl {currStore.Name}");
                }
                break;
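The snippet above is cut off mid-switch. Based only on the variables already in scope, the method presumably finishes roughly like this; the crawl method name and the serialization step are assumptions, not shown in the source:

// Hedged sketch of the truncated remainder: run whichever crawler was
// constructed, accumulate its metadata, then write everything to outdir.
// Requires System.Text.Json for JsonSerializer.
if (crawler != null)
{
    middenMetadatas.AddRange(crawler.Crawl()); // method name assumed
}
// ...after the loop over datastores:
File.WriteAllText(outdir, JsonSerializer.Serialize(middenMetadatas));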