Esempio n. 1
0
        /// <summary>
        /// Crawls the page at <paramref name="url"/> and returns the resulting article as JSON.
        /// </summary>
        /// <param name="url">Address of the page to crawl, bound from the query string.</param>
        /// <returns>A JSON result wrapping the <see cref="ArticleEntity"/> returned by the crawler.</returns>
        public IActionResult Crawl([FromQuery] string url)
        {
            // CrawlFactory supplies the ICrawl for this URL; Execute() yields the article entity.
            return Json(CrawlFactory.Create(url).Execute());
        }
Esempio n. 2
0
        /// <summary>
        /// Loads the crawler plugin assembly located via <paramref name="crawlPath"/>, instantiates
        /// the first concrete <see cref="ICrawl"/> type it contains and runs its scrape.
        /// </summary>
        /// <param name="crawlPath">Path handed to <c>FindCrawlDLL</c> to locate the plugin assembly.</param>
        /// <param name="cancellationToken">Optional token forwarded to the plugin's <c>ScrapeData</c>.</param>
        /// <returns>
        /// The value returned by the plugin's <c>ScrapeData</c>; <c>false</c> when loading or running
        /// the plugin throws (the exception is logged, not propagated).
        /// </returns>
        public bool Run(string crawlPath, CancellationToken? cancellationToken = null)
        {
            // Keep a reference to the delegate so the very same subscription can be removed again.
            // Previously every call added another handler that was never detached.
            ResolveEventHandler resolveHandler = MyResolveEventHandler;
            AppDomain.CurrentDomain.AssemblyResolve += resolveHandler;

            try
            {
                crawlPath = FindCrawlDLL<ICrawl>(crawlPath);

                var loader = PluginLoader.CreateFromAssemblyFile(
                    crawlPath,
                    sharedTypes: new[] { typeof(ICrawl) });

                var pluginType = loader
                                 .LoadDefaultAssembly()
                                 .GetTypes()
                                 .FirstOrDefault(t => typeof(ICrawl).IsAssignableFrom(t) && !t.IsAbstract);

                if (pluginType == null)
                {
                    // Fail with a descriptive message instead of the ArgumentNullException that
                    // Activator.CreateInstance(null) would raise; the catch below logs it.
                    throw new InvalidOperationException(
                        $"No concrete ICrawl implementation found in '{crawlPath}'.");
                }

                // This assumes the ICrawl implementation has a parameterless constructor.
                ICrawl plugin = (ICrawl)Activator.CreateInstance(pluginType);

                return plugin.ScrapeData(cancellationToken);
            }
            catch (Exception ex)
            {
                _logger.Log(LogLevel.Error, ex, "Plugin run failed");
                return false;
            }
            finally
            {
                AppDomain.CurrentDomain.AssemblyResolve -= resolveHandler;
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Creates a crawler from its collaborators; every argument is mandatory.
        /// </summary>
        /// <param name="crawl">Crawl stored in <c>_crawl</c>.</param>
        /// <param name="log">Log handler stored in <c>_log</c>.</param>
        /// <param name="examinerFactory">Factory stored in <c>_examinerFactory</c>.</param>
        /// <param name="inspectors">Inspectors; copied into a new list so later changes to the source sequence are not observed.</param>
        /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
        public Crawler(ICrawl crawl, ILogHandler log, IExaminerFactory examinerFactory, IEnumerable<IInspector> inspectors)
        {
            if (crawl == null)
            {
                throw new ArgumentNullException(nameof(crawl));
            }
            _crawl = crawl;

            if (log == null)
            {
                throw new ArgumentNullException(nameof(log));
            }
            _log = log;

            if (examinerFactory == null)
            {
                throw new ArgumentNullException(nameof(examinerFactory));
            }
            _examinerFactory = examinerFactory;

            if (inspectors == null)
            {
                throw new ArgumentNullException(nameof(inspectors));
            }
            // Snapshot the sequence once.
            _inspectors = new List<IInspector>(inspectors);
        }
Esempio n. 4
0
        /// <summary>
        /// Creates the crawler responsible for the host of <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page to crawl; it is parsed with <see cref="Uri"/>.</param>
        /// <returns>
        /// A <c>CnblogCrawl</c> for the cnblogs hosts; any other host currently falls back to
        /// <c>CnblogCrawl</c> as well.
        /// </returns>
        public static ICrawl Create(string url)
        {
            // Parsing up front also rejects null or malformed URLs before any crawler is built.
            string host = new Uri(url).Host;

            if (host == "www.cnblogs.com" || host == "cnblogs.com")
            {
                return new CnblogCrawl(url);
            }
            else
            {
                // Fallback for hosts without a dedicated crawler.
                return new CnblogCrawl(url);
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Loads every plugin found under the "plugins" folder of the current directory,
        /// instantiates each concrete <see cref="ICrawl"/> type and calls its <c>ScrapeData</c>.
        /// </summary>
        /// <remarks>
        /// Failures are reported on the standard error stream and are not propagated to the caller.
        /// </remarks>
        public void test()
        {
            try
            {
                AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(MyResolveEventHandler);

                var loaders = new List<PluginLoader>();

                // Create one plugin loader per path returned by DirSearch.
                var pluginsDir = Path.Combine(Directory.GetCurrentDirectory(), "plugins");
                foreach (var pluginPath in DirSearch(pluginsDir))
                {
                    var loader = PluginLoader.CreateFromAssemblyFile(
                        pluginPath,
                        sharedTypes: new[] { typeof(ICrawl) });
                    loaders.Add(loader);
                }

                // Create an instance of every concrete ICrawl type and run it.
                foreach (var loader in loaders)
                {
                    foreach (var pluginType in loader
                             .LoadDefaultAssembly()
                             .GetTypes()
                             .Where(t => typeof(ICrawl).IsAssignableFrom(t) && !t.IsAbstract))
                    {
                        // This assumes the ICrawl implementation has a parameterless constructor.
                        ICrawl plugin = (ICrawl)Activator.CreateInstance(pluginType);

                        Console.WriteLine($"Created plugin instance.");

                        plugin.ScrapeData();
                    }
                }
            }
            catch (Exception e)
            {
                // Still best-effort (no rethrow), but no longer silent: surface what went wrong.
                Console.Error.WriteLine($"Plugin test run failed: {e}");
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Runs the crawler plugin and, when a processor plugin is assigned, feeds the scraped
        /// data to it.
        /// </summary>
        /// <param name="pluginRunningParameters">Plugin paths, merchant name and save action for this run.</param>
        /// <param name="cancellationToken">Optional token forwarded to the crawler's <c>ScrapeData</c>.</param>
        /// <returns>
        /// The processor's result when a processor is assigned; otherwise <c>false</c>. Also
        /// <c>false</c> when any step throws (the exception is logged, not propagated).
        /// </returns>
        public async Task<bool> Run(PluginRunningParameters pluginRunningParameters, CancellationToken? cancellationToken)
        {
            try
            {
                ICrawl crawler = CreateInstanceOfPlugin<ICrawl>(pluginRunningParameters.CrawlerPluginPath);

                // NOTE(review): whatever ScrapeData returns is ignored here — confirm that is intended.
                crawler.ScrapeData(cancellationToken);

                if (!pluginRunningParameters.IsProcessorAssigned())
                {
                    // Without a processor there is nothing further to do.
                    return false;
                }

                var scrapedData = crawler.GetData();
                IProcess processor = CreateInstanceOfPlugin<IProcess>(pluginRunningParameters.ProcessorPluginPath);

                return await processor.ProcessData(
                    pluginRunningParameters.MerchantName,
                    scrapedData,
                    pluginRunningParameters.ProcessorSaveAction);
            }
            catch (Exception ex)
            {
                _logger.Log(LogLevel.Error, ex, "Plugin run failed");
                return false;
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Entry point: crawls with the Vice crawler, stores every returned item in the "News"
        /// collection of the "NewsCrawl" database and echoes it to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Both crawlers are constructed, but only the Vice crawler is used below.
            TheVergeCrawler verge = new TheVergeCrawler();
            ViceCrawler vice = new ViceCrawler();

            ICrawl activeCrawler = vice;
            var articles = activeCrawler.Crawl();

            MongoCRUD database = new MongoCRUD("NewsCrawl");

            foreach (var article in articles)
            {
                database.InsertRecord("News", article);

                string summary = $"Title: {article.Title}{Environment.NewLine}"
                                 + $"Url: {article.Url}{Environment.NewLine}"
                                 + $"Author: {article.Author}{Environment.NewLine}"
                                 + $"Date of publication: {article.DateOfPublication}{Environment.NewLine}";
                Console.WriteLine(summary);
            }

            // Trailing blank line after the listing.
            Console.WriteLine("");
        }
Esempio n. 8
0
        public void HandleCollate(
            List <string> datastores,
            bool silent,
            string?outdir,
            IConsole console)
        {
            // Abort if not configuration found
            if (this.config == null)
            {
                Console.WriteLine("Unable to read 'configuration.json' file. To create a configuration file, use the 'setup' action");
                return;
            }

            //bool silentMode = silent ??= false;
            bool silentMode = silent;

            // Set output directory if none specified
            outdir ??= Path.Combine(
                Directory.GetCurrentDirectory(), "catalog.json");
            Console.WriteLine($"Will write output to {outdir}");

            // If no store specified then crawl them all
            if (datastores.Count == 0)
            {
                datastores = config.DataStores.Select(ds => ds.Name).ToList();
            }

            Console.Write($"Plannig to crawl: ");
            foreach (var datastore in datastores)
            {
                Console.Write($"{datastore} ");
            }

            // Confirm crawl should continue
            char shouldCont = 'Y';

            if (!silentMode)
            {
                Console.WriteLine();
                Console.WriteLine("Continue? (Y|N)");
                shouldCont = Convert.ToChar(Console.Read());
            }

            if (shouldCont != 'Y')
            {
                Console.WriteLine("Aborting...");
                return;
            }

            // Start crawling all specified data stores
            List <Metadata> middenMetadatas = new List <Metadata>();

            foreach (string store in datastores)
            {
                var currStore = config
                                .DataStores
                                .FirstOrDefault(s => s.Name == store);

                if (currStore == null)
                {
                    Console.WriteLine($"No data store with name {store} in config file");
                    continue;
                }

                Console.WriteLine($"Crawling Data Store: {currStore.Name}");

                ICrawl crawler = null;
                switch (currStore.Type)
                {
                case DataStoreTypes.LocalFileSystem:
                    if (currStore.Path is not null)
                    {
                        crawler = new LocalFileSystemCrawler(
                            currStore.Path);
                    }
                    else
                    {
                        Console.WriteLine(
                            $"Not enough information provided to crawl {currStore.Name}");
                    }

                    break;

                case DataStoreTypes.AzureDataLakeGen2:
                    if (
                        currStore.AccountName is not null &&
                        currStore.TenantId is not null &&
                        currStore.ClientId is not null &&
                        currStore.ClientSecret is not null &&
                        currStore.AzureFileSystemName is not null)
                    {
                        crawler = new AzureDataLakeCrawler(
                            currStore.AccountName,
                            currStore.TenantId,
                            currStore.ClientId,
                            currStore.ClientSecret,
                            currStore.AzureFileSystemName);
                    }
                    else
                    {
                        Console.WriteLine(
                            $"Not enough information provided to crawl {currStore.Name}");
                    }

                    break;