Ejemplo n.º 1
0
        /// <summary>
        /// Opens a database context, makes sure the admin user exists, logs on to the site,
        /// loads https://www.kryssord.org and hands the loaded document to
        /// <c>ProcessWordsUntilEmpty</c>.
        /// </summary>
        /// <param name="siteUsername">User name passed on to <c>DoLogon</c>.</param>
        /// <param name="sitePassword">Password passed on to <c>DoLogon</c>.</param>
        /// <param name="source">Not read by this method.</param>
        private void DoScrape(string siteUsername, string sitePassword, string source)
        {
            var dbContextFactory = new DesignTimeDbContextFactory();

            using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
            {
                // Note!
                // the user needs to be added before we disable tracking and disable AutoDetectChanges
                // otherwise this will crash

                // set admin user
                var adminUser = new User()
                {
                    FirstName = "",
                    LastName  = "Admin",
                    UserName  = "******"
                };

                // check if user already exists
                // The lookup is done on the user name: matching on the (empty) first name would
                // pick up any user whose first name happens to be blank.
                string adminUserName = adminUser.UserName;
                var existingUser = db.DictionaryUsers.FirstOrDefault(u => u.UserName == adminUserName);
                if (existingUser != null)
                {
                    adminUser = existingUser;
                }
                else
                {
                    db.DictionaryUsers.Add(adminUser);
                    db.SaveChanges();
                }

                // disable tracking to speed things up
                // note that this doesn't load the virtual properties, but loads the object ids after a save
                db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                // this doesn't seem to work when adding new users all the time
                db.ChangeTracker.AutoDetectChangesEnabled = false;

                using (var driver = ChromeDriverUtils.GetChromeDriver(true))
                {
                    DoLogon(driver, siteUsername, sitePassword);

                    // load the front page and process whatever it contains
                    string url = "https://www.kryssord.org";
                    driver.Navigate().GoToUrl(url);
                    var documentNode = driver.GetDocumentNode();
                    ProcessWordsUntilEmpty(driver, db, adminUser, documentNode);
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Opens a database context, makes sure the admin user exists, starts a Chrome driver
        /// with a 180 second page load timeout and delegates the actual reading to
        /// <c>ReadWordsByWordPermutations</c>.
        /// </summary>
        /// <param name="startLetterCount">Passed on to <c>ReadWordsByWordPermutations</c>.</param>
        /// <param name="endLetterCount">Passed on to <c>ReadWordsByWordPermutations</c>.</param>
        /// <param name="source">Not read by this method.</param>
        /// <param name="doContinueWithLastWord">Passed on to <c>ReadWordsByWordPermutations</c>.</param>
        private void DoScrape(int startLetterCount, int endLetterCount, string source, bool doContinueWithLastWord)
        {
            var dbContextFactory = new DesignTimeDbContextFactory();

            using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
            {
                // Note!
                // the user needs to be added before we disable tracking and disable AutoDetectChanges
                // otherwise this will crash

                // set admin user
                var adminUser = new User()
                {
                    FirstName = "",
                    LastName  = "Admin",
                    UserName  = "******"
                };

                // check if user already exists
                // The lookup is done on the user name: matching on the (empty) first name would
                // pick up any user whose first name happens to be blank.
                string adminUserName = adminUser.UserName;
                var existingUser = db.DictionaryUsers.FirstOrDefault(u => u.UserName == adminUserName);
                if (existingUser != null)
                {
                    adminUser = existingUser;
                }
                else
                {
                    db.DictionaryUsers.Add(adminUser);
                    db.SaveChanges();
                }

                // disable tracking to speed things up
                // note that this doesn't load the virtual properties, but loads the object ids after a save
                db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                // this doesn't seem to work when adding new users all the time
                db.ChangeTracker.AutoDetectChangesEnabled = false;

                using (var driver = ChromeDriverUtils.GetChromeDriver(true))
                {
                    // set general timeout to long
                    driver.Manage().Timeouts().PageLoad = TimeSpan.FromSeconds(180);

                    // read all words with the letter count
                    ReadWordsByWordPermutations(startLetterCount, endLetterCount, driver, db, adminUser, doContinueWithLastWord);
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Entry point of the scraper. Reads the configuration (appsettings.json, the optional
        /// appsettings.Development.json, command line and environment variables), configures
        /// the Serilog logger, migrates the database, kills left-over Chrome driver instances
        /// and then runs the configured scraper(s) - either one per letter count in parallel
        /// ("swarm") or a single one.
        /// </summary>
        /// <param name="args">Command line arguments; added as a configuration source.</param>
        static void Main(string[] args)
        {
            var configuration = new ConfigurationBuilder()
                                .SetBasePath(Directory.GetCurrentDirectory())
                                .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
                                .AddJsonFile("appsettings.Development.json", optional: true, reloadOnChange: true)
                                .AddCommandLine(args)
                                .AddEnvironmentVariables()
                                .Build();

            Log.Logger = new LoggerConfiguration()
                         .ReadFrom.Configuration(configuration)
                         // .MinimumLevel.Debug() // enable ef core logging
                         // .MinimumLevel.Information() // disable ef core logging
                         // .WriteTo.File(DEFAULT_LOG_PATH)
                         // .WriteTo.Console()
                         // .WriteTo.Console(restrictedToMinimumLevel: LogEventLevel.Information)
                         // .WriteTo.Logger(l => l.Filter.ByIncludingOnly(e => e.Level == LogEventLevel.Error).WriteTo.File(DEFAULT_ERROR_LOG_PATH))
                         .CreateLogger();

            try
            {
                var signalRHubURL = configuration["SignalRHubURL"] ?? "http://localhost:8000/crosswordsignalrhub";

                // start DOCKER on port 3360
                // docker run -p 3360:3306 --name mysqldb -e MYSQL_ROOT_PASSWORD=password -d mysql:8.0.15

                // Build database connection string
                var dbhost     = configuration["DBHOST"] ?? "localhost";
                var dbport     = configuration["DBPORT"] ?? "3306";
                var dbuser     = configuration["DBUSER"] ?? "user";
                var dbpassword = configuration["DBPASSWORD"] ?? "password";
                var database   = configuration["DATABASE"] ?? "dictionary";

                string connectionString = $"server={dbhost}; user={dbuser}; pwd={dbpassword}; "
                                          + $"port={dbport}; database={database}; charset=utf8;";

                string siteUsername = configuration["kryssord.org:Username"];
                string sitePassword = configuration["kryssord.org:Password"];

                Log.Error("Starting CrossWord.Scraper - retrieving database ....");

                var dbContextFactory = new DesignTimeDbContextFactory();

                using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
                {
                    // setup database
                    // You would either call EnsureCreated() or Migrate().
                    // EnsureCreated() is an alternative that completely skips the migrations pipeline and just creates a database that matches you current model.
                    // It's good for unit testing or very early prototyping, when you are happy just to delete and re-create the database when the model changes.
                    // db.Database.EnsureDeleted();
                    // db.Database.EnsureCreated();

                    // Note! Therefore don't use EnsureDeleted() and EnsureCreated() but Migrate();
                    db.Database.Migrate();
                }

                // make sure that no chrome and chrome drivers are running
                ChromeDriverUtils.KillAllChromeDriverInstances();

                // read in scraper info from environment variables (docker-compose)
                string scraperSite                = configuration["ScraperSite"] ?? "Kryssord";
                bool   doContinueWithLastWord     = configuration.GetBoolValue("ScraperContinueLastWord", true);
                int    startLetterCount           = configuration.GetIntValue("ScraperStartLetterCount", 1);
                int    endLetterCount             = configuration.GetIntValue("ScraperEndLetterCount", 20);
                bool   isScraperSwarm             = configuration.GetBoolValue("ScraperSwarm", true);
                bool   isKryssordLatest           = configuration.GetBoolValue("KryssordLatest", false);
                int    kryssordLatestDelaySeconds = configuration.GetIntValue("KryssordLatestDelaySeconds", 60);

#if DEBUG
                // debug overrides - applied before the settings are logged so that the log
                // shows the values that are actually used
                startLetterCount           = 4;
                endLetterCount             = 8;
                scraperSite                = "Kryssord";
                isScraperSwarm             = false;
                isKryssordLatest           = true;
                kryssordLatestDelaySeconds = 30;
#endif

                Log.Error("Using scraper config - site: '{0}', continue with last word: '{1}', from/to letter count: {2}-{3}. Swarming: {4}", scraperSite, doContinueWithLastWord, startLetterCount, endLetterCount, isScraperSwarm);

                // Creates the scraper selected by 'scraperSite' starting at the given letter count.
                // The scrapers are only constructed here; nothing else is called on them.
                // Unknown site names fall back to the Kryssord scraper.
                Action <int> runScraper = letterCount =>
                {
                    switch (scraperSite)
                    {
                    default:
                    case "Kryssord":
                        new KryssordScraper(connectionString, signalRHubURL, siteUsername, sitePassword, letterCount, endLetterCount, doContinueWithLastWord, isScraperSwarm);
                        break;

                    case "KryssordHjelp":
                        new KryssordHjelpScraper(connectionString, signalRHubURL, letterCount, doContinueWithLastWord);
                        break;

                    case "GratisKryssord":
                        new GratisKryssordScraper(connectionString, signalRHubURL, letterCount, endLetterCount, doContinueWithLastWord);
                        break;

                    case "NorwegianSynonyms":
                        new NorwegianSynonymsScraper(connectionString, signalRHubURL, letterCount, endLetterCount, doContinueWithLastWord);
                        break;
                    }
                };

                if (isScraperSwarm)
                {
                    // start several scrapers in parallel
                    var options = new ParallelOptions();

                    // options.MaxDegreeOfParallelism = 50; // seems to work better without a MaxDegreeOfParallelism number

                    // using Parallel.ForEach
                    var actionsList = new List <Action>();
                    for (int i = startLetterCount; i <= endLetterCount; i++)
                    {
                        int local_i = i; // have to use local i to not use the same increment on all scrapers
                        actionsList.Add(() => runScraper(local_i));
                    }

                    // check if we should add a separate thread for kryssord.org latest
                    if (isKryssordLatest)
                    {
                        Log.Error("Adding a separate swarm thread for kryssord.org latest");
                        actionsList.Add(() => { new KryssordScraperLatest(connectionString, signalRHubURL, siteUsername, sitePassword, kryssordLatestDelaySeconds); });
                    }

                    Parallel.ForEach <Action>(actionsList, options, (o => o()));
                }
                else if (isKryssordLatest)
                {
                    // only run the kryssord.org latest scraper
                    Log.Error("Running kryssord.org latest");
                    new KryssordScraperLatest(connectionString, signalRHubURL, siteUsername, sitePassword, kryssordLatestDelaySeconds);
                }
                else
                {
                    // run only one thread
                    runScraper(startLetterCount);
                }
            }
            finally
            {
                // make sure buffered log events are written before the process exits
                Log.CloseAndFlush();
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Opens a database context, determines the word (or word pattern) to start from,
        /// makes sure the admin user exists and delegates the actual reading to
        /// <c>ReadWordsByWordPermutations</c>.
        /// </summary>
        /// <param name="letterCount">Letter count to read words for.</param>
        /// <param name="source">Passed to <c>WordDatabaseService.GetLastWordFromLetterCount</c> when looking up the last word.</param>
        /// <param name="doContinueWithLastWord">
        /// When true the last word is looked up in the database; otherwise (or when nothing
        /// is returned) a start pattern is used instead.
        /// </param>
        private void DoScrape(int letterCount, string source, bool doContinueWithLastWord)
        {
            var dbContextFactory = new DesignTimeDbContextFactory();

            using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
            {
                string lastWordString = null;
                if (doContinueWithLastWord)
                {
                    lastWordString = WordDatabaseService.GetLastWordFromLetterCount(db, source, letterCount);
                }

                // if we didn't get back a word, use a pattern instead
                if (lastWordString == null)
                {
                    switch (letterCount)
                    {
                    case 1:
                        lastWordString = "a";
                        break;

                    case 2:
                        lastWordString = "aa";
                        break;

                    default:
                        // "aa" followed by one wildcard per remaining letter
                        // (a letter count below 1 ends up here with a negative repeat count)
                        lastWordString = "aa" + new string('?', letterCount - 2);
                        break;
                    }

                    Log.Information("Could not find any words having '{0}' letters. Therefore using last word pattern '{1}'.", letterCount, lastWordString);
                }

                // Note!
                // the user needs to be added before we disable tracking and disable AutoDetectChanges
                // otherwise this will crash

                // set admin user
                var adminUser = new User()
                {
                    FirstName = "",
                    LastName  = "Admin",
                    UserName  = "******"
                };

                // check if user already exists
                // The lookup is done on the user name: matching on the (empty) first name would
                // pick up any user whose first name happens to be blank.
                string adminUserName = adminUser.UserName;
                var existingUser = db.DictionaryUsers.FirstOrDefault(u => u.UserName == adminUserName);
                if (existingUser != null)
                {
                    adminUser = existingUser;
                }
                else
                {
                    db.DictionaryUsers.Add(adminUser);
                    db.SaveChanges();
                }

                // disable tracking to speed things up
                // note that this doesn't load the virtual properties, but loads the object ids after a save
                db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                // this doesn't seem to work when adding new users all the time
                db.ChangeTracker.AutoDetectChangesEnabled = false;

                using (var driver = ChromeDriverUtils.GetChromeDriver(true))
                {
                    // read all words with the letter count
                    ReadWordsByWordPermutations(letterCount, driver, db, adminUser, lastWordString);
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Opens a database context, makes sure the admin user exists, logs on to the site and
        /// calls <c>ReadWordsByWordPermutations</c> for each letter count from
        /// <paramref name="startLetterCount"/> up to, but not including,
        /// <paramref name="endLetterCount"/>. In swarm mode only the first letter count is
        /// processed.
        /// </summary>
        /// <param name="siteUsername">User name passed on to <c>DoLogon</c>.</param>
        /// <param name="sitePassword">Password passed on to <c>DoLogon</c>.</param>
        /// <param name="startLetterCount">First letter count to process.</param>
        /// <param name="endLetterCount">Letter count at which the loop stops (exclusive).</param>
        /// <param name="source">Passed to <c>WordDatabaseService.GetLastWordFromLetterCount</c> when looking up the last word.</param>
        /// <param name="doContinueWithLastWord">When true the last word for each letter count is looked up in the database.</param>
        /// <param name="isScraperSwarm">When true the loop is left as soon as the letter count exceeds <paramref name="startLetterCount"/>.</param>
        private void DoScrape(string siteUsername, string sitePassword, int startLetterCount, int endLetterCount, string source, bool doContinueWithLastWord, bool isScraperSwarm)
        {
            var dbContextFactory = new DesignTimeDbContextFactory();

            using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
            {
                // Note!
                // the user needs to be added before we disable tracking and disable AutoDetectChanges
                // otherwise this will crash

                // set admin user
                var adminUser = new User()
                {
                    FirstName = "",
                    LastName  = "Admin",
                    UserName  = "******"
                };

                // check if user already exists
                // The lookup is done on the user name: matching on the (empty) first name would
                // pick up any user whose first name happens to be blank.
                string adminUserName = adminUser.UserName;
                var existingUser = db.DictionaryUsers.FirstOrDefault(u => u.UserName == adminUserName);
                if (existingUser != null)
                {
                    adminUser = existingUser;
                }
                else
                {
                    db.DictionaryUsers.Add(adminUser);
                    db.SaveChanges();
                }

                // disable tracking to speed things up
                // note that this doesn't load the virtual properties, but loads the object ids after a save
                db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                // this doesn't seem to work when adding new users all the time
                db.ChangeTracker.AutoDetectChangesEnabled = false;

#if DEBUG
                // some patterns give back a word with one less character than asked for - it seems the Ø is messing their system up
                // UTF8 two byte problem?
                // TROND?K?????         gives TROND KJØLL
                // VEBJØRN?B????        gives VEBJØRN BERG
                // WILLY?R????????      gives WILLY RØGEBERG
                // THORBJØRN?H???????   gives THORBJØRN HÅRSTAD

                // lastWordString = "TRONSMOS VEG"; // word before TROND KJØLL
                // letterCount = 12;

                // lastWordString = "ÅSTED FOR DRAMAET ROMEO OG JULIE";
                // letterCount = 32;

                // lastWordString = "GUTTENAVN PÅ \"A\"";
                // letterCount = 16;
                // endLetterCount = 17;

                // lastWordString = "TALL SOM ANGIR FORHOLDET MELLOM ET LEGEMES HASTIGHET OG LYDENS";
                // lastWordString = "ÅPNINGSKONSERTSTYKKE";
                // letterCount = lastWordString.Length;
                // endLetterCount = 300;
#endif


                using (var driver = ChromeDriverUtils.GetChromeDriver(true))
                {
                    DoLogon(driver, siteUsername, sitePassword);

                    // NOTE(review): endLetterCount is exclusive here, so nothing is read when
                    // startLetterCount == endLetterCount - confirm that callers expect this.
                    for (int i = startLetterCount; i < endLetterCount; i++)
                    {
                        // reset global variables
                        hasFoundPattern   = false; // this is the first stage, we match the pattern
                        hasFoundLastWord  = false; // this is the second stage, we not only match the pattern but the word as well
                        hasMissedLastWord = false;

                        string lastWordString = null;
                        if (doContinueWithLastWord)
                        {
                            lastWordString = WordDatabaseService.GetLastWordFromLetterCount(db, source, i);
                        }

                        // don't skip any words when the last word is empty
                        if (lastWordString == null)
                        {
                            hasFoundLastWord = true;
                        }

                        // added break to support several docker instances scraping in swarms
                        if (isScraperSwarm && (i > startLetterCount))
                        {
                            Log.Error("Warning! Quitting since the current letter length > letter count: {0} / {1}", i, startLetterCount);
                            break;
                        }

                        ReadWordsByWordPermutations(i, driver, db, adminUser, lastWordString);
                    }
                }
            }
        }