/// <summary>
/// Opens a database context, makes sure the admin user exists, logs on to the site and
/// hands the kryssord.org front page to <c>ProcessWordsUntilEmpty</c>.
/// </summary>
/// <param name="siteUsername">User name forwarded to <c>DoLogon</c>.</param>
/// <param name="sitePassword">Password forwarded to <c>DoLogon</c>.</param>
/// <param name="source">Not read by this method.</param>
private void DoScrape(string siteUsername, string sitePassword, string source)
{
    var contextFactory = new DesignTimeDbContextFactory();
    using (var db = contextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        // Note: the admin user must be stored BEFORE tracking and AutoDetectChanges are
        // switched off further down - otherwise this crashes.
        var adminTemplate = new User { FirstName = "", LastName = "Admin", UserName = "******" };

        // Reuse an already stored user with the same first name, otherwise persist the template.
        User adminUser = db.DictionaryUsers.FirstOrDefault(u => u.FirstName == adminTemplate.FirstName);
        if (adminUser == null)
        {
            db.DictionaryUsers.Add(adminTemplate);
            db.SaveChanges();
            adminUser = adminTemplate;
        }

        // Turn off query tracking to speed things up.
        // Virtual properties are then not loaded, but object ids are still populated after a save.
        db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

        // Original author's note: this doesn't seem to work when new users are added all the time.
        db.ChangeTracker.AutoDetectChangesEnabled = false;

        using (var driver = ChromeDriverUtils.GetChromeDriver(true))
        {
            DoLogon(driver, siteUsername, sitePassword);

            const string startUrl = "https://www.kryssord.org";
            driver.Navigate().GoToUrl(startUrl);

            var frontPageNode = driver.GetDocumentNode();
            ProcessWordsUntilEmpty(driver, db, adminUser, frontPageNode);
        }
    }
}
/// <summary>
/// Opens a database context, makes sure the admin user exists and then reads words for the
/// given letter-count range through <c>ReadWordsByWordPermutations</c>.
/// </summary>
/// <param name="startLetterCount">First letter count, forwarded to <c>ReadWordsByWordPermutations</c>.</param>
/// <param name="endLetterCount">Last letter count, forwarded to <c>ReadWordsByWordPermutations</c>.</param>
/// <param name="source">Not read by this method.</param>
/// <param name="doContinueWithLastWord">Forwarded unchanged to <c>ReadWordsByWordPermutations</c>.</param>
private void DoScrape(int startLetterCount, int endLetterCount, string source, bool doContinueWithLastWord)
{
    var contextFactory = new DesignTimeDbContextFactory();
    using (var db = contextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        // Note: the admin user must be stored BEFORE tracking and AutoDetectChanges are
        // switched off further down - otherwise this crashes.
        var adminTemplate = new User { FirstName = "", LastName = "Admin", UserName = "******" };

        // Reuse an already stored user with the same first name, otherwise persist the template.
        User adminUser = db.DictionaryUsers.FirstOrDefault(u => u.FirstName == adminTemplate.FirstName);
        if (adminUser == null)
        {
            db.DictionaryUsers.Add(adminTemplate);
            db.SaveChanges();
            adminUser = adminTemplate;
        }

        // Turn off query tracking to speed things up.
        // Virtual properties are then not loaded, but object ids are still populated after a save.
        db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

        // Original author's note: this doesn't seem to work when new users are added all the time.
        db.ChangeTracker.AutoDetectChangesEnabled = false;

        using (var driver = ChromeDriverUtils.GetChromeDriver(true))
        {
            // Use a long general page-load timeout (three minutes).
            var pageLoadTimeout = TimeSpan.FromSeconds(180);
            driver.Manage().Timeouts().PageLoad = pageLoadTimeout;

            // Read all words within the letter-count range.
            ReadWordsByWordPermutations(startLetterCount, endLetterCount, driver, db, adminUser, doContinueWithLastWord);
        }
    }
}
/// <summary>
/// Console entry point. Builds the configuration, sets up Serilog, applies pending database
/// migrations, kills leftover Chrome/ChromeDriver instances and then runs either a swarm of
/// scrapers (one per letter count, in parallel) or a single scraper.
/// </summary>
/// <param name="args">Command line arguments; added to the configuration sources.</param>
static void Main(string[] args)
{
    var configuration = new ConfigurationBuilder()
        .SetBasePath(Directory.GetCurrentDirectory())
        .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
        .AddJsonFile("appsettings.Development.json", optional: true, reloadOnChange: true)
        .AddCommandLine(args)
        .AddEnvironmentVariables()
        .Build();

    // Minimum levels and sinks (console, file, error-only file) are read from the configuration
    // instead of being hard-coded here, e.g. MinimumLevel.Debug() enables EF Core logging and
    // MinimumLevel.Information() disables it.
    Log.Logger = new LoggerConfiguration()
        .ReadFrom.Configuration(configuration)
        .CreateLogger();

    var signalRHubURL = configuration["SignalRHubURL"] ?? "http://localhost:8000/crosswordsignalrhub";

    // To start a local MySQL in Docker on port 3360:
    // docker run -p 3360:3306 --name mysqldb -e MYSQL_ROOT_PASSWORD=password -d mysql:8.0.15

    // Build the database connection string
    var dbhost = configuration["DBHOST"] ?? "localhost";
    var dbport = configuration["DBPORT"] ?? "3306";
    var dbuser = configuration["DBUSER"] ?? "user";
    var dbpassword = configuration["DBPASSWORD"] ?? "password";
    var database = configuration["DATABASE"] ?? "dictionary";

    string connectionString = $"server={dbhost}; user={dbuser}; pwd={dbpassword}; "
        + $"port={dbport}; database={database}; charset=utf8;";

    // Site credentials; these are null when the keys are missing from the configuration.
    string siteUsername = configuration["kryssord.org:Username"];
    string sitePassword = configuration["kryssord.org:Password"];

    Log.Error("Starting CrossWord.Scraper - retrieving database ....");
    var dbContextFactory = new DesignTimeDbContextFactory();
    using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        // Database setup: call either EnsureCreated() or Migrate(), never both.
        // EnsureCreated() completely skips the migrations pipeline and just creates a database
        // that matches the current model. It is good for unit testing or very early prototyping,
        // when deleting and re-creating the database on model changes is acceptable.
        // Therefore don't use EnsureDeleted()/EnsureCreated() here, but Migrate().
        db.Database.Migrate();
    }

    // Make sure that no chrome and chrome drivers are running
    ChromeDriverUtils.KillAllChromeDriverInstances();

    // Read in scraper settings (e.g. from docker-compose environment variables).
    // NOTE(review): the second argument of GetBoolValue/GetIntValue is presumably the fallback value - confirm.
    string scraperSite = configuration["ScraperSite"] ?? "Kryssord";
    bool doContinueWithLastWord = configuration.GetBoolValue("ScraperContinueLastWord", true);
    int startLetterCount = configuration.GetIntValue("ScraperStartLetterCount", 1);
    int endLetterCount = configuration.GetIntValue("ScraperEndLetterCount", 20);
    bool isScraperSwarm = configuration.GetBoolValue("ScraperSwarm", true);
    bool isKryssordLatest = configuration.GetBoolValue("KryssordLatest", false);
    int kryssordLatestDelaySeconds = configuration.GetIntValue("KryssordLatestDelaySeconds", 60);

#if DEBUG
    // Debug builds override the configured values. This happens BEFORE the configuration is
    // logged so that the log line below reports the values that are actually used.
    startLetterCount = 4;
    endLetterCount = 8;
    scraperSite = "Kryssord";
    isScraperSwarm = false;
    isKryssordLatest = true;
    kryssordLatestDelaySeconds = 30;
#endif

    Log.Error("Using scraper config - site: '{0}', continue with last word: '{1}', from/to letter count: {2}-{3}. Swarming: {4}", scraperSite, doContinueWithLastWord, startLetterCount, endLetterCount, isScraperSwarm);

    // Options for running several scrapers in parallel.
    // MaxDegreeOfParallelism is deliberately left unset (e.g. 50 was tried); it seems to work better without it.
    var options = new ParallelOptions();

    if (isScraperSwarm)
    {
        // One action per letter count (inclusive range), executed with Parallel.ForEach
        var actionsList = new List<Action>();
        for (int i = startLetterCount; i <= endLetterCount; i++)
        {
            // Copy the loop variable: all lambdas would otherwise share the same, incremented, variable
            int letterCount = i;
            actionsList.Add(() => RunScraper(scraperSite, connectionString, signalRHubURL, siteUsername, sitePassword,
                letterCount, endLetterCount, doContinueWithLastWord, isScraperSwarm));
        }

        // Check if we should add a separate thread for kryssord.org latest
        if (isKryssordLatest)
        {
            Log.Error("Adding a separate swarm thread for kryssord.org latest");
            actionsList.Add(() => { new KryssordScraperLatest(connectionString, signalRHubURL, siteUsername, sitePassword, kryssordLatestDelaySeconds); });
        }

        Parallel.ForEach<Action>(actionsList, options, (o => o()));
    }
    else if (isKryssordLatest)
    {
        // Run only kryssord.org latest
        Log.Error("Running kryssord.org latest");
        new KryssordScraperLatest(connectionString, signalRHubURL, siteUsername, sitePassword, kryssordLatestDelaySeconds);
    }
    else
    {
        // Run only one scraper on the current thread
        RunScraper(scraperSite, connectionString, signalRHubURL, siteUsername, sitePassword,
            startLetterCount, endLetterCount, doContinueWithLastWord, false);
    }
}

/// <summary>
/// Constructs the scraper that belongs to <paramref name="scraperSite"/>. The created object is
/// discarded; presumably each scraper constructor performs the scraping itself (confirm against
/// the scraper classes). Unknown site names fall back to the "Kryssord" scraper.
/// </summary>
private static void RunScraper(string scraperSite, string connectionString, string signalRHubURL,
    string siteUsername, string sitePassword, int startLetterCount, int endLetterCount,
    bool doContinueWithLastWord, bool isScraperSwarm)
{
    switch (scraperSite)
    {
        case "KryssordHjelp":
            new KryssordHjelpScraper(connectionString, signalRHubURL, startLetterCount, doContinueWithLastWord);
            break;
        case "GratisKryssord":
            new GratisKryssordScraper(connectionString, signalRHubURL, startLetterCount, endLetterCount, doContinueWithLastWord);
            break;
        case "NorwegianSynonyms":
            new NorwegianSynonymsScraper(connectionString, signalRHubURL, startLetterCount, endLetterCount, doContinueWithLastWord);
            break;
        case "Kryssord":
        default:
            new KryssordScraper(connectionString, signalRHubURL, siteUsername, sitePassword, startLetterCount, endLetterCount, doContinueWithLastWord, isScraperSwarm);
            break;
    }
}
/// <summary>
/// Opens a database context, determines the word (or start pattern) to resume from for the
/// given letter count, makes sure the admin user exists and then reads words through
/// <c>ReadWordsByWordPermutations</c>.
/// </summary>
/// <param name="letterCount">Letter count to read words for.</param>
/// <param name="source">Source name passed to <c>WordDatabaseService.GetLastWordFromLetterCount</c>.</param>
/// <param name="doContinueWithLastWord">When true, the last stored word for the letter count is looked up first.</param>
private void DoScrape(int letterCount, string source, bool doContinueWithLastWord)
{
    var contextFactory = new DesignTimeDbContextFactory();
    using (var db = contextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        // Only ask the database for the last word when we are asked to continue from it.
        string lastWordString = doContinueWithLastWord
            ? WordDatabaseService.GetLastWordFromLetterCount(db, source, letterCount)
            : null;

        // Without a last word, fall back to a start pattern: "a", "aa", or "aa" padded with '?'.
        if (lastWordString == null)
        {
            if (letterCount == 1)
            {
                lastWordString = "a";
            }
            else if (letterCount == 2)
            {
                lastWordString = "aa";
            }
            else
            {
                lastWordString = "aa" + new string('?', letterCount - 2);
            }

            Log.Information("Could not find any words having '{0}' letters. Therefore using last word pattern '{1}'.", letterCount, lastWordString);
        }

        // Note: the admin user must be stored BEFORE tracking and AutoDetectChanges are
        // switched off further down - otherwise this crashes.
        var adminTemplate = new User { FirstName = "", LastName = "Admin", UserName = "******" };

        // Reuse an already stored user with the same first name, otherwise persist the template.
        User adminUser = db.DictionaryUsers.FirstOrDefault(u => u.FirstName == adminTemplate.FirstName);
        if (adminUser == null)
        {
            db.DictionaryUsers.Add(adminTemplate);
            db.SaveChanges();
            adminUser = adminTemplate;
        }

        // Turn off query tracking to speed things up.
        // Virtual properties are then not loaded, but object ids are still populated after a save.
        db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

        // Original author's note: this doesn't seem to work when new users are added all the time.
        db.ChangeTracker.AutoDetectChangesEnabled = false;

        using (var driver = ChromeDriverUtils.GetChromeDriver(true))
        {
            // Read all words with the letter count
            ReadWordsByWordPermutations(letterCount, driver, db, adminUser, lastWordString);
        }
    }
}
/// <summary>
/// Opens a database context, makes sure the admin user exists, logs on to the site and reads
/// words for each letter count from <paramref name="startLetterCount"/> up to, but not
/// including, <paramref name="endLetterCount"/>. In swarm mode only the first letter count is
/// processed.
/// </summary>
/// <param name="siteUsername">User name forwarded to <c>DoLogon</c>.</param>
/// <param name="sitePassword">Password forwarded to <c>DoLogon</c>.</param>
/// <param name="startLetterCount">First letter count to process.</param>
/// <param name="endLetterCount">Exclusive upper bound of the letter counts to process.</param>
/// <param name="source">Source name passed to <c>WordDatabaseService.GetLastWordFromLetterCount</c>.</param>
/// <param name="doContinueWithLastWord">When true, the last stored word per letter count is looked up first.</param>
/// <param name="isScraperSwarm">When true, the loop stops as soon as it moves past <paramref name="startLetterCount"/>.</param>
private void DoScrape(string siteUsername, string sitePassword, int startLetterCount, int endLetterCount, string source, bool doContinueWithLastWord, bool isScraperSwarm)
{
    var contextFactory = new DesignTimeDbContextFactory();
    using (var db = contextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        // Note: the admin user must be stored BEFORE tracking and AutoDetectChanges are
        // switched off further down - otherwise this crashes.
        var adminTemplate = new User { FirstName = "", LastName = "Admin", UserName = "******" };

        // Reuse an already stored user with the same first name, otherwise persist the template.
        User adminUser = db.DictionaryUsers.FirstOrDefault(u => u.FirstName == adminTemplate.FirstName);
        if (adminUser == null)
        {
            db.DictionaryUsers.Add(adminTemplate);
            db.SaveChanges();
            adminUser = adminTemplate;
        }

        // Turn off query tracking to speed things up.
        // Virtual properties are then not loaded, but object ids are still populated after a save.
        db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

        // Original author's note: this doesn't seem to work when new users are added all the time.
        db.ChangeTracker.AutoDetectChangesEnabled = false;

        // Debugging notes kept from the original author:
        // some patterns give back a word with one less character than asked for - it seems the
        // Ø is messing their system up (UTF8 two byte problem?)
        //   TROND?K?????        gives TROND KJØLL
        //   VEBJØRN?B????       gives VEBJØRN BERG
        //   WILLY?R????????     gives WILLY RØGEBERG
        //   THORBJØRN?H???????  gives THORBJØRN HÅRSTAD
        // Start values that were used to reproduce this while debugging:
        //   lastWordString = "TRONSMOS VEG"; // word before TROND KJØLL
        //   letterCount = 12;
        //   lastWordString = "ÅSTED FOR DRAMAET ROMEO OG JULIE"; letterCount = 32;
        //   lastWordString = "GUTTENAVN PÅ \"A\""; letterCount = 16; endLetterCount = 17;
        //   lastWordString = "TALL SOM ANGIR FORHOLDET MELLOM ET LEGEMES HASTIGHET OG LYDENS";
        //   lastWordString = "ÅPNINGSKONSERTSTYKKE"; letterCount = lastWordString.Length; endLetterCount = 300;

        using (var driver = ChromeDriverUtils.GetChromeDriver(true))
        {
            DoLogon(driver, siteUsername, sitePassword);

            // NOTE(review): the upper bound is exclusive here - confirm that callers which treat
            // endLetterCount as inclusive really intend to skip the last letter count.
            for (int letterCount = startLetterCount; letterCount < endLetterCount; letterCount++)
            {
                // Reset the per-letter-count state
                hasFoundPattern = false;   // first stage: the pattern is matched
                hasFoundLastWord = false;  // second stage: not only the pattern but the word itself is matched
                hasMissedLastWord = false;

                string lastWordString = doContinueWithLastWord
                    ? WordDatabaseService.GetLastWordFromLetterCount(db, source, letterCount)
                    : null;

                // Don't skip any words when there is no last word to continue from
                if (lastWordString == null)
                {
                    hasFoundLastWord = true;
                }

                // In swarm mode (several docker instances scraping side by side) each instance
                // only handles its own start letter count, so stop once we have moved past it.
                bool isPastOwnLetterCount = letterCount > startLetterCount;
                if (isScraperSwarm && isPastOwnLetterCount)
                {
                    Log.Error("Warning! Quitting since the current letter length > letter count: {0} / {1}", letterCount, startLetterCount);
                    break;
                }

                ReadWordsByWordPermutations(letterCount, driver, db, adminUser, lastWordString);
            }
        }
    }
}