/// <summary>
/// Stores the related words found on <paramref name="documentNode"/> for <paramref name="word"/> and keeps
/// following the "next page" navigation until no further page is found.
/// </summary>
/// <param name="word">The word whose related words are being collected.</param>
/// <param name="driver">Web driver used to navigate to the following pages.</param>
/// <param name="db">Database context the related words are written to.</param>
/// <param name="adminUser">User passed on to the parser for the parsed words.</param>
/// <param name="page">Number of the page in <paramref name="documentNode"/>; it is logged as <c>page + 1</c>.</param>
/// <param name="documentNode">Already loaded document of the first page to process.</param>
/// <param name="url">Url of the current page; only updated locally while paging.</param>
private void ProcessSynonymsUntilEmpty(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, int page, HtmlNode documentNode, string url)
{
    bool hasMorePages;
    do
    {
        Log.Information("Processing synonym search for '{0}' on page {1}", word.Value, page + 1);
        writer.WriteLine("Processing synonym search for '{0}' on page {1}", word.Value, page + 1);

        // parse the related words of the current page and store them; the trailing
        // 'false' tells the database service not to update the state
        var pageRelatedWords = ReadRelatedWordsAgilityPack(documentNode, adminUser);
        WordDatabaseService.AddToDatabase(db, this.source, word, pageRelatedWords, writer, false);

        // Note! moving to the next page only works if we are logged in
        var (foundNext, nextPageNumber, nextPageUrl, nextPageNode) = NavigateToNextPageIfExist(driver, documentNode);
        hasMorePages = foundNext;
        if (hasMorePages)
        {
            url = nextPageUrl;
            page = nextPageNumber;
            documentNode = nextPageNode;
        }
    }
    while (hasMorePages);
}
/// <summary>
/// Opens the alphabetic word page at <paramref name="url"/>, parses its words and fetches the synonyms for
/// each of them. When <paramref name="lastWord"/> is given, words are skipped until that word is reached;
/// processing resumes with that word itself.
/// </summary>
/// <param name="wordPrefix">Pattern the page belongs to; stored as the comment of the state word.</param>
/// <param name="url">Address of the word page.</param>
/// <param name="driver">Web driver used for navigation.</param>
/// <param name="db">Database context used for state and synonyms.</param>
/// <param name="adminUser">User passed on to the parser and the synonym lookup.</param>
/// <param name="lastWord">Word to resume from, or <c>null</c> to process every word.</param>
private void ReadWordsByWordUrl(string wordPrefix, string url, IWebDriver driver, WordHintDbContext db, User adminUser, string lastWord)
{
    // go to the word page; any navigation failure is reported as a timeout and ends this page
    try
    {
        driver.Navigate().GoToUrl(url);
    }
    catch (System.Exception)
    {
        writer.WriteLine("Timeout navigating to '{0}'", url);
        return;
    }

    Log.Information("Processing word search for '{0}'", wordPrefix);
    writer.WriteLine("Processing word search for '{0}'", wordPrefix);

    // read the whole document into a HtmlNode and parse it using agility pack
    HtmlNode pageNode = driver.GetDocumentNode();
    var entries = ParseWordsAgilityPack(pageNode, adminUser);

    // without a last word there is nothing to skip, so we start processing immediately
    bool hasReachedResumePoint = lastWord == null;

    foreach (var entry in entries)
    {
        var word = entry.Item1;
        var href = entry.Item2;
        var wordText = word.Value;

        if (!hasReachedResumePoint)
        {
            if (lastWord != wordText)
            {
                Log.Information("Skipping alphabetic word '{0}' until we find '{1}'", wordText, lastWord);
                writer.WriteLine("Skipping alphabetic word '{0}' until we find '{1}'", wordText, lastWord);
                continue;
            }

            // found the last word - never skip again after this point
            hasReachedResumePoint = true;
        }

        // update that we are processing this word
        var stateWord = new Word() { Value = wordText, Comment = wordPrefix, CreatedDate = DateTime.Now };
        WordDatabaseService.UpdateState(db, source, stateWord, writer, true);

        GetWordSynonyms(word, driver, db, adminUser, href);
    }
}
/// <summary>
/// Opens a database context, resolves (or creates) the admin user, switches the context to
/// no-tracking mode and then reads the words from the json url into the database.
/// </summary>
/// <param name="source">Source name used to look up the last processed word.</param>
/// <param name="doContinueWithLastWord">When true, the last word stored for <paramref name="source"/> is used as resume point.</param>
private void DoScrape(string source, bool doContinueWithLastWord)
{
    var dbContextFactory = new DesignTimeDbContextFactory();
    using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        string lastWordString = null;
        if (doContinueWithLastWord)
        {
            lastWordString = WordDatabaseService.GetLastWordFromSource(db, source);
        }

        // Note!
        // the user needs to be added before we disable tracking and disable AutoDetectChanges
        // otherwise this will crash
        var adminTemplate = new User() { FirstName = "", LastName = "Admin", UserName = "******" };

        // Check if the user already exists. Matching on the (empty) first name alone would pick up
        // any user that has an empty first name, so all identifying fields are compared.
        var adminUser = db.DictionaryUsers.FirstOrDefault(u =>
            u.FirstName == adminTemplate.FirstName
            && u.LastName == adminTemplate.LastName
            && u.UserName == adminTemplate.UserName);

        if (adminUser == null)
        {
            adminUser = adminTemplate;
            db.DictionaryUsers.Add(adminUser);
            db.SaveChanges();
        }

        // disable tracking to speed things up
        // note that this doesn't load the virtual properties, but loads the object ids after a save
        db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

        // this doesn't seem to work when adding new users all the time
        db.ChangeTracker.AutoDetectChangesEnabled = false;

        ReadWordsFromUrl(db, adminUser, lastWordString);
    }
}
/// <summary>
/// Opens a database context, determines the word (or pattern) to start from for the given letter count,
/// resolves (or creates) the admin user, switches the context to no-tracking mode and scrapes all words
/// having <paramref name="letterCount"/> letters using a Chrome driver.
/// </summary>
/// <param name="letterCount">Number of letters of the words to scrape.</param>
/// <param name="source">Source name used to look up the last processed word.</param>
/// <param name="doContinueWithLastWord">When true, the last stored word with this letter count is used as resume point.</param>
private void DoScrape(int letterCount, string source, bool doContinueWithLastWord)
{
    var dbContextFactory = new DesignTimeDbContextFactory();
    using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        string lastWordString = null;
        if (doContinueWithLastWord)
        {
            lastWordString = WordDatabaseService.GetLastWordFromLetterCount(db, source, letterCount);
        }

        // if we didn't get back a word, use a pattern instead:
        // "a" for one letter, otherwise "aa" followed by one '?' per remaining letter
        if (lastWordString == null)
        {
            lastWordString = letterCount == 1
                ? "a"
                : "aa" + new string('?', letterCount - 2);

            Log.Information("Could not find any words having '{0}' letters. Therefore using last word pattern '{1}'.", letterCount, lastWordString);
        }

        // Note!
        // the user needs to be added before we disable tracking and disable AutoDetectChanges
        // otherwise this will crash
        var adminTemplate = new User() { FirstName = "", LastName = "Admin", UserName = "******" };

        // Check if the user already exists. Matching on the (empty) first name alone would pick up
        // any user that has an empty first name, so all identifying fields are compared.
        var adminUser = db.DictionaryUsers.FirstOrDefault(u =>
            u.FirstName == adminTemplate.FirstName
            && u.LastName == adminTemplate.LastName
            && u.UserName == adminTemplate.UserName);

        if (adminUser == null)
        {
            adminUser = adminTemplate;
            db.DictionaryUsers.Add(adminUser);
            db.SaveChanges();
        }

        // disable tracking to speed things up
        // note that this doesn't load the virtual properties, but loads the object ids after a save
        db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

        // this doesn't seem to work when adding new users all the time
        db.ChangeTracker.AutoDetectChangesEnabled = false;

        using (var driver = ChromeDriverUtils.GetChromeDriver(true))
        {
            // read all words with the letter count
            ReadWordsByWordPermutations(letterCount, driver, db, adminUser, lastWordString);
        }
    }
}
/// <summary>
/// Opens <paramref name="url"/> in a new browser tab, collects the words listed there as related words
/// of <paramref name="word"/>, stores them in the database and returns to the original tab.
/// The new tab is closed and the original tab restored even when scraping fails.
/// </summary>
/// <param name="word">The word to collect related words for. The value "0" is ignored.</param>
/// <param name="driver">Web driver; must be a <c>ChromeDriver</c> since scripts are executed on it.</param>
/// <param name="db">Database context the related words are written to.</param>
/// <param name="adminUser">User assigned to the created words.</param>
/// <param name="url">Address of the synonym page.</param>
private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, string url)
{
    // there is a bug in the website that makes a query with "0" fail
    if (word.Value == "0")
    {
        return;
    }

    var chromeDriver = (ChromeDriver)driver;

    // save a reference to our original tab's window handle
    var originalTabInstance = chromeDriver.CurrentWindowHandle;

    // execute some JavaScript to open a new window
    chromeDriver.ExecuteScript("window.open();");

    // the new tab is the last entry in the WindowHandles collection (fetched once)
    var windowHandles = chromeDriver.WindowHandles;
    var newTabInstance = windowHandles[windowHandles.Count - 1];

    // switch our WebDriver to the new tab's window handle
    chromeDriver.SwitchTo().Window(newTabInstance);

    try
    {
        // lets navigate to a web site in our new tab
        driver.Navigate().GoToUrl(url);

        Log.Information("Processing synonym search for '{0}'", word.Value);
        writer.WriteLine("Processing synonym search for '{0}'", word.Value);

        // parse all synonyms
        IList<IWebElement> listElements = driver.FindElements(By.XPath("//div[@id='wordlist']/ul[@class='word']/li"));

        var relatedWords = new List<Word>();
        foreach (IWebElement listElement in listElements)
        {
            IWebElement ahref;
            try
            {
                ahref = listElement.FindElement(By.TagName("a"));
            }
            catch (NoSuchElementException)
            {
                // the first list item without a link ends the list
                break;
            }

            var hintText = ahref.Text;

            var hint = new Word
            {
                Language = "no",
                Value = hintText,
                NumberOfLetters = hintText.Count(c => c != ' '),
                NumberOfWords = ScraperUtils.CountNumberOfWords(hintText),
                User = adminUser,
                CreatedDate = DateTime.Now,
                Source = this.source
            };

            relatedWords.Add(hint);
        }

        // Note that this requires the object to implement IEquatable<Word>
        relatedWords = relatedWords.Distinct().ToList();

        // and add to database
        WordDatabaseService.AddToDatabase(db, this.source, word, relatedWords, writer);
    }
    finally
    {
        // always clean up, otherwise a failure leaves the tab open and the driver pointing at it

        // close our new tab
        chromeDriver.ExecuteScript("window.close();");

        // and switch our WebDriver back to the original tab's window handle
        chromeDriver.SwitchTo().Window(originalTabInstance);

        // and have our WebDriver focus on the main document in the page to send commands to
        chromeDriver.SwitchTo().DefaultContent();
    }
}
/// <summary>
/// Builds the list of alphabetic index patterns (all two-letter combinations of the alphabet plus a few
/// symbols and digits), picks the slice of that list that belongs to slice number
/// <paramref name="startLetterCount"/> out of <paramref name="endLetterCount"/> slices, and scrapes the
/// alphabetic word page for every pattern in the slice, both slice ends included.
/// </summary>
/// <param name="startLetterCount">One-based number of the slice to process.</param>
/// <param name="endLetterCount">Total number of slices the pattern list is divided into.</param>
/// <param name="driver">Web driver used for scraping.</param>
/// <param name="db">Database context used for state and results.</param>
/// <param name="adminUser">User passed on to the word scraper.</param>
/// <param name="doContinueWithLastWord">When true, the last word stored for each pattern is used as resume point.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="endLetterCount"/> is less than 1, or <paramref name="startLetterCount"/> is not between 1 and <paramref name="endLetterCount"/>.
/// </exception>
private void ReadWordsByWordPermutations(int startLetterCount, int endLetterCount, IWebDriver driver, WordHintDbContext db, User adminUser, bool doContinueWithLastWord)
{
    // fail early with a clear message instead of an index error (or a division by zero) further down
    if (endLetterCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(endLetterCount), endLetterCount, "The total number of slices must be at least 1.");
    }

    if (startLetterCount < 1 || startLetterCount > endLetterCount)
    {
        throw new ArgumentOutOfRangeException(nameof(startLetterCount), startLetterCount, "The slice number must be between 1 and the total number of slices.");
    }

    const string alphabet = "abcdefghijklmnopqrstuvwxyzåæøö";
    const int permutationSize = 2;

    var permutations = alphabet.Select(x => x.ToString());
    for (int i = 0; i < permutationSize - 1; i++)
    {
        permutations = permutations.SelectMany(x => alphabet, (x, y) => x + y);
    }

    var wordPermutationList = permutations.ToList();
    wordPermutationList.AddRange(new[]
    {
        "&", "(", ")", "+", ",", "-",
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"
    });

    // use the letter count a little bit different when it comes to the alphabetic index:
    // letterCount is the index to start with divided out on the total alphabetic index
    // e.g. if letter count is between 1 - 4 of a total index length of 1000:
    // 1 is 1
    // 2 is 250
    // 3 is 500
    // 4 is 750
    int length = wordPermutationList.Count;
    double sliceSize = (double)length / (double)endLetterCount;
    int startIndex = (int)(sliceSize * (startLetterCount - 1));
    int endIndex = (int)((sliceSize * startLetterCount) - 1);

    // the last slice always runs to the end of the list, regardless of floating point rounding
    if (startLetterCount == endLetterCount)
    {
        endIndex = length - 1;
    }

    var startString = wordPermutationList[startIndex];
    var endString = wordPermutationList[endIndex];

    Log.Information("Processing alphabetic permutation search using {0}-{1} = {2}-{3} ({4} - {5}) ", startLetterCount, endLetterCount, startIndex, endIndex, startString, endString);
    writer.WriteLine("Processing alphabetic permutation search using {0}-{1} = {2}-{3} ({4} - {5}) ", startLetterCount, endLetterCount, startIndex, endIndex, startString, endString);

    // add some extra status information to the writer
    if (this.writer is SignalRClientWriter signalRWriter)
    {
        signalRWriter.ExtraStatusInformation = string.Format("Processing alphabetic permutation search using {0}-{1} = {2}-{3} ({4} - {5}) ", startLetterCount, endLetterCount, startIndex, endIndex, startString, endString);
    }

    // curIndex is one-based: the item at zero-based index k has curIndex k + 1
    int curIndex = 0;
    foreach (var wordPermutation in wordPermutationList)
    {
        // single characters with a code below 45 ('-') are percent encoded for the url
        string wordPattern = wordPermutation.Length == 1 && wordPermutation[0] < 45
            ? string.Format("%{0:X}", (int)wordPermutation[0])
            : wordPermutation;

        curIndex++;

        if (curIndex < startIndex + 1)
        {
            Log.Information("Skipping pattern '{0}' until we reach index {1}: '{2}'. [{3}/{4}]", wordPattern, startIndex, startString, curIndex, length);
            writer.WriteLine("Skipping pattern '{0}' until we reach index {1}: '{2}'. [{3}/{4}]", wordPattern, startIndex, startString, curIndex, length);
            continue;
        }

        string lastWordString = null;
        if (doContinueWithLastWord)
        {
            lastWordString = WordDatabaseService.GetLastWordFromComment(db, source, wordPattern);
        }

        var href = $"https://www.gratiskryssord.no/kryssordbok/alfabetisk/{wordPattern}/";

#if DEBUG
        // NOTE(review): debug builds always scrape the fixed pattern "na" starting at "NAVN"
        wordPattern = "na";
        href = $"https://www.gratiskryssord.no/kryssordbok/alfabetisk/{wordPattern}/";
        lastWordString = "NAVN";
#endif

        ReadWordsByWordUrl(wordPattern, href, driver, db, adminUser, lastWordString);

        // The item at endIndex belongs to this slice (the next slice starts at endIndex + 1),
        // so we only quit after it has been processed. Quitting before processing it would
        // leave that pattern unhandled by every slice.
        if (curIndex >= endIndex + 1)
        {
            Log.Information("Quitting because we have reached the last index to process: {0} at index {1}.", wordPattern, curIndex);
            writer.WriteLine("Quitting because we have reached the last index to process: {0} at index {1}.", wordPattern, curIndex);
            break;
        }
    }
}
/// <summary>
/// Opens the paged synonym listing of <paramref name="word"/> in a new browser tab, stores the synonyms of
/// every page in the database and follows the "next page" links until there are none left.
/// The new tab is closed and the original tab restored even when scraping fails.
/// </summary>
/// <param name="word">The word to collect synonyms for. The value "0" is ignored.</param>
/// <param name="driver">Web driver; must be a <c>ChromeDriver</c> since scripts are executed on it.</param>
/// <param name="db">Database context the synonyms are written to.</param>
/// <param name="adminUser">User passed on to the synonym parser.</param>
/// <param name="url">Base address of the word; <c>side/{page}/</c> is appended to it for the first page.</param>
private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, string url)
{
    // there is a bug in the website that makes a query with "0" fail
    if (word.Value == "0")
    {
        return;
    }

    var chromeDriver = (ChromeDriver)driver;

    // save a reference to our original tab's window handle
    var originalTabInstance = chromeDriver.CurrentWindowHandle;

    // execute some JavaScript to open a new window
    chromeDriver.ExecuteScript("window.open();");

    // the new tab is the last entry in the WindowHandles collection (fetched once)
    var windowHandles = chromeDriver.WindowHandles;
    var newTabInstance = windowHandles[windowHandles.Count - 1];

    // switch our WebDriver to the new tab's window handle
    chromeDriver.SwitchTo().Window(newTabInstance);

    try
    {
        // lets navigate to the first page in our new tab, e.g.
        // https://www.gratiskryssord.no/kryssordbok/navn/side/1/
        var page = 1;
        var pageUrl = $"{url}side/{page}/";
        driver.Navigate().GoToUrl(pageUrl);

        while (true)
        {
            Log.Information("Processing synonym search for '{0}' on page {1}", word.Value, page);
            writer.WriteLine("Processing synonym search for '{0}' on page {1}", word.Value, page);

            // read the whole document into a HtmlNode
            HtmlNode doc = driver.GetDocumentNode();

            // and parse synonyms using Agility Pack
            var relatedWords = ParseSynonymsAgilityPack(word, doc, adminUser);

            // and add to database
            WordDatabaseService.AddToDatabase(db, this.source, word, relatedWords, writer, false);

            // go to next page if exist
            var nextPageElement = FindNextPageOrNull(doc, word.Value.ToLower(), page + 1);
            if (nextPageElement == null)
            {
                break;
            }

            var href = nextPageElement.Attributes["href"].Value;
            string nextPageUrl = $"https://www.gratiskryssord.no{href}";

            page++;
            driver.Navigate().GoToUrl(nextPageUrl);
        }
    }
    finally
    {
        // always clean up, otherwise a failure leaves the tab open and the driver pointing at it

        // close our new tab
        chromeDriver.ExecuteScript("window.close();");

        // and switch our WebDriver back to the original tab's window handle
        chromeDriver.SwitchTo().Window(originalTabInstance);

        // and have our WebDriver focus on the main document in the page to send commands to
        chromeDriver.SwitchTo().DefaultContent();
    }
}
/// <summary>
/// Entry point. Loads a crossword layout and a dictionary and then, depending on the output argument,
/// either generates crosswords for the SignalR hub ("signalr"), imports a json synonym file into the
/// database ("database"), or generates the first crossword and saves it to a file.
/// </summary>
/// <param name="args">Command line arguments, parsed by <c>ParseInput</c>.</param>
/// <returns>
/// 0 on success, 1 for invalid arguments, 2 when the layout cannot be loaded, 3 when the dictionary cannot
/// be loaded, 4 when generation fails, 5 when no solution is found and 6 when saving the result fails.
/// </returns>
public static int Main(string[] args)
{
    Console.WriteLine("CrossWord ver. {0} ", "1.0");

    string inputFile, outputFile, puzzle, dictionaryFile;
    if (!ParseInput(args, out inputFile, out outputFile, out puzzle, out dictionaryFile))
    {
        return 1;
    }

    ICrossBoard board;
    try
    {
        if (inputFile.StartsWith("http", StringComparison.Ordinal))
        {
            board = CrossBoardCreator.CreateFromUrl(inputFile);
        }
        else
        {
            board = CrossBoardCreator.CreateFromFile(inputFile);
        }
    }
    catch (Exception e)
    {
        // pass the file name and the error as format arguments; formatting the text first and then
        // handing the exception to WriteLine silently drops the exception
        Console.WriteLine("Cannot load crossword layout from file {0}. {1}", inputFile, e.Message);
        return 2;
    }

    ICrossDictionary dictionary;
    try
    {
        if (dictionaryFile.Equals("database"))
        {
            // NOTE(review): connection string with credentials is hard-coded - consider moving it to configuration
            dictionary = new DatabaseDictionary("server=localhost;port=3306;database=dictionary;user=user;password=password;charset=utf8;", board.MaxWordLength);
        }
        else
        {
            dictionary = new Dictionary(dictionaryFile, board.MaxWordLength);
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Cannot load dictionary from file {0}. {1}", dictionaryFile, e.Message);
        return 3;
    }

    if (outputFile.Equals("signalr"))
    {
        // generate and send to signalr hub
        using (var tokenSource = new CancellationTokenSource())
        {
            CancellationToken token = tokenSource.Token;

            Task workerTask = Task.Run(
                async () =>
                {
                    try
                    {
                        await Generator.GenerateCrosswordsAsync(board, dictionary, puzzle, token);
                    }
                    catch (OperationCanceledException)
                    {
                        Console.WriteLine("Cancelled @ {0}", DateTime.Now);
                    }
                });

            // wait until the task is done
            Task.WaitAll(workerTask);
        }
    }
    else if (outputFile.Equals("database"))
    {
        var dbContextFactory = new DesignTimeDbContextFactory();

        // NOTE(review): connection string with credentials is hard-coded - consider moving it to configuration
        // passing null instead of Log.Logger enables debugging
        using (var db = dbContextFactory.CreateDbContext("server=localhost;database=dictionary;user=user;password=password;charset=utf8;", Log.Logger))
        {
            // setup database
            // EnsureCreated() skips the migrations pipeline and just creates a database matching the
            // current model, which is only useful for tests and early prototyping.
            // Note! Therefore don't use EnsureDeleted() and EnsureCreated() but Migrate();
            db.Database.Migrate();

            // set the user the imported words belong to
            var userTemplate = new User() { FirstName = "", LastName = "Norwegian Synonyms json", UserName = "******" };

            // Check if the user already exists. Matching on the (empty) first name alone would pick up
            // any user that has an empty first name, so all identifying fields are compared.
            var user = db.DictionaryUsers.FirstOrDefault(u =>
                u.FirstName == userTemplate.FirstName
                && u.LastName == userTemplate.LastName
                && u.UserName == userTemplate.UserName);

            if (user == null)
            {
                user = userTemplate;
                db.DictionaryUsers.Add(user);
                db.SaveChanges();
            }

            // disable tracking to speed things up
            // note that this doesn't load the virtual properties, but loads the object ids after a save
            db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

            // this works when using the same user for all words.
            db.ChangeTracker.AutoDetectChangesEnabled = false;

            bool isDebugging = false;
#if DEBUG
            isDebugging = true;
#endif

            var source = "norwegian-synonyms.json";

            if (string.Equals(Path.GetExtension(dictionaryFile), ".json", StringComparison.OrdinalIgnoreCase))
            {
                // read json files
                using (StreamReader r = new StreamReader(dictionaryFile))
                {
                    var json = r.ReadToEnd();
                    var jobj = JObject.Parse(json);

                    var totalCount = jobj.Properties().Count();
                    int count = 0;
                    foreach (var item in jobj.Properties())
                    {
                        count++;

                        var wordText = item.Name;
                        var relatedArray = item.Values().Select(a => a.Value<string>());

                        WordDatabaseService.AddToDatabase(db, source, user, wordText, relatedArray);

                        if (isDebugging)
                        {
                            // in debug mode the Console.Write \r isn't shown in the output console
                            Console.WriteLine("[{0}] / [{1}]", count, totalCount);
                        }
                        else
                        {
                            Console.Write("\r[{0}] / [{1}]", count, totalCount);
                        }
                    }

                    Console.WriteLine("Done!");
                }
            }
        }
    }
    else
    {
        ICrossBoard resultBoard;
        try
        {
            resultBoard = puzzle != null
                ? GenerateFirstCrossWord(board, dictionary, puzzle)
                : GenerateFirstCrossWord(board, dictionary);
        }
        catch (Exception e)
        {
            Console.WriteLine("Generating crossword has failed. {0}", e.Message);
            return 4;
        }

        if (resultBoard == null)
        {
            Console.WriteLine("No solution has been found.");
            return 5;
        }

        try
        {
            SaveResultToFile(outputFile, resultBoard, dictionary);
        }
        catch (Exception e)
        {
            Console.WriteLine("Saving result crossword to file {0} has failed. {1}", outputFile, e.Message);
            return 6;
        }
    }

    return 0;
}
/// <summary>
/// Opens a database context, resolves (or creates) the admin user, switches the context to no-tracking
/// mode, logs on to the site and scrapes the words for every letter count from
/// <paramref name="startLetterCount"/> up to, but not including, <paramref name="endLetterCount"/>.
/// </summary>
/// <param name="siteUsername">User name used to log on to the site.</param>
/// <param name="sitePassword">Password used to log on to the site.</param>
/// <param name="startLetterCount">First letter count to process.</param>
/// <param name="endLetterCount">Letter count to stop before.</param>
/// <param name="source">Source name used to look up the last processed word.</param>
/// <param name="doContinueWithLastWord">When true, the last stored word for each letter count is used as resume point.</param>
/// <param name="isScraperSwarm">When true, only the first letter count is processed.</param>
private void DoScrape(string siteUsername, string sitePassword, int startLetterCount, int endLetterCount, string source, bool doContinueWithLastWord, bool isScraperSwarm)
{
    var dbContextFactory = new DesignTimeDbContextFactory();
    using (var db = dbContextFactory.CreateDbContext(connectionString, Log.Logger))
    {
        // Note!
        // the user needs to be added before we disable tracking and disable AutoDetectChanges
        // otherwise this will crash
        var adminTemplate = new User() { FirstName = "", LastName = "Admin", UserName = "******" };

        // Check if the user already exists. Matching on the (empty) first name alone would pick up
        // any user that has an empty first name, so all identifying fields are compared.
        var adminUser = db.DictionaryUsers.FirstOrDefault(u =>
            u.FirstName == adminTemplate.FirstName
            && u.LastName == adminTemplate.LastName
            && u.UserName == adminTemplate.UserName);

        if (adminUser == null)
        {
            adminUser = adminTemplate;
            db.DictionaryUsers.Add(adminUser);
            db.SaveChanges();
        }

        // disable tracking to speed things up
        // note that this doesn't load the virtual properties, but loads the object ids after a save
        db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

        // this doesn't seem to work when adding new users all the time
        db.ChangeTracker.AutoDetectChangesEnabled = false;

        // Known site issue: some patterns give back a word with one less character than asked for -
        // it seems the Ø is messing their system up (UTF8 two byte problem?), e.g.
        // TROND?K????? gives TROND KJØLL
        // VEBJØRN?B???? gives VEBJØRN BERG
        // WILLY?R???????? gives WILLY RØGEBERG
        // THORBJØRN?H??????? gives THORBJØRN HÅRSTAD

        using (var driver = ChromeDriverUtils.GetChromeDriver(true))
        {
            DoLogon(driver, siteUsername, sitePassword);

            for (int i = startLetterCount; i < endLetterCount; i++)
            {
                // reset global variables
                hasFoundPattern = false; // this is the first stage, we match the pattern
                hasFoundLastWord = false; // this is the second stage, we not only match the pattern but the word as well
                hasMissedLastWord = false;

                string lastWordString = null;
                if (doContinueWithLastWord)
                {
                    lastWordString = WordDatabaseService.GetLastWordFromLetterCount(db, source, i);
                }

                // don't skip any words when the last word is empty
                if (lastWordString == null)
                {
                    hasFoundLastWord = true;
                }

                // added break to support several docker instances scraping in swarms
                if (isScraperSwarm && (i > startLetterCount))
                {
                    Log.Error("Warning! Quitting since the current letter length > letter count: {0} / {1}", i, startLetterCount);
                    break;
                }

                ReadWordsByWordPermutations(i, driver, db, adminUser, lastWordString);
            }
        }
    }
}
/// <summary>
/// Processes the words of a pattern search page by page. Words are ignored until the last word has been
/// found (tracked in <c>hasFoundPattern</c> / <c>hasFoundLastWord</c>); from then on the state is updated
/// and the synonyms are fetched for every word. Returns early, with <c>hasMissedLastWord</c> set, when the
/// pattern stopped matching before the last word was found.
/// </summary>
/// <param name="wordPattern">The pattern being searched, including the last-word information.</param>
/// <param name="driver">Web driver used for navigation and synonym lookup.</param>
/// <param name="db">Database context used for state and synonyms.</param>
/// <param name="adminUser">User passed on to the parser and the synonym lookup.</param>
/// <param name="page">Number of the page in <paramref name="documentNode"/>; it is logged as <c>page + 1</c>.</param>
/// <param name="documentNode">Already loaded document of the first page to process.</param>
/// <param name="url">Url of the current page; only updated locally while paging.</param>
private void ProcessWordsUntilEmpty(WordPattern wordPattern, IWebDriver driver, WordHintDbContext db, User adminUser, int page, HtmlNode documentNode, string url)
{
    bool hasMorePages;
    do
    {
        Log.Information("Processing pattern search for '{0}' on page {1}", wordPattern.Pattern, page + 1);
        writer.WriteLine("Processing pattern search for '{0}' on page {1}", wordPattern.Pattern, page + 1);

        // parse all words
        var pageWords = ReadWordsAgilityPack(documentNode, adminUser);

        foreach (var word in pageWords)
        {
            if (wordPattern.IsMatchLastWord)
            {
                Log.Information("The current pattern matches the last-word: {0} = {1}. Current word: {2}", wordPattern.Pattern, wordPattern.LastWord, word.Value);
                hasFoundPattern = true;

                // we might have had to add question marks at the end of the string to fix the length bug at the site
                var normalizedWord = word.Value.RemoveDiacriticsToNorwegian();
                if (normalizedWord == wordPattern.LastWord.TrimEnd('?'))
                {
                    Log.Information("The current word matches the last-word: {0} = {1}", word.Value, wordPattern.LastWord);
                    hasFoundLastWord = true;
                }
            }
            else if (hasFoundPattern && !hasFoundLastWord)
            {
                // if the pattern not any longer match, we never found the word - has it been deleted?
                Log.Error("Warning! The current pattern does not any longer match the last-word: {0} = {1}. Current word: {2}", wordPattern.Pattern, wordPattern.LastWord, word.Value);
                writer.WriteLine("Warning! The current pattern does not any longer match the last-word: {0} = {1}. Current word: {2}", wordPattern.Pattern, wordPattern.LastWord, word.Value);
                hasMissedLastWord = true;
                return;
            }

            // nothing to store until the last word has been found
            if (!hasFoundLastWord)
            {
                continue;
            }

            string currentValue = word.Value;

            // check if this is one of the buggy words from their site where the words found
            // don't have the same length as the pattern says it should have
            if (wordPattern.Length != word.Value.Length)
            {
                Log.Error("Warning! The current word doesn't match the length of the query pattern: {0} = {1}", word.Value, wordPattern.Pattern);
                writer.WriteLine("Warning! The current word doesn't match the length of the query pattern: {0} = {1}", word.Value, wordPattern.Pattern);

                // too short: fill up with question marks, too long: cut off at the pattern length
                currentValue = wordPattern.Length > word.Value.Length
                    ? currentValue.PadRight(wordPattern.Length, '?')
                    : currentValue.Substring(0, wordPattern.Length);
            }

            // update that we are processing this word, ignore length and comment
            var stateWord = new Word() { Value = currentValue.ToUpper(), Source = source, CreatedDate = DateTime.Now };
            WordDatabaseService.UpdateState(db, source, stateWord, writer);

            GetWordSynonyms(word, driver, db, adminUser);
        }

        // go to next page if exist
        var (foundNext, nextPageNumber, nextPageUrl, nextPageNode) = NavigateToNextPageIfExist(driver, documentNode);
        hasMorePages = foundNext;
        if (hasMorePages)
        {
            url = nextPageUrl;
            page = nextPageNumber;
            documentNode = nextPageNode;
        }
    }
    while (hasMorePages);
}
/// <summary>
/// Reads all single-word entries of at most <c>_maxWordLength</c> letters from the database, ordered by
/// value and excluding the word ids returned for "BY" and "NAVN", and passes every value that consists of
/// letters only to <c>AddWord</c>. In debug builds the elapsed time is logged.
/// </summary>
/// <param name="db">Database context to read the words from.</param>
private void ReadWordsIntoDatabase(WordHintDbContext db)
{
#if DEBUG
    // time the whole read
    Stopwatch stopwatch = Stopwatch.StartNew();
#endif

    var wordIdsToExclude = WordDatabaseService.GetWordIdList(db, new List<string> { "BY", "NAVN" });

    // search for all words
    // (sorting with a specific collation, or plain ADO.NET, would require raw SQL instead)
    var wordValues =
        (from w in db.Words
         where (w.NumberOfWords == 1) && (w.NumberOfLetters <= _maxWordLength) && !wordIdsToExclude.Contains(w.WordId)
         orderby w.Value
         select w.Value)
        .AsNoTracking();

    foreach (var wordValue in wordValues)
    {
        string wordText = wordValue;

        // only keep values made up of letters
        if (!wordText.All(char.IsLetter))
        {
            continue;
        }

        AddWord(wordText);
    }

#if DEBUG
    stopwatch.Stop();

    // write the result to the logger when there is one, otherwise to the console
    if (_logger == null)
    {
        Console.WriteLine("ReadWordsIntoDatabase - Time elapsed: {0}", stopwatch.Elapsed);
    }
    else
    {
        _logger.LogDebug("ReadWordsIntoDatabase - Time elapsed: {0}", stopwatch.Elapsed);
    }
#endif
}
/// <summary>
/// Streams the json document at <c>JSON_URL</c> and stores every <c>"word": [ "related", ... ]</c> entry
/// in the database. When <paramref name="lastWord"/> is given, entries are skipped until the entry whose
/// upper-cased name equals it; storing resumes with that entry.
/// </summary>
/// <param name="db">Database context used for state and words.</param>
/// <param name="adminUser">User the stored words are added for.</param>
/// <param name="lastWord">Upper-cased word to resume from, or <c>null</c> to store every entry.</param>
private void ReadWordsFromUrl(WordHintDbContext db, User adminUser, string lastWord)
{
    // NOTE(review): hard-coded total, only used for the progress output
    const int totalCount = 25000;

    using (WebClient client = new WebClient())
    using (Stream stream = client.OpenRead(JSON_URL))
    using (StreamReader streamReader = new StreamReader(stream))
    using (JsonTextReader reader = new JsonTextReader(streamReader))
    {
        reader.SupportMultipleContent = true;

        string currentValue = null;
        List<string> currentList = null;
        int count = 0;

        // without a last word there is nothing to skip
        bool hasFound = lastWord == null;

        // reading the stream token by token took approx the same time as parsing the whole document at once
        while (reader.Read())
        {
            switch (reader.TokenType)
            {
                // "text": - the name of the current entry
                case JsonToken.PropertyName:
                    currentValue = reader.Value.ToString();
                    break;

                // "[" - start collecting the related words
                case JsonToken.StartArray:
                    currentList = new List<string>();
                    break;

                // "text" - a related word; strings outside of an array are ignored
                case JsonToken.String:
                    if (currentList != null)
                    {
                        currentList.Add(reader.Value.ToString());
                    }
                    break;

                // "]" - the entry is complete
                case JsonToken.EndArray:
                    count++;

                    // an array without a property name cannot be stored, so it is only counted
                    if (currentValue != null && currentList != null)
                    {
                        // skip until we reach last word beginning
                        if (!hasFound && currentValue.ToUpperInvariant().Equals(lastWord))
                        {
                            hasFound = true;
                        }

                        // store to database
                        if (hasFound)
                        {
                            // update that we are processing this word, ignore length and comment
                            WordDatabaseService.UpdateState(db, source, new Word() { Value = currentValue.ToUpper(), Source = source, CreatedDate = DateTime.Now }, writer, true);

                            // disable storing state since we are doing it manually above
                            WordDatabaseService.AddToDatabase(db, source, adminUser, currentValue, currentList, writer, false);

                            if ((count % 10) == 0 && writer != null)
                            {
                                writer.WriteLine("[{0}] / [{1}]", count, totalCount);
                            }
                        }
                    }

                    // and reset
                    currentList = null;
                    currentValue = null;
                    break;

                // "}" - reset
                case JsonToken.EndObject:
                    currentList = null;
                    currentValue = null;
                    break;
            }
        }
    }
}