示例#1
0
        /// <summary>
        /// Stores the related words found on the supplied page and keeps following
        /// "next page" results until no further page is reported.
        /// </summary>
        /// <param name="word">Word whose related words are being collected.</param>
        /// <param name="driver">Web driver handed to <c>NavigateToNextPageIfExist</c>.</param>
        /// <param name="db">Database context the related words are written to.</param>
        /// <param name="adminUser">User passed on to the related-word parser.</param>
        /// <param name="page">Current page number; it is reported in the log as <c>page + 1</c>.</param>
        /// <param name="documentNode">Parsed HTML of the page to start with.</param>
        /// <param name="url">Url of the current page; it is only reassigned here, never read.</param>
        private void ProcessSynonymsUntilEmpty(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, int page, HtmlNode documentNode, string url)
        {
            bool morePages;

            do
            {
                int displayPage = page + 1;
                Log.Information("Processing synonym search for '{0}' on page {1}", word.Value, displayPage);
                writer.WriteLine("Processing synonym search for '{0}' on page {1}", word.Value, displayPage);

                // parse the related words of the current page and store them (the trailing
                // 'false' is the "don't update state" flag of AddToDatabase)
                var wordsOnPage = ReadRelatedWordsAgilityPack(documentNode, adminUser);
                WordDatabaseService.AddToDatabase(db, this.source, word, wordsOnPage, writer, false);

                // Note! moving to the next page only works if we are logged in
                var(found, nextPageNumber, nextPageUrl, nextPageNode) = NavigateToNextPageIfExist(driver, documentNode);
                morePages = found;

                if (morePages)
                {
                    documentNode = nextPageNode;
                    page         = nextPageNumber;
                    url          = nextPageUrl;
                }
            }
            while (morePages);
        }
        /// <summary>
        /// Opens the word index page at <paramref name="url"/>, parses the words listed on it and
        /// fetches the synonyms of each word, optionally skipping ahead to <paramref name="lastWord"/>.
        /// </summary>
        /// <param name="wordPrefix">Search prefix; used in the log output and stored as the state comment.</param>
        /// <param name="url">Address of the word index page.</param>
        /// <param name="driver">Web driver used for navigation.</param>
        /// <param name="db">Database context used for state updates and storage.</param>
        /// <param name="adminUser">User passed on to the word parser and the synonym reader.</param>
        /// <param name="lastWord">Word to resume at; when null no word is skipped.</param>
        private void ReadWordsByWordUrl(string wordPrefix, string url, IWebDriver driver, WordHintDbContext db, User adminUser, string lastWord)
        {
            try
            {
                driver.Navigate().GoToUrl(url);
            }
            catch (System.Exception)
            {
                // every navigation failure is reported as a timeout and ends the processing of this url
                writer.WriteLine("Timeout navigating to '{0}'", url);
                return;
            }

            Log.Information("Processing word search for '{0}'", wordPrefix);
            writer.WriteLine("Processing word search for '{0}'", wordPrefix);

            // snapshot the page as a HtmlNode and let the agility pack parser extract the words
            HtmlNode pageNode    = driver.GetDocumentNode();
            var      parsedWords = ParseWordsAgilityPack(pageNode, adminUser);

            // stays true until the resume word has been reached
            bool stillSkipping = true;

            foreach (var entry in parsedWords)
            {
                var currentWord = entry.Item1;
                var synonymUrl  = entry.Item2;
                var currentText = currentWord.Value;

                bool isResumePoint = lastWord == null || lastWord == currentText;
                if (stillSkipping && !isResumePoint)
                {
                    Log.Information("Skipping alphabetic word '{0}' until we find '{1}'", currentText, lastWord);
                    writer.WriteLine("Skipping alphabetic word '{0}' until we find '{1}'", currentText, lastWord);
                    continue;
                }

                // once one word has been processed, nothing after it is skipped
                stillSkipping = false;

                // record that this word is the one being processed
                var stateWord = new Word()
                {
                    Value = currentText, Comment = wordPrefix, CreatedDate = DateTime.Now
                };
                WordDatabaseService.UpdateState(db, source, stateWord, writer, true);

                GetWordSynonyms(currentWord, driver, db, adminUser, synonymUrl);
            }
        }
示例#3
0
        /// <summary>
        /// Opens a database context, makes sure the admin user exists, switches the change tracker
        /// to its fast settings and starts reading words via <c>ReadWordsFromUrl</c>.
        /// </summary>
        /// <param name="source">Source name used to look up the last stored word.</param>
        /// <param name="doContinueWithLastWord">When true, the last stored word of the source is looked up and passed on.</param>
        private void DoScrape(string source, bool doContinueWithLastWord)
        {
            var contextFactory = new DesignTimeDbContextFactory();

            using (var db = contextFactory.CreateDbContext(connectionString, Log.Logger))
            {
                string resumeWord = doContinueWithLastWord
                    ? WordDatabaseService.GetLastWordFromSource(db, source)
                    : null;

                // Note!
                // the user has to be stored before tracking and AutoDetectChanges are disabled,
                // otherwise this will crash
                var candidate = new User
                {
                    FirstName = "",
                    LastName  = "Admin",
                    UserName  = "******"
                };

                // reuse an already stored user when there is one
                // NOTE(review): the lookup matches on the (empty) FirstName only - confirm this is intended
                var adminUser = db.DictionaryUsers.FirstOrDefault(u => u.FirstName == candidate.FirstName);
                if (adminUser == null)
                {
                    adminUser = candidate;
                    db.DictionaryUsers.Add(adminUser);
                    db.SaveChanges();
                }

                // disable tracking to speed things up
                // note that this doesn't load the virtual properties, but loads the object ids after a save
                db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                // this doesn't seem to work when adding new users all the time
                db.ChangeTracker.AutoDetectChangesEnabled = false;

                ReadWordsFromUrl(db, adminUser, resumeWord);
            }
        }
示例#4
0
        /// <summary>
        /// Opens a database context and a Chrome driver and reads all words with
        /// <paramref name="letterCount"/> letters, resuming at the last stored word or at a start pattern.
        /// </summary>
        /// <param name="letterCount">Number of letters of the words to read.</param>
        /// <param name="source">Source name used to look up the last stored word.</param>
        /// <param name="doContinueWithLastWord">When true, the last stored word with this letter count is looked up.</param>
        private void DoScrape(int letterCount, string source, bool doContinueWithLastWord)
        {
            var contextFactory = new DesignTimeDbContextFactory();

            using (var db = contextFactory.CreateDbContext(connectionString, Log.Logger))
            {
                string resumeWord = null;
                if (doContinueWithLastWord)
                {
                    resumeWord = WordDatabaseService.GetLastWordFromLetterCount(db, source, letterCount);
                }

                // without a stored word, fall back to a start pattern of the requested length
                if (resumeWord == null)
                {
                    if (letterCount == 1)
                    {
                        resumeWord = "a";
                    }
                    else if (letterCount == 2)
                    {
                        resumeWord = "aa";
                    }
                    else
                    {
                        // NOTE(review): a letterCount below 1 also ends up here and makes the
                        // string constructor receive a negative length
                        resumeWord = "aa" + new string('?', letterCount - 2);
                    }

                    Log.Information("Could not find any words having '{0}' letters. Therefore using last word pattern '{1}'.", letterCount, resumeWord);
                }

                // Note!
                // the user has to be stored before tracking and AutoDetectChanges are disabled,
                // otherwise this will crash
                var candidate = new User
                {
                    FirstName = "",
                    LastName  = "Admin",
                    UserName  = "******"
                };

                // reuse an already stored user when there is one
                // NOTE(review): the lookup matches on the (empty) FirstName only - confirm this is intended
                var adminUser = db.DictionaryUsers.FirstOrDefault(u => u.FirstName == candidate.FirstName);
                if (adminUser == null)
                {
                    adminUser = candidate;
                    db.DictionaryUsers.Add(adminUser);
                    db.SaveChanges();
                }

                // disable tracking to speed things up
                // note that this doesn't load the virtual properties, but loads the object ids after a save
                db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                // this doesn't seem to work when adding new users all the time
                db.ChangeTracker.AutoDetectChangesEnabled = false;

                using (var driver = ChromeDriverUtils.GetChromeDriver(true))
                {
                    // read all words with the letter count
                    ReadWordsByWordPermutations(letterCount, driver, db, adminUser, resumeWord);
                }
            }
        }
示例#5
0
        /// <summary>
        /// Opens <paramref name="url"/> in a new browser tab, reads the synonyms listed there and
        /// stores them as related words of <paramref name="word"/>. The tab is closed and the driver
        /// is switched back to the original tab afterwards, also when reading or storing fails.
        /// </summary>
        /// <param name="word">Word whose synonyms are read; a word with the value "0" is ignored.</param>
        /// <param name="driver">Web driver; it is cast to <c>ChromeDriver</c> for the tab handling.</param>
        /// <param name="db">Database context the related words are written to.</param>
        /// <param name="adminUser">User assigned to every created related word.</param>
        /// <param name="url">Address of the synonym page.</param>
        private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, string url)
        {
            // there is a bug in the website that makes a query with "0" fail
            if (word.Value == "0")
            {
                return;
            }

            // open a new tab and set the context
            var chromeDriver = (ChromeDriver)driver;

            // save a reference to our original tab's window handle
            var originalTabInstance = chromeDriver.CurrentWindowHandle;

            // execute some JavaScript to open a new window
            chromeDriver.ExecuteScript("window.open();");

            // save a reference to our new tab's window handle, this would be the last entry in the WindowHandles collection
            var newTabInstance = chromeDriver.WindowHandles[driver.WindowHandles.Count - 1];

            // switch our WebDriver to the new tab's window handle
            chromeDriver.SwitchTo().Window(newTabInstance);

            // from here on the driver is focused on the new tab, so the clean-up has to run
            // no matter what happens - otherwise a failure leaves an orphaned tab behind and
            // the caller continues on the wrong tab
            try
            {
                // lets navigate to a web site in our new tab
                driver.Navigate().GoToUrl(url);

                Log.Information("Processing synonym search for '{0}'", word.Value);
                writer.WriteLine("Processing synonym search for '{0}'", word.Value);

                // parse all synonyms
                IList <IWebElement> listElements = driver.FindElements(By.XPath("//div[@id='wordlist']/ul[@class='word']/li"));

                var relatedWords = new List <Word>();

                foreach (IWebElement listElement in listElements)
                {
                    IWebElement ahref;
                    try
                    {
                        ahref = listElement.FindElement(By.TagName("a"));
                    }
                    catch (NoSuchElementException)
                    {
                        // the first list entry without a link ends the parsing
                        break;
                    }

                    var hintText = ahref.Text;

                    var hint = new Word
                    {
                        Language        = "no",
                        Value           = hintText,
                        NumberOfLetters = hintText.Count(c => c != ' '),
                        NumberOfWords   = ScraperUtils.CountNumberOfWords(hintText),
                        User            = adminUser,
                        CreatedDate     = DateTime.Now,
                        Source          = this.source
                    };

                    relatedWords.Add(hint);
                }

                relatedWords = relatedWords.Distinct().ToList(); // Note that this requires the object to implement IEquatable<Word>

                // and add to database
                WordDatabaseService.AddToDatabase(db, this.source, word, relatedWords, writer);
            }
            finally
            {
                // now lets close our new tab
                chromeDriver.ExecuteScript("window.close();");

                // and switch our WebDriver back to the original tab's window handle
                chromeDriver.SwitchTo().Window(originalTabInstance);

                // and have our WebDriver focus on the main document in the page to send commands to
                chromeDriver.SwitchTo().DefaultContent();
            }
        }
        /// <summary>
        /// Builds the list of alphabetic search patterns (all two-letter combinations of the alphabet
        /// plus a few symbols and digits) and reads the word page of every pattern that falls into the
        /// index slice selected by <paramref name="startLetterCount"/> and <paramref name="endLetterCount"/>.
        /// </summary>
        /// <param name="startLetterCount">One-based number of the slice to process.</param>
        /// <param name="endLetterCount">Number of slices the pattern list is divided into.</param>
        /// <param name="driver">Web driver passed on to <c>ReadWordsByWordUrl</c>.</param>
        /// <param name="db">Database context used for look-ups and storage.</param>
        /// <param name="adminUser">User passed on to <c>ReadWordsByWordUrl</c>.</param>
        /// <param name="doContinueWithLastWord">When true, the last word stored for a pattern is looked up and passed on.</param>
        private void ReadWordsByWordPermutations(int startLetterCount, int endLetterCount, IWebDriver driver, WordHintDbContext db, User adminUser, bool doContinueWithLastWord)
        {
            const string summaryTemplate = "Processing alphabetic permutation search using {0}-{1} = {2}-{3} ({4} - {5}) ";

            var alphabet        = "abcdefghijklmnopqrstuvwxyzåæøö";
            int permutationSize = 2;

            // start with the single letters and append one more letter per round
            var combinations = alphabet.Select(letter => letter.ToString());
            for (int round = 1; round < permutationSize; round++)
            {
                combinations = combinations.SelectMany(prefix => alphabet, (prefix, letter) => prefix + letter);
            }

            var wordPermutationList = combinations.ToList();

            // the single-character patterns follow after the letter combinations
            foreach (char symbol in "&()+,-0123456789")
            {
                wordPermutationList.Add(symbol.ToString());
            }

            // use the letter count a little bit different when it comes to the alphabetic index:
            // letterCount is the index to start with divided out on the total alphabetic index
            // e.g.
            // if letter count is between 1 - 4 of a total index length of 1000:
            // 1 is 1
            // 2 is 250
            // 3 is 500
            // 4 is 750
            int    length      = wordPermutationList.Count;
            double sliceSize   = (double)length / (double)endLetterCount;
            int    startIndex  = (int)(sliceSize * (startLetterCount - 1));
            int    endIndex    = (int)((sliceSize * startLetterCount) - 1);
            var    startString = wordPermutationList[startIndex];
            var    endString   = wordPermutationList[endIndex];

            Log.Information(summaryTemplate, startLetterCount, endLetterCount, startIndex, endIndex, startString, endString);
            writer.WriteLine(summaryTemplate, startLetterCount, endLetterCount, startIndex, endIndex, startString, endString);

            // add some extra status information to the writer
            var signalRWriter = this.writer as SignalRClientWriter;
            if (signalRWriter != null)
            {
                signalRWriter.ExtraStatusInformation = string.Format(summaryTemplate, startLetterCount, endLetterCount, startIndex, endIndex, startString, endString);
            }

            for (int position = 0; position < wordPermutationList.Count; position++)
            {
                var wordPermutation = wordPermutationList[position];
                int curIndex        = position + 1; // one-based position in the pattern list

                // single characters with a code below 45 are written as %XX in the url
                bool   isEscaped   = wordPermutation.Length == 1 && wordPermutation[0] < 45;
                string wordPattern = isEscaped ? string.Format("%{0:X}", (int)wordPermutation[0]) : wordPermutation;

                if (curIndex < startIndex + 1)
                {
                    Log.Information("Skipping pattern '{0}' until we reach index {1}: '{2}'. [{3}/{4}]", wordPattern, startIndex, startString, curIndex, length);
                    writer.WriteLine("Skipping pattern '{0}' until we reach index {1}: '{2}'. [{3}/{4}]", wordPattern, startIndex, startString, curIndex, length);
                    continue;
                }

                // stop at last index except very last character
                if (length != curIndex && curIndex == endIndex + 1)
                {
                    // reached the end - quit
                    Log.Information("Quitting because we have reached the last index to process: {0} at index {1}.", wordPattern, curIndex);
                    writer.WriteLine("Quitting because we have reached the last index to process: {0} at index {1}.", wordPattern, curIndex);
                    break;
                }

                string lastWordString = doContinueWithLastWord
                    ? WordDatabaseService.GetLastWordFromComment(db, source, wordPattern)
                    : null;

                var href = $"https://www.gratiskryssord.no/kryssordbok/alfabetisk/{wordPattern}/";
#if DEBUG
                // debug builds always process the fixed pattern below instead of the current one
                wordPattern    = "na";
                href           = $"https://www.gratiskryssord.no/kryssordbok/alfabetisk/{wordPattern}/";
                lastWordString = "NAVN";
#endif
                ReadWordsByWordUrl(wordPattern, href, driver, db, adminUser, lastWordString);
            }
        }
        /// <summary>
        /// Opens the synonym pages of <paramref name="word"/> in a new browser tab, stores the synonyms
        /// of every page and follows the "next page" links until none is found. The tab is closed and
        /// the driver is switched back to the original tab afterwards, also when a page fails.
        /// </summary>
        /// <param name="word">Word whose synonyms are read; a word with the value "0" is ignored.</param>
        /// <param name="driver">Web driver; it is cast to <c>ChromeDriver</c> for the tab handling.</param>
        /// <param name="db">Database context the related words are written to.</param>
        /// <param name="adminUser">User passed on to the synonym parser.</param>
        /// <param name="url">Base address of the word; the pages are addressed as <c>{url}side/{page}/</c>.</param>
        private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, string url)
        {
            // there is a bug in the website that makes a query with "0" fail
            if (word.Value == "0")
            {
                return;
            }

            // open a new tab and set the context
            var chromeDriver = (ChromeDriver)driver;

            // save a reference to our original tab's window handle
            var originalTabInstance = chromeDriver.CurrentWindowHandle;

            // execute some JavaScript to open a new window
            chromeDriver.ExecuteScript("window.open();");

            // save a reference to our new tab's window handle, this would be the last entry in the WindowHandles collection
            var newTabInstance = chromeDriver.WindowHandles[driver.WindowHandles.Count - 1];

            // switch our WebDriver to the new tab's window handle
            chromeDriver.SwitchTo().Window(newTabInstance);

            // from here on the driver is focused on the new tab, so the clean-up has to run
            // no matter what happens - otherwise a failure leaves an orphaned tab behind and
            // the caller continues on the wrong tab
            try
            {
                // lets navigate to a web site in our new tab
                // https://www.gratiskryssord.no/kryssordbok/navn/side/1/
                var page    = 1;
                var pageUrl = $"{url}side/{page}/";

                driver.Navigate().GoToUrl(pageUrl);
                while (true)
                {
                    Log.Information("Processing synonym search for '{0}' on page {1}", word.Value, page);
                    writer.WriteLine("Processing synonym search for '{0}' on page {1}", word.Value, page);

                    // read the whole document into a HtmlNode
                    HtmlNode doc = driver.GetDocumentNode();

                    // and parse synonyms using Agility Pack
                    var relatedWords = ParseSynonymsAgilityPack(word, doc, adminUser);

                    // and add to database
                    WordDatabaseService.AddToDatabase(db, this.source, word, relatedWords, writer, false);

                    // go to next page if exist
                    var nextPageElement = FindNextPageOrNull(doc, word.Value.ToLower(), page + 1);
                    if (nextPageElement == null)
                    {
                        break;
                    }

                    // the link holds a site-relative address
                    var    href        = nextPageElement.Attributes["href"].Value;
                    string nextPageUrl = $"https://www.gratiskryssord.no{href}";

                    page++;
                    driver.Navigate().GoToUrl(nextPageUrl);
                }
            }
            finally
            {
                // now lets close our new tab
                chromeDriver.ExecuteScript("window.close();");

                // and switch our WebDriver back to the original tab's window handle
                chromeDriver.SwitchTo().Window(originalTabInstance);

                // and have our WebDriver focus on the main document in the page to send commands to
                chromeDriver.SwitchTo().DefaultContent();
            }
        }
示例#8
0
        /// <summary>
        /// Command line entry point: loads a crossword layout and a dictionary and then, depending on
        /// the output argument, generates crosswords for the SignalR hub ("signalr"), imports a json
        /// synonym file into the database ("database") or writes the first solution found to a file.
        /// </summary>
        /// <param name="args">Command line arguments, interpreted by <c>ParseInput</c>.</param>
        /// <returns>
        /// 0 on success, 1 when the arguments could not be parsed, 2 when the layout could not be loaded,
        /// 3 when the dictionary could not be loaded, 4 when generating failed, 5 when no solution was
        /// found and 6 when the result could not be saved.
        /// </returns>
        public static int Main(string[] args)
        {
            Console.WriteLine("CrossWord ver. {0} ", "1.0");

            string inputFile, outputFile, puzzle, dictionaryFile;

            if (!ParseInput(args, out inputFile, out outputFile, out puzzle, out dictionaryFile))
            {
                return(1);
            }
            ICrossBoard board;

            try
            {
                // ordinal comparison: this is a protocol check, not a linguistic one
                if (inputFile.StartsWith("http", StringComparison.Ordinal))
                {
                    board = CrossBoardCreator.CreateFromUrl(inputFile);
                }
                else
                {
                    board = CrossBoardCreator.CreateFromFile(inputFile);
                }
            }
            catch (Exception e)
            {
                // the file name is passed as a format argument (so braces in it are harmless)
                // and the exception is written on its own line so that the cause is not lost
                Console.WriteLine("Cannot load crossword layout from file {0}.", inputFile);
                Console.WriteLine(e);
                return(2);
            }

            ICrossDictionary dictionary;

            try
            {
                if (dictionaryFile.Equals("database"))
                {
                    // NOTE(review): hard-coded connection string including credentials - consider moving it to configuration
                    dictionary = new DatabaseDictionary("server=localhost;port=3306;database=dictionary;user=user;password=password;charset=utf8;", board.MaxWordLength);
                }
                else
                {
                    dictionary = new Dictionary(dictionaryFile, board.MaxWordLength);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Cannot load dictionary from file {0}.", dictionaryFile);
                Console.WriteLine(e);
                return(3);
            }

            if (outputFile.Equals("signalr"))
            {
                // generate and send to signalr hub
                // var tokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(20));
                using (var tokenSource = new CancellationTokenSource())
                {
                    Task workerTask = Task.Run(
                        async() =>
                    {
                        CancellationToken token = tokenSource.Token;
                        try
                        {
                            await Generator.GenerateCrosswordsAsync(board, dictionary, puzzle, token);
                        }
                        catch (OperationCanceledException)
                        {
                            Console.WriteLine("Cancelled @ {0}", DateTime.Now);
                        }
                    });

                    // wait until the task is done (the token source is only disposed afterwards)
                    Task.WaitAll(workerTask);

                    // or wait until the user presses a key
                    // Console.WriteLine("Press Enter to Exit ...");
                    // Console.ReadLine();
                    // tokenSource.Cancel();
                }
            }
            else if (outputFile.Equals("database"))
            {
                var dbContextFactory = new DesignTimeDbContextFactory();

                // NOTE(review): hard-coded connection string including credentials - consider moving it to configuration
                using (var db = dbContextFactory.CreateDbContext("server=localhost;database=dictionary;user=user;password=password;charset=utf8;", Log.Logger)) // null instead of Log.Logger enables debugging
                {
                    // setup database
                    // You would either call EnsureCreated() or Migrate().
                    // EnsureCreated() is an alternative that completely skips the migrations pipeline and just creates a database that matches you current model.
                    // It's good for unit testing or very early prototyping, when you are happy just to delete and re-create the database when the model changes.
                    // db.Database.EnsureDeleted();
                    // db.Database.EnsureCreated();

                    // Note! Therefore don't use EnsureDeleted() and EnsureCreated() but Migrate();
                    db.Database.Migrate();

                    // set admin user
                    var user = new User()
                    {
                        FirstName = "",
                        LastName  = "Norwegian Synonyms json",
                        UserName  = "******"
                    };

                    // check if user already exists
                    // NOTE(review): the lookup matches on the (empty) FirstName only - confirm this is intended
                    var existingUser = db.DictionaryUsers.Where(u => u.FirstName == user.FirstName).FirstOrDefault();
                    if (existingUser != null)
                    {
                        user = existingUser;
                    }
                    else
                    {
                        db.DictionaryUsers.Add(user);
                        db.SaveChanges();
                    }

                    // disable tracking to speed things up
                    // note that this doesn't load the virtual properties, but loads the object ids after a save
                    db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                    // this works when using the same user for all words.
                    db.ChangeTracker.AutoDetectChangesEnabled = false;

                    bool isDebugging = false;
#if DEBUG
                    isDebugging = true;
#endif

                    var source = "norwegian-synonyms.json";

                    // compare the extension without going through a culture-sensitive ToLower()
                    if (string.Equals(Path.GetExtension(dictionaryFile), ".json", StringComparison.OrdinalIgnoreCase))
                    {
                        // read json files
                        using (StreamReader r = new StreamReader(dictionaryFile))
                        {
                            var json = r.ReadToEnd();
                            var jobj = JObject.Parse(json);

                            var totalCount = jobj.Properties().Count();
                            int count      = 0;
                            foreach (var item in jobj.Properties())
                            {
                                count++;

                                // the property name is the word, its values are the related words
                                var wordText     = item.Name;
                                var relatedArray = item.Values().Select(a => a.Value <string>());

                                WordDatabaseService.AddToDatabase(db, source, user, wordText, relatedArray);

                                if (isDebugging)
                                {
                                    // in debug mode the Console.Write \r isn't shown in the output console
                                    Console.WriteLine("[{0}] / [{1}]", count, totalCount);
                                }
                                else
                                {
                                    Console.Write("\r[{0}] / [{1}]", count, totalCount);
                                }
                            }
                            Console.WriteLine("Done!");
                        }
                    }
                }
            }
            else
            {
                ICrossBoard resultBoard;
                try
                {
                    resultBoard = puzzle != null
                        ? GenerateFirstCrossWord(board, dictionary, puzzle)
                        : GenerateFirstCrossWord(board, dictionary);
                }
                catch (Exception e)
                {
                    Console.WriteLine("Generating crossword has failed.");
                    Console.WriteLine(e);
                    return(4);
                }
                if (resultBoard == null)
                {
                    Console.WriteLine("No solution has been found.");
                    return(5);
                }
                try
                {
                    SaveResultToFile(outputFile, resultBoard, dictionary);
                }
                catch (Exception e)
                {
                    Console.WriteLine("Saving result crossword to file {0} has failed.", outputFile);
                    Console.WriteLine(e);
                    return(6);
                }
            }
            return(0);
        }
示例#9
0
        /// <summary>
        /// Opens a database context, makes sure the admin user exists, logs on to the site and reads
        /// the words for every letter count from <paramref name="startLetterCount"/> up to, but not
        /// including, <paramref name="endLetterCount"/>.
        /// </summary>
        /// <param name="siteUsername">User name passed to <c>DoLogon</c>.</param>
        /// <param name="sitePassword">Password passed to <c>DoLogon</c>.</param>
        /// <param name="startLetterCount">First letter count to process.</param>
        /// <param name="endLetterCount">Letter count at which processing stops (exclusive).</param>
        /// <param name="source">Source name used to look up the last stored word.</param>
        /// <param name="doContinueWithLastWord">When true, the last stored word per letter count is looked up and passed on.</param>
        /// <param name="isScraperSwarm">When true, only the first letter count is processed.</param>
        private void DoScrape(string siteUsername, string sitePassword, int startLetterCount, int endLetterCount, string source, bool doContinueWithLastWord, bool isScraperSwarm)
        {
            var contextFactory = new DesignTimeDbContextFactory();

            using (var db = contextFactory.CreateDbContext(connectionString, Log.Logger))
            {
                // Note!
                // the user has to be stored before tracking and AutoDetectChanges are disabled,
                // otherwise this will crash
                var candidate = new User
                {
                    FirstName = "",
                    LastName  = "Admin",
                    UserName  = "******"
                };

                // reuse an already stored user when there is one
                // NOTE(review): the lookup matches on the (empty) FirstName only - confirm this is intended
                var adminUser = db.DictionaryUsers.FirstOrDefault(u => u.FirstName == candidate.FirstName);
                if (adminUser == null)
                {
                    adminUser = candidate;
                    db.DictionaryUsers.Add(adminUser);
                    db.SaveChanges();
                }

                // disable tracking to speed things up
                // note that this doesn't load the virtual properties, but loads the object ids after a save
                db.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

                // this doesn't seem to work when adding new users all the time
                db.ChangeTracker.AutoDetectChangesEnabled = false;

                // Debugging notes:
                // some patterns give back a word with one less character than asked for - it seems the Ø is messing their system up
                // UTF8 two byte problem?
                // TROND?K?????         gives TROND KJØLL
                // VEBJØRN?B????        gives VEBJØRN BERG
                // WILLY?R????????      gives WILLY RØGEBERG
                // THORBJØRN?H???????   gives THORBJØRN HÅRSTAD

                using (var driver = ChromeDriverUtils.GetChromeDriver(true))
                {
                    DoLogon(driver, siteUsername, sitePassword);

                    int letterCount = startLetterCount;
                    while (letterCount < endLetterCount)
                    {
                        // reset global variables
                        hasFoundPattern   = false; // this is the first stage, we match the pattern
                        hasFoundLastWord  = false; // this is the second stage, we not only match the pattern but the word as well
                        hasMissedLastWord = false;

                        string resumeWord = null;
                        if (doContinueWithLastWord)
                        {
                            resumeWord = WordDatabaseService.GetLastWordFromLetterCount(db, source, letterCount);
                        }

                        // without a word to resume at, nothing has to be skipped
                        if (resumeWord == null)
                        {
                            hasFoundLastWord = true;
                        }

                        // added break to support several docker instances scraping in swarms
                        bool isPastFirstCount = letterCount > startLetterCount;
                        if (isScraperSwarm && isPastFirstCount)
                        {
                            Log.Error("Warning! Quitting since the current letter length > letter count: {0} / {1}", letterCount, startLetterCount);
                            break;
                        }

                        ReadWordsByWordPermutations(letterCount, driver, db, adminUser, resumeWord);

                        letterCount++;
                    }
                }
            }
        }
示例#10
0
        /// <summary>
        /// Processes the result pages of a pattern search one after another, starting with the already
        /// loaded <paramref name="documentNode"/>, until <c>NavigateToNextPageIfExist</c> reports no further page.
        /// </summary>
        /// <remarks>
        /// The method reads and writes the resume flags <c>hasFoundPattern</c>, <c>hasFoundLastWord</c> and
        /// <c>hasMissedLastWord</c>. Words are only stored and looked up once <c>hasFoundLastWord</c> is set.
        /// If the pattern stops matching the last-word before that word was seen, <c>hasMissedLastWord</c> is
        /// set and the method returns immediately.
        /// </remarks>
        /// <param name="wordPattern">Pattern being searched, including the last-word used for resuming.</param>
        /// <param name="driver">Web driver passed on to the page navigation and the synonym lookup.</param>
        /// <param name="db">Database context passed on to <c>WordDatabaseService</c> and the synonym lookup.</param>
        /// <param name="adminUser">User passed on to the word parser and the synonym lookup.</param>
        /// <param name="page">Zero-based page number, logged as <c>page + 1</c>.</param>
        /// <param name="documentNode">Parsed document of the current result page.</param>
        /// <param name="url">Url of the current page; only reassigned when moving to the next page.</param>
        private void ProcessWordsUntilEmpty(WordPattern wordPattern, IWebDriver driver, WordHintDbContext db, User adminUser, int page, HtmlNode documentNode, string url)
        {
            bool morePages = true;
            while (morePages)
            {
                Log.Information("Processing pattern search for '{0}' on page {1}", wordPattern.Pattern, page + 1);
                writer.WriteLine("Processing pattern search for '{0}' on page {1}", wordPattern.Pattern, page + 1);

                // walk through every word parsed from the current page
                foreach (var word in ReadWordsAgilityPack(documentNode, adminUser))
                {
                    if (!wordPattern.IsMatchLastWord)
                    {
                        // the pattern matched earlier but the last-word never showed up - has it been deleted?
                        if (hasFoundPattern && !hasFoundLastWord)
                        {
                            Log.Error("Warning! The current pattern does not any longer match the last-word: {0} = {1}. Current word: {2}", wordPattern.Pattern, wordPattern.LastWord, word.Value);
                            writer.WriteLine("Warning! The current pattern does not any longer match the last-word: {0} = {1}. Current word: {2}", wordPattern.Pattern, wordPattern.LastWord, word.Value);
                            hasMissedLastWord = true;
                            return;
                        }
                    }
                    else
                    {
                        Log.Information("The current pattern matches the last-word: {0} = {1}. Current word: {2}", wordPattern.Pattern, wordPattern.LastWord, word.Value);
                        hasFoundPattern = true;

                        var normalizedWord = word.Value.RemoveDiacriticsToNorwegian();

                        // the last-word may carry trailing question marks that were added to work around
                        // the length bug at the site, so they are stripped before comparing
                        var expectedWord = wordPattern.LastWord.TrimEnd('?');
                        if (normalizedWord == expectedWord)
                        {
                            Log.Information("The current word matches the last-word: {0} = {1}", word.Value, wordPattern.LastWord);
                            hasFoundLastWord = true;
                        }
                    }

                    // still skipping towards the last-word
                    if (!hasFoundLastWord)
                    {
                        continue;
                    }

                    string stateValue   = word.Value;
                    var    patternWidth = wordPattern.Length;

                    // some words returned by the site don't have the length the pattern asked for;
                    // the stored state value is padded with '?' or cut so that it gets the pattern's length
                    if (patternWidth != stateValue.Length)
                    {
                        Log.Error("Warning! The current word doesn't match the length of the query pattern: {0} = {1}", word.Value, wordPattern.Pattern);
                        writer.WriteLine("Warning! The current word doesn't match the length of the query pattern: {0} = {1}", word.Value, wordPattern.Pattern);

                        stateValue = patternWidth > stateValue.Length
                            ? stateValue.PadRight(patternWidth, '?')
                            : stateValue.Substring(0, patternWidth);
                    }

                    // record that this word is being processed, ignore length and comment
                    var stateWord = new Word()
                    {
                        Value       = stateValue.ToUpper(),
                        Source      = source,
                        CreatedDate = DateTime.Now
                    };
                    WordDatabaseService.UpdateState(db, source, stateWord, writer);

                    GetWordSynonyms(word, driver, db, adminUser);
                }

                // move on to the next page when there is one, otherwise leave the loop
                var(nextExists, nextPageNumber, nextUrl, nextNode) = NavigateToNextPageIfExist(driver, documentNode);
                morePages = nextExists;
                if (nextExists)
                {
                    url          = nextUrl;
                    page         = nextPageNumber;
                    documentNode = nextNode;
                }
            }
        }
示例#11
0
        /// <summary>
        /// Queries all single-word entries of at most <c>_maxWordLength</c> letters from <paramref name="db"/>,
        /// ordered by value, and passes every entry that consists of letters only to <c>AddWord</c>.
        /// In DEBUG builds the elapsed time is reported afterwards.
        /// </summary>
        /// <param name="db">Database context the words are read from.</param>
        private void ReadWordsIntoDatabase(WordHintDbContext db)
        {
#if DEBUG
            // time the whole load so the duration can be reported at the end
            Stopwatch timer = new Stopwatch();
            timer.Start();
#endif

            // entries whose id is returned by GetWordIdList for these values are left out of the query
            // (a longer list was used earlier: "BY", "NAVN", "ELV", "FJELL", "FORKORTELSE", "IATA-FLYPLASSKODE", "ISO-KODE")
            var excludedValues = new List <string> {
                "BY", "NAVN"
            };
            var excludedWordIds = WordDatabaseService.GetWordIdList(db, excludedValues);

            // Note: sorting with a specific collation (e.g. utf8mb4_da_0900_as_cs) would require raw SQL,
            // and plain ADO.NET seemed faster than ef core for such raw queries - neither is used here.
            var singleWords = db.Words
                              .Where(entry => (entry.NumberOfWords == 1) &&
                                     (entry.NumberOfLetters <= _maxWordLength) &&
                                     !excludedWordIds.Contains(entry.WordId))
                              .OrderBy(entry => entry.Value)
                              .Select(entry => entry.Value)
                              .AsNoTracking();

            foreach (var candidate in singleWords)
            {
                // only accept words made up of letters exclusively (so no '-' or ' ')
                if (!candidate.All(char.IsLetter))
                {
                    continue;
                }

                AddWord(candidate);
            }

#if DEBUG
            timer.Stop();

            // report through the logger when there is one, otherwise on the console
            if (_logger == null)
            {
                Console.WriteLine("ReadWordsIntoDatabase - Time elapsed: {0}", timer.Elapsed);
            }
            else
            {
                _logger.LogDebug("ReadWordsIntoDatabase - Time elapsed: {0}", timer.Elapsed);
            }
#endif
        }
示例#12
0
        /// <summary>
        /// Streams the JSON document at <c>JSON_URL</c> token by token and stores every property whose value
        /// is an array of strings: the property name is the word, the array items are its related words.
        /// </summary>
        /// <param name="db">Database context passed on to <c>WordDatabaseService</c>.</param>
        /// <param name="adminUser">User passed on to <c>WordDatabaseService.AddToDatabase</c>.</param>
        /// <param name="lastWord">
        /// When not null, entries are skipped until a property name whose invariant upper-case form equals
        /// this value is reached; that entry and everything after it is stored. When null, nothing is skipped.
        /// </param>
        private void ReadWordsFromUrl(WordHintDbContext db, User adminUser, string lastWord)
        {
            // hard-coded total that is only used in the progress output
            const int totalCount = 25000;

            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(JSON_URL))
            using (StreamReader streamReader = new StreamReader(stream))
            using (JsonTextReader reader = new JsonTextReader(streamReader))
            {
                reader.SupportMultipleContent = true;

                string        currentValue = null; // name of the property currently being read
                List <string> currentList  = null; // strings collected for the array currently being read
                int           count        = 0;    // number of arrays that have been closed so far
                bool          hasFound     = false;

                while (reader.Read())
                {
                    switch (reader.TokenType)
                    {
                        // a property name ("text":) starts a new word
                        case JsonToken.PropertyName:
                            currentValue = reader.Value.ToString();
                            break;

                        // "[" starts a fresh list of related words
                        case JsonToken.StartArray:
                            currentList = new List <string>();
                            break;

                        // a string inside an array is a related word; a string outside of any array
                        // has no list to go into and is ignored instead of causing a NullReferenceException
                        case JsonToken.String:
                            currentList?.Add(reader.Value.ToString());
                            break;

                        // "]" completes the current word together with its list
                        case JsonToken.EndArray:
                            count++;

                            // an array without a pending property name (or list) has nothing to store
                            if (currentValue != null && currentList != null)
                            {
                                // use the same invariant upper-casing for the resume comparison and for the
                                // stored state, so the two cannot disagree under culture-specific casing rules
                                string upperValue = currentValue.ToUpperInvariant();

                                // skip until we reach the last word; once found, everything after it is stored too
                                if (lastWord == null || upperValue.Equals(lastWord))
                                {
                                    hasFound = true;
                                }

                                if (hasFound)
                                {
                                    // update that we are processing this word, ignore length and comment
                                    WordDatabaseService.UpdateState(db, source, new Word()
                                    {
                                        Value = upperValue, Source = source, CreatedDate = DateTime.Now
                                    }, writer, true);

                                    // disable storing state since we are doing it manually above
                                    WordDatabaseService.AddToDatabase(db, source, adminUser, currentValue, currentList, writer, false);

                                    // report progress for every tenth array
                                    if ((count % 10) == 0 && writer != null)
                                    {
                                        writer.WriteLine("[{0}] / [{1}]", count, totalCount);
                                    }
                                }
                            }

                            // and reset
                            currentList  = null;
                            currentValue = null;
                            break;

                        // "}" ends the object, drop whatever is still pending
                        case JsonToken.EndObject:
                            currentList  = null;
                            currentValue = null;
                            break;
                    }
                }
            }

            // Note: reading the whole document at once (ReadToEnd + JObject.Parse) was tried as well
            // and took approximately the same time as this streaming version.
        }