Пример #1
0
        /// <summary>
        /// Builds a <see cref="Word"/> for every row of the results table found beneath <paramref name="node"/>.
        /// </summary>
        /// <param name="node">Node the results-table XPath is evaluated against.</param>
        /// <param name="adminUser">User assigned to every created word.</param>
        /// <returns>The words in row order; an empty list when the row query yields no result.</returns>
        private IList <Word> ReadWordsAgilityPack(HtmlNode node, User adminUser)
        {
            var wordListing = new List <Word>();

            var tableRows = node.FindNodes(By.XPath("/html/body//div[@class='results']/table/tbody/tr"));

            // ReadRelatedWordsAgilityPack treats the result of this same query as possibly null;
            // guard it here as well so a page without result rows yields an empty list instead of
            // an exception in the foreach below.
            if (tableRows == null)
            {
                return wordListing;
            }

            foreach (var row in tableRows)
            {
                var rowTD = row.FindNodes(By.TagName("td"));

                // cells 0, 3 and 4 are read as the word text, the user id and the date
                var wordText = rowTD[0].InnerText.Trim().ToUpper();
                var userId   = rowTD[3].InnerText.Trim();
                var date     = rowTD[4].InnerText.Trim();

                var word = new Word
                {
                    Language        = "no",
                    Value           = wordText,
                    NumberOfLetters = ScraperUtils.CountNumberOfLetters(wordText),
                    NumberOfWords   = ScraperUtils.CountNumberOfWords(wordText),
                    User            = adminUser,
                    CreatedDate     = ScraperUtils.ParseDateTimeOrNow(date, "yyyy-MM-dd"),
                    Source          = this.source,
                    Comment         = "User " + userId
                };

                wordListing.Add(word);
            }

            return wordListing;
        }
Пример #2
0
        /// <summary>
        /// Collects every link under the <c>staticPage</c> div whose href starts with
        /// <c>/kryssordbok/</c> and turns it into a <see cref="Word"/> paired with its absolute URL.
        /// </summary>
        /// <param name="doc">Node the link XPath is evaluated against.</param>
        /// <param name="adminUser">User assigned to every created word.</param>
        /// <returns>
        /// One (word, url) tuple per matching link, in document order; an empty list when the
        /// link query yields null.
        /// </returns>
        private List <(Word, string)> ParseWordsAgilityPack(HtmlNode doc, User adminUser)
        {
            var words = new List <(Word, string)>();

            // https://www.gratiskryssord.no/kryssordbok/alfabetisk/aa/
            var ahrefs = doc.FindNodes(By.XPath("//div[@id='staticPage']//a[starts-with(@href, '/kryssordbok/')]"));

            if (ahrefs == null)
            {
                return words;
            }

            foreach (var ahref in ahrefs)
            {
                // Decode HTML entities BEFORE upper-casing: named entities are case-sensitive, so
                // upper-casing first turns "&amp;" / "&aring;" into "&AMP;" / "&ARING;", which
                // HtmlDecode leaves untouched. Trimming after decoding also strips whitespace that
                // was encoded as an entity.
                var wordText = HttpUtility.HtmlDecode(ahref.InnerText).Trim().ToUpper();

                var    href = ahref.Attributes["href"].Value;
                string url  = $"https://www.gratiskryssord.no{href}";

                var word = new Word
                {
                    Language        = "no",
                    Value           = wordText,
                    NumberOfLetters = wordText.Count(c => c != ' '),
                    NumberOfWords   = ScraperUtils.CountNumberOfWords(wordText),
                    User            = adminUser,
                    CreatedDate     = DateTime.Now,
                    Source          = this.source
                };

                words.Add((word, url));
            }

            return words;
        }
Пример #3
0
        /// <summary>
        /// Opens a new browser tab, runs the kryssord.org synonym search for <paramref name="word"/>
        /// in it and hands the result to <c>ProcessSynonymsUntilEmpty</c>; afterwards the tab is
        /// closed and the driver is switched back to the tab it started on.
        /// </summary>
        /// <param name="word">Word to search synonyms for; the value "0" is skipped.</param>
        /// <param name="driver">Web driver; must be a <c>ChromeDriver</c> (it is cast to one).</param>
        /// <param name="db">Database context passed on to the synonym processing.</param>
        /// <param name="adminUser">User passed on to the synonym processing.</param>
        private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser)
        {
            // there is a bug in the website that makes a query with "0" fail
            if (word.Value == "0")
            {
                return;
            }

            var chromeDriver = (ChromeDriver)driver;

            // remember the tab we started on so we can always return to it
            var originalTabInstance = chromeDriver.CurrentWindowHandle;
            var switchedToNewTab    = false;

            // execute some JavaScript to open a new window
            chromeDriver.ExecuteScript("window.open();");

            try
            {
                // the new tab is taken to be the last entry in the WindowHandles collection
                var newTabInstance = chromeDriver.WindowHandles[driver.WindowHandles.Count - 1];

                chromeDriver.SwitchTo().Window(newTabInstance);
                switchedToNewTab = true;

                // build the search url for the first result page
                var    wordPattern = "";
                var    query       = ScraperUtils.EscapeUrlString(word.Value);
                int    page        = 0;
                string url         = string.Format("{0}?a={1}&b={2}&p={3}", "https://www.kryssord.org/search.php", query, wordPattern, page);

                var(count, documentNode) = GetWordCountByWordPattern(driver, url);
                if (count == 0)
                {
                    // nothing found; the finally block still closes the tab and switches back
                    return;
                }

                Log.Information("Found {0} synonyms when searching for '{1}' on page {2}", count, word.Value, page + 1);
                writer.WriteLine("Found {0} synonyms when searching for '{1}' on page {2}", count, word.Value, page + 1);

                if (count > 108)
                {
                    Log.Error("Warning! synonym search for '{0}' on page {1} has too many words: {2}", word.Value, page + 1, count);
                }

                ProcessSynonymsUntilEmpty(word, driver, db, adminUser, page, documentNode, url);
            }
            finally
            {
                // Only close the current window if we actually moved to the new tab; otherwise
                // window.close() would close the original tab.
                if (switchedToNewTab)
                {
                    chromeDriver.ExecuteScript("window.close();");
                }

                // switch back to the original tab and focus its main document
                chromeDriver.SwitchTo().Window(originalTabInstance);
                chromeDriver.SwitchTo().DefaultContent();
            }
        }
Пример #4
0
        /// <summary>
        /// Reads the rows of the results table beneath <paramref name="node"/>, turns each row into
        /// a related <see cref="Word"/> and removes duplicates.
        /// </summary>
        /// <param name="node">Node the results-table XPath is evaluated against.</param>
        /// <param name="adminUser">User assigned to every created word.</param>
        /// <returns>The distinct words; an empty list when the row query yields null.</returns>
        private IList <Word> ReadRelatedWordsAgilityPack(HtmlNode node, User adminUser)
        {
            var rows      = node.FindNodes(By.XPath("/html/body//div[@class='results']/table/tbody/tr"));
            var collected = new List <Word>();

            if (rows != null)
            {
                foreach (var tr in rows)
                {
                    var cells = tr.FindNodes(By.TagName("td"));

                    // cells 0, 3 and 4 are read as the hint text, the user id and the date
                    var text        = cells[0].InnerText.Trim().ToUpper();
                    var contributor = cells[3].InnerText.Trim();
                    var created     = cells[4].InnerText.Trim();

                    collected.Add(new Word
                    {
                        Language        = "no",
                        Value           = text,
                        NumberOfLetters = ScraperUtils.CountNumberOfLetters(text),
                        NumberOfWords   = ScraperUtils.CountNumberOfWords(text),
                        User            = adminUser,
                        CreatedDate     = ScraperUtils.ParseDateTimeOrNow(created, "yyyy-MM-dd"),
                        Source          = this.source,
                        Comment         = $"User {contributor}"
                    });
                }

                // Distinct() uses Word's default equality, so Word needs to implement IEquatable<Word>
                collected = collected.Distinct().ToList();
            }

            return collected;
        }
Пример #5
0
        /// <summary>
        /// Collects every link under the <c>staticPage</c> div whose href starts with
        /// <c>/kryssordbok/</c> and turns its text into a synonym <see cref="Word"/>, with
        /// duplicates removed.
        /// </summary>
        /// <param name="word">Word the synonyms belong to; not read by this method.</param>
        /// <param name="doc">Node the link XPath is evaluated against.</param>
        /// <param name="adminUser">User assigned to every created word.</param>
        /// <returns>The distinct synonyms; an empty list when the link query yields null.</returns>
        private List <Word> ParseSynonymsAgilityPack(Word word, HtmlNode doc, User adminUser)
        {
            var relatedWords = new List <Word>();

            // https://www.gratiskryssord.no/kryssordbok/
            var ahrefs = doc.FindNodes(By.XPath("//div[@id='staticPage']//a[starts-with(@href, '/kryssordbok/')]"));

            if (ahrefs == null)
            {
                return relatedWords;
            }

            foreach (var ahref in ahrefs)
            {
                // Decode HTML entities BEFORE upper-casing: named entities are case-sensitive, so
                // upper-casing first turns "&amp;" / "&aring;" into "&AMP;" / "&ARING;", which
                // HtmlDecode leaves untouched. Trimming after decoding also strips whitespace that
                // was encoded as an entity.
                var hintText = HttpUtility.HtmlDecode(ahref.InnerText).Trim().ToUpper();

                var hint = new Word
                {
                    Language        = "no",
                    Value           = hintText,
                    NumberOfLetters = hintText.Count(c => c != ' '),
                    NumberOfWords   = ScraperUtils.CountNumberOfWords(hintText),
                    User            = adminUser,
                    CreatedDate     = DateTime.Now,
                    Source          = this.source
                };

                relatedWords.Add(hint);
            }

            // Distinct() uses Word's default equality, so Word needs to implement IEquatable<Word>
            return relatedWords.Distinct().ToList();
        }
Пример #6
0
        /// <summary>
        /// Opens <paramref name="url"/> in a new browser tab, reads the synonyms listed in the
        /// page's word list and stores them for <paramref name="word"/>; afterwards the tab is
        /// closed and the driver is switched back to the tab it started on.
        /// </summary>
        /// <param name="word">Word the synonyms belong to; the value "0" is skipped.</param>
        /// <param name="driver">Web driver; must be a <c>ChromeDriver</c> (it is cast to one).</param>
        /// <param name="db">Database context the synonyms are written to.</param>
        /// <param name="adminUser">User assigned to every created synonym.</param>
        /// <param name="url">Address of the synonym page to open.</param>
        private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, string url)
        {
            // there is a bug in the website that makes a query with "0" fail
            if (word.Value == "0")
            {
                return;
            }

            var chromeDriver = (ChromeDriver)driver;

            // remember the tab we started on so we can always return to it
            var originalTabInstance = chromeDriver.CurrentWindowHandle;
            var switchedToNewTab    = false;

            // execute some JavaScript to open a new window
            chromeDriver.ExecuteScript("window.open();");

            try
            {
                // the new tab is taken to be the last entry in the WindowHandles collection
                var newTabInstance = chromeDriver.WindowHandles[driver.WindowHandles.Count - 1];

                chromeDriver.SwitchTo().Window(newTabInstance);
                switchedToNewTab = true;

                driver.Navigate().GoToUrl(url);

                Log.Information("Processing synonym search for '{0}'", word.Value);
                writer.WriteLine("Processing synonym search for '{0}'", word.Value);

                // parse all synonyms
                IList <IWebElement> listElements = driver.FindElements(By.XPath("//div[@id='wordlist']/ul[@class='word']/li"));

                var relatedWords = new List <Word>();

                foreach (IWebElement listElement in listElements)
                {
                    IWebElement ahref;

                    try
                    {
                        ahref = listElement.FindElement(By.TagName("a"));
                    }
                    catch (NoSuchElementException)
                    {
                        // the first list item without a link ends the parsing
                        break;
                    }

                    var hintText = ahref.Text;

                    var hint = new Word
                    {
                        Language        = "no",
                        Value           = hintText,
                        NumberOfLetters = hintText.Count(c => c != ' '),
                        NumberOfWords   = ScraperUtils.CountNumberOfWords(hintText),
                        User            = adminUser,
                        CreatedDate     = DateTime.Now,
                        Source          = this.source
                    };

                    relatedWords.Add(hint);
                }

                // Distinct() uses Word's default equality, so Word needs to implement IEquatable<Word>
                relatedWords = relatedWords.Distinct().ToList();

                // and add to database
                WordDatabaseService.AddToDatabase(db, this.source, word, relatedWords, writer);
            }
            finally
            {
                // Only close the current window if we actually moved to the new tab; otherwise
                // window.close() would close the original tab.
                if (switchedToNewTab)
                {
                    chromeDriver.ExecuteScript("window.close();");
                }

                // switch back to the original tab and focus its main document
                chromeDriver.SwitchTo().Window(originalTabInstance);
                chromeDriver.SwitchTo().DefaultContent();
            }
        }
Пример #7
0
        /// <summary>
        /// Runs a pattern search on kryssordhjelp.no: selects the word length, types the first two
        /// characters of <paramref name="wordPattern"/> into the first two letter fields, submits
        /// the form and fetches the synonyms of every word in the resulting list.
        /// </summary>
        /// <param name="wordPattern">
        /// Search pattern; its length selects the word length and its first two characters are
        /// entered as letters, so it must contain at least two characters.
        /// </param>
        /// <param name="driver">Web driver used to drive the search page.</param>
        /// <param name="db">Database context passed on to the synonym lookup.</param>
        /// <param name="adminUser">User assigned to every created word.</param>
        private void ReadWordsByWordPattern(string wordPattern, IWebDriver driver, WordHintDbContext db, User adminUser)
        {
            const string wordItemXPath = "//div[@id='wordlist']/ul[@class='word']/li";

            // go to search page
            driver.Navigate().GoToUrl("https://kryssordhjelp.no/");

            Log.Information("Processing pattern search for '{0}'", wordPattern);
            writer.WriteLine("Processing pattern search for '{0}'", wordPattern);

            // pick the word length in the drop down list
            new SelectElement(driver.FindElement(By.Name("length"))).SelectByValue(wordPattern.Length.ToString());

            // fill in the first two letter fields
            driver.FindElement(By.Name("letter[1]")).SendKeys(wordPattern[0].ToString());
            driver.FindElement(By.Name("letter[2]")).SendKeys(wordPattern[1].ToString());

            // submit the search
            driver.FindElement(By.Id("submitsearch")).Click();

            // wait until the word list has loaded (20 is the timeout argument, presumably seconds)
            try
            {
                driver.WaitForElementLoad(By.XPath(wordItemXPath), 20);
            }
            catch (System.Exception)
            {
                Log.Error("Timeout searching for '{0}'", wordPattern);
                writer.WriteLine("Timeout searching for '{0}'", wordPattern);
                return;
            }

            // parse all words
            IList <IWebElement> items = driver.FindElements(By.XPath(wordItemXPath));

            foreach (IWebElement item in items)
            {
                IWebElement anchor;

                try
                {
                    anchor = item.FindElement(By.TagName("a"));
                }
                catch (NoSuchElementException)
                {
                    // the first list item without a link ends the parsing
                    break;
                }

                var text   = anchor.Text;
                var target = anchor.GetAttribute("href");

                var found = new Word
                {
                    Language        = "no",
                    Value           = text,
                    NumberOfLetters = text.Count(c => c != ' '),
                    NumberOfWords   = ScraperUtils.CountNumberOfWords(text),
                    User            = adminUser,
                    CreatedDate     = DateTime.Now,
                    Source          = this.source
                };

                GetWordSynonyms(found, driver, db, adminUser, target);
            }
        }