/// <summary>
/// Builds one <see cref="Word"/> per row of the results table found under
/// <paramref name="node"/>.
/// </summary>
/// <param name="node">Node that is queried for the results table rows.</param>
/// <param name="adminUser">User assigned to every created word.</param>
/// <returns>
/// The created words in row order; an empty list when the row query yields null.
/// </returns>
private IList<Word> ReadWordsAgilityPack(HtmlNode node, User adminUser)
{
    var wordListing = new List<Word>();

    var tableRows = node.FindNodes(By.XPath("/html/body//div[@class='results']/table/tbody/tr"));
    if (tableRows == null)
    {
        // ReadRelatedWordsAgilityPack guards this very same query against null;
        // without the guard the foreach below would throw on a null result.
        return wordListing;
    }

    foreach (var row in tableRows)
    {
        // Cell 0 is read as the word text, cell 3 as the user id and cell 4 as the date.
        var rowTD = row.FindNodes(By.TagName("td"));
        var wordText = rowTD[0].InnerText.Trim().ToUpper();
        var userId = rowTD[3].InnerText.Trim();
        var date = rowTD[4].InnerText.Trim();

        var word = new Word
        {
            Language = "no",
            Value = wordText,
            NumberOfLetters = ScraperUtils.CountNumberOfLetters(wordText),
            NumberOfWords = ScraperUtils.CountNumberOfWords(wordText),
            User = adminUser,
            CreatedDate = ScraperUtils.ParseDateTimeOrNow(date, "yyyy-MM-dd"),
            Source = this.source,
            Comment = "User " + userId
        };

        wordListing.Add(word);
    }

    return wordListing;
}
/// <summary>
/// Collects every dictionary link inside the "staticPage" div of <paramref name="doc"/>
/// and pairs the word built from the link text with the absolute URL of the link.
/// </summary>
/// <param name="doc">Node that is queried for the links.</param>
/// <param name="adminUser">User assigned to every created word.</param>
/// <returns>
/// One (word, url) tuple per link in document order; an empty list when the link
/// query yields null.
/// </returns>
private List<(Word, string)> ParseWordsAgilityPack(HtmlNode doc, User adminUser)
{
    var words = new List<(Word, string)>();

    // Example page: https://www.gratiskryssord.no/kryssordbok/alfabetisk/aa/
    var ahrefs = doc.FindNodes(By.XPath("//div[@id='staticPage']//a[starts-with(@href, '/kryssordbok/')]"));
    if (ahrefs == null)
    {
        return words;
    }

    foreach (var ahref in ahrefs)
    {
        // Decode HTML entities BEFORE upper-casing. Upper-casing first turns a named
        // entity such as "&aring;" into "&ARING;", which the decoder no longer
        // recognises, so the raw entity text would end up in the stored word.
        var wordText = HttpUtility.HtmlDecode(ahref.InnerText).Trim().ToUpper();

        // The XPath above only matches anchors that carry an href attribute.
        var href = ahref.Attributes["href"].Value;
        string url = $"https://www.gratiskryssord.no{href}";

        var word = new Word
        {
            Language = "no",
            Value = wordText,
            NumberOfLetters = wordText.Count(c => c != ' '), // every character except plain spaces
            NumberOfWords = ScraperUtils.CountNumberOfWords(wordText),
            User = adminUser,
            CreatedDate = DateTime.Now,
            Source = this.source
        };

        words.Add((word, url));
    }

    return words;
}
/// <summary>
/// Opens a new browser tab, runs a synonym search for <paramref name="word"/> on
/// kryssord.org, hands the first result page to ProcessSynonymsUntilEmpty and then
/// closes the tab and returns focus to the original tab.
/// </summary>
/// <param name="word">Word whose value is used as the search query.</param>
/// <param name="driver">Web driver; must be a <see cref="ChromeDriver"/> (it is cast).</param>
/// <param name="db">Database context passed through to the synonym processing.</param>
/// <param name="adminUser">User passed through to the synonym processing.</param>
private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser)
{
    // there is a bug in the website that makes a query with "0" fail
    if (word.Value == "0")
    {
        return;
    }

    var chromeDriver = (ChromeDriver)driver;

    // remember the tab we came from so we can return to it afterwards
    var originalTabInstance = chromeDriver.CurrentWindowHandle;

    // open a new tab; its handle is the last entry in the WindowHandles collection
    chromeDriver.ExecuteScript("window.open();");
    var newTabInstance = chromeDriver.WindowHandles[driver.WindowHandles.Count - 1];
    chromeDriver.SwitchTo().Window(newTabInstance);

    try
    {
        var wordPattern = "";
        var query = ScraperUtils.EscapeUrlString(word.Value);
        int page = 0;
        string url = string.Format("{0}?a={1}&b={2}&p={3}", "https://www.kryssord.org/search.php", query, wordPattern, page);

        var (count, documentNode) = GetWordCountByWordPattern(driver, url);
        if (count == 0)
        {
            // Nothing found. The finally block still closes the tab; previously this
            // early return left the new tab open and the driver focused on it.
            return;
        }

        Log.Information("Found {0} synonyms when searching for '{1}' on page {2}", count, word.Value, page + 1);
        writer.WriteLine("Found {0} synonyms when searching for '{1}' on page {2}", count, word.Value, page + 1);

        // NOTE(review): 108 looks like the expected maximum number of hits per page - confirm.
        if (count > 108)
        {
            Log.Error("Warning! synonym search for '{0}' on page {1} has too many words: {2}", word.Value, page + 1, count);
        }

        ProcessSynonymsUntilEmpty(word, driver, db, adminUser, page, documentNode, url);
    }
    finally
    {
        // Always close the tab we opened and restore the original context, also
        // when the search is empty or processing throws.
        chromeDriver.ExecuteScript("window.close();");

        // switch back to the original tab and focus its main document
        chromeDriver.SwitchTo().Window(originalTabInstance);
        chromeDriver.SwitchTo().DefaultContent();
    }
}
/// <summary>
/// Turns every row of the results table under <paramref name="node"/> into a
/// <see cref="Word"/> and returns the distinct set of those words.
/// </summary>
/// <param name="node">Node that is queried for the results table rows.</param>
/// <param name="adminUser">User assigned to every created word.</param>
/// <returns>
/// The de-duplicated words; an empty list when the row query yields null.
/// </returns>
private IList<Word> ReadRelatedWordsAgilityPack(HtmlNode node, User adminUser)
{
    var rows = node.FindNodes(By.XPath("/html/body//div[@class='results']/table/tbody/tr"));
    if (rows == null)
    {
        return new List<Word>();
    }

    var collected = new List<Word>();
    foreach (var tr in rows)
    {
        // Cell 0 is read as the hint text, cell 3 as the user id and cell 4 as the date.
        var cells = tr.FindNodes(By.TagName("td"));
        var text = cells[0].InnerText.Trim().ToUpper();
        var contributor = cells[3].InnerText.Trim();
        var created = cells[4].InnerText.Trim();

        collected.Add(new Word
        {
            Language = "no",
            Value = text,
            NumberOfLetters = ScraperUtils.CountNumberOfLetters(text),
            NumberOfWords = ScraperUtils.CountNumberOfWords(text),
            User = adminUser,
            CreatedDate = ScraperUtils.ParseDateTimeOrNow(created, "yyyy-MM-dd"),
            Source = this.source,
            Comment = "User " + contributor
        });
    }

    // Distinct() relies on Word's equality (the original author notes that Word
    // has to implement IEquatable<Word> for this to work).
    return collected.Distinct().ToList();
}
/// <summary>
/// Builds a <see cref="Word"/> from the text of every dictionary link inside the
/// "staticPage" div of <paramref name="doc"/> and returns the distinct set.
/// </summary>
/// <param name="word">Not read by this method; kept so existing callers keep compiling.</param>
/// <param name="doc">Node that is queried for the links.</param>
/// <param name="adminUser">User assigned to every created word.</param>
/// <returns>
/// The de-duplicated words; an empty list when the link query yields null.
/// </returns>
private List<Word> ParseSynonymsAgilityPack(Word word, HtmlNode doc, User adminUser)
{
    var relatedWords = new List<Word>();

    // Example site: https://www.gratiskryssord.no/kryssordbok/
    var ahrefs = doc.FindNodes(By.XPath("//div[@id='staticPage']//a[starts-with(@href, '/kryssordbok/')]"));
    if (ahrefs == null)
    {
        return relatedWords;
    }

    foreach (var ahref in ahrefs)
    {
        // Decode HTML entities BEFORE upper-casing. Upper-casing first turns a named
        // entity such as "&aring;" into "&ARING;", which the decoder no longer
        // recognises, so the raw entity text would end up in the stored word.
        var hintText = HttpUtility.HtmlDecode(ahref.InnerText).Trim().ToUpper();

        var hint = new Word
        {
            Language = "no",
            Value = hintText,
            NumberOfLetters = hintText.Count(c => c != ' '), // every character except plain spaces
            NumberOfWords = ScraperUtils.CountNumberOfWords(hintText),
            User = adminUser,
            CreatedDate = DateTime.Now,
            Source = this.source
        };

        relatedWords.Add(hint);
    }

    // Note that this requires the object to implement IEquatable<Word>
    return relatedWords.Distinct().ToList();
}
/// <summary>
/// Opens <paramref name="url"/> in a new browser tab, reads the linked entries of the
/// word list on that page as synonyms of <paramref name="word"/>, stores them through
/// WordDatabaseService.AddToDatabase and then closes the tab and returns focus to
/// the original tab.
/// </summary>
/// <param name="word">Word the synonyms belong to.</param>
/// <param name="driver">Web driver; must be a <see cref="ChromeDriver"/> (it is cast).</param>
/// <param name="db">Database context the synonyms are written to.</param>
/// <param name="adminUser">User assigned to every created synonym.</param>
/// <param name="url">Address of the synonym page to load.</param>
private void GetWordSynonyms(Word word, IWebDriver driver, WordHintDbContext db, User adminUser, string url)
{
    // there is a bug in the website that makes a query with "0" fail
    if (word.Value == "0")
    {
        return;
    }

    var chromeDriver = (ChromeDriver)driver;

    // remember the tab we came from so we can return to it afterwards
    var originalTabInstance = chromeDriver.CurrentWindowHandle;

    // open a new tab; its handle is the last entry in the WindowHandles collection
    chromeDriver.ExecuteScript("window.open();");
    var newTabInstance = chromeDriver.WindowHandles[driver.WindowHandles.Count - 1];
    chromeDriver.SwitchTo().Window(newTabInstance);

    try
    {
        driver.Navigate().GoToUrl(url);

        Log.Information("Processing synonym search for '{0}'", word.Value);
        writer.WriteLine("Processing synonym search for '{0}'", word.Value);

        // parse all synonyms
        IList<IWebElement> listElements = driver.FindElements(By.XPath("//div[@id='wordlist']/ul[@class='word']/li"));
        var relatedWords = new List<Word>();

        foreach (IWebElement listElement in listElements)
        {
            IWebElement ahref;
            try
            {
                ahref = listElement.FindElement(By.TagName("a"));
            }
            catch (NoSuchElementException)
            {
                // Existing behaviour: the first list item without a link ends the scan.
                break;
            }

            var hintText = ahref.Text;

            var hint = new Word
            {
                Language = "no",
                Value = hintText,
                NumberOfLetters = hintText.Count(c => c != ' '), // every character except plain spaces
                NumberOfWords = ScraperUtils.CountNumberOfWords(hintText),
                User = adminUser,
                CreatedDate = DateTime.Now,
                Source = this.source
            };

            relatedWords.Add(hint);
        }

        // Note that this requires the object to implement IEquatable<Word>
        relatedWords = relatedWords.Distinct().ToList();

        // and add to database
        WordDatabaseService.AddToDatabase(db, this.source, word, relatedWords, writer);
    }
    finally
    {
        // Always close the tab we opened and restore the original context, also
        // when navigation, parsing or the database write throws.
        chromeDriver.ExecuteScript("window.close();");

        // switch back to the original tab and focus its main document
        chromeDriver.SwitchTo().Window(originalTabInstance);
        chromeDriver.SwitchTo().DefaultContent();
    }
}
/// <summary>
/// Runs a pattern search on kryssordhjelp.no: selects the word length, types the
/// first two characters of <paramref name="wordPattern"/> into the first two letter
/// fields, submits the form and fetches the synonyms of every linked word in the
/// result list.
/// </summary>
/// <param name="wordPattern">
/// Search pattern. Its length is chosen in the length drop-down and its characters at
/// index 0 and 1 are typed in, so it needs at least two characters.
/// </param>
/// <param name="driver">Web driver used to operate the page.</param>
/// <param name="db">Database context passed on to the synonym lookup.</param>
/// <param name="adminUser">User assigned to every created word.</param>
private void ReadWordsByWordPattern(string wordPattern, IWebDriver driver, WordHintDbContext db, User adminUser)
{
    const string wordItemXPath = "//div[@id='wordlist']/ul[@class='word']/li";

    // open the search page
    driver.Navigate().GoToUrl("https://kryssordhjelp.no/");

    Log.Information("Processing pattern search for '{0}'", wordPattern);
    writer.WriteLine("Processing pattern search for '{0}'", wordPattern);

    // choose the word length in the drop-down list
    new SelectElement(driver.FindElement(By.Name("length"))).SelectByValue(wordPattern.Length.ToString());

    // fill in the first two letter fields
    driver.FindElement(By.Name("letter[1]")).SendKeys(wordPattern[0].ToString());
    driver.FindElement(By.Name("letter[2]")).SendKeys(wordPattern[1].ToString());

    // submit the search form
    driver.FindElement(By.Id("submitsearch")).Click();

    // wait for the result list; on any failure log it and give up on this pattern
    try
    {
        driver.WaitForElementLoad(By.XPath(wordItemXPath), 20);
    }
    catch (System.Exception)
    {
        Log.Error("Timeout searching for '{0}'", wordPattern);
        writer.WriteLine("Timeout searching for '{0}'", wordPattern);
        return;
    }

    // walk the result items and look up the synonyms of each linked word
    foreach (IWebElement item in driver.FindElements(By.XPath(wordItemXPath)))
    {
        IWebElement link;
        try
        {
            link = item.FindElement(By.TagName("a"));
        }
        catch (NoSuchElementException)
        {
            // the first item without a link ends the scan
            break;
        }

        var text = link.Text;
        var target = link.GetAttribute("href");

        var entry = new Word
        {
            Language = "no",
            Value = text,
            NumberOfLetters = text.Count(c => c != ' '),
            NumberOfWords = ScraperUtils.CountNumberOfWords(text),
            User = adminUser,
            CreatedDate = DateTime.Now,
            Source = this.source
        };

        GetWordSynonyms(entry, driver, db, adminUser, target);
    }
}