// Shared by every lookup. Creating and disposing an HttpClient per call can exhaust
// sockets under load; HttpClient is intended to be instantiated once and reused.
private static readonly HttpClient s_openRussianClient = new HttpClient();

/// <summary>
/// Looks a word up on openrussian.org and builds one <see cref="WordDefinition"/> for every
/// <c>div.version</c> element found on the word's page.
/// </summary>
/// <param name="word">
/// The word to look up. It is trimmed, lower-cased and stripped of stress marks before it is queried.
/// </param>
/// <param name="getSamples">Forwarded unchanged to <c>CreateWordDefinition</c>.</param>
/// <returns>
/// The definitions that could be built. The list is empty when the word is blank or nothing was
/// found, and holds whatever was collected so far when a failure occurs (failures are logged,
/// not rethrown).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
public async Task<IEnumerable<WordDefinition>> GetWords(string word, bool getSamples = true)
{
    if (word == null)
    {
        throw new ArgumentNullException(nameof(word));
    }

    // Message templates (instead of interpolated strings) keep words containing '{' or '}'
    // from being parsed as format placeholders by the logger.
    _logger.LogInformation("Getting word information for word: {Word}", word);
    word = word.Trim().ToLowerInvariant().RemoveStressMarks();
    var words = new List<WordDefinition>();

    if (string.IsNullOrWhiteSpace(word))
    {
        _logger.LogWarning("No word was supplied; skipping the openrussian.org lookup");
        return words;
    }

    try
    {
        _logger.LogInformation("querying suggestings from openrussian.org for word: {Word}", word);
        string json;
        // Escape the query value so characters such as '&', '#' or '+' cannot alter the request.
        using (var response = await s_openRussianClient.GetAsync($"https://en.openrussian.org/suggestions?q={Uri.EscapeDataString(word)}"))
        {
            response.EnsureSuccessStatusCode();
            json = await response.Content.ReadAsStringAsync();
        }

        var term = JsonConvert.DeserializeObject<ORTerm>(json);
        if (term == null)
        {
            _logger.LogWarning("openrussian.org returned no suggestion data for word: {Word}", word);
            return words;
        }

        var info = new ORWordInfo();

        // Exact match on the first suggestion: take its spelling and accented form.
        // Translations are not read here; CreateWordDefinition reads them from the word page.
        if (term.Words != null && term.Words.Length > 0 && word.IsSameWord(term.Words[0].Ru))
        {
            var orWord = term.Words[0];
            info.Word = orWord.Ru.Trim();
            info.StressedWord = string.IsNullOrEmpty(orWord.RuAccented)
                ? orWord.Ru.Trim()
                : WebUtility.HtmlDecode(orWord.RuAccented).Trim();
        }

        // When a derivate is returned, the queried word is kept as the displayed word and the
        // page of the derivate's base word is the one that gets fetched.
        if (term.Derivates != null && term.Derivates.Length > 0)
        {
            info.Word = word.Trim();
            info.StressedWord = word.Trim();
            info.Derivate = term.Derivates[0].BaseBare;
        }

        // Nothing matched: fall back to the normalised input so the definition is not built
        // from an unset word.
        if (string.IsNullOrWhiteSpace(info.Word))
        {
            info.Word = word;
            info.StressedWord = word;
        }

        // Without a derivate the page is looked up by the word itself rather than by an empty path segment.
        var pageWord = string.IsNullOrWhiteSpace(info.Derivate) ? info.Word : info.Derivate;

        _logger.LogInformation("querying openrussian.org for information about word: {Word}", word);
        var contentSegment = _serviceProvider.GetRequiredService<ContentSegment>();
        contentSegment.Url = $"https://en.openrussian.org/ru/{pageWord}";
        contentSegment.Select = "div.page";
        var composition = new Composition { Return = contentSegment };
        var doc = await composition.Return.DocumentElement();

        // Prefer the spelling shown on the page (heading or form table) that matches the queried word.
        foreach (var header in doc.QuerySelectorAll("td > span.editable, h1"))
        {
            var wordVariant = header.TextContent.Trim();
            if (word.IsSameWord(wordVariant))
            {
                info.StressedWord = wordVariant;
                break;
            }
        }

        foreach (var wordVersion in doc.QuerySelectorAll("div.version"))
        {
            words.Add(await CreateWordDefinition(wordVersion, info, getSamples));
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failed lookup is logged and whatever was collected so far is returned.
        _logger.LogError(ex, "Error thrown from word provider when attempting to query information from openrussian.org for word: {Word}. Stack trace: {StackTrace}", word, ex.StackTrace);
    }

    return words;
}
/// <summary>
/// Builds a <see cref="WordDefinition"/> from one scraped element: translations, rank, word
/// type and tags, then every inflected form found in the declension, short-form, imperative,
/// past and present/future sections, and finally audio, samples and related words.
/// </summary>
/// <param name="doc">The element to scrape (the caller passes a <c>div.version</c> element).</param>
/// <param name="info">Supplies the word and its stressed spelling for the definition.</param>
/// <param name="getSamples">When true, <c>ScrapeAudio</c> and <c>GetSamples</c> are also run.</param>
/// <returns>The populated definition.</returns>
/// <exception cref="InvalidOperationException">The element has no <c>div.info</c> section.</exception>
private async Task<WordDefinition> CreateWordDefinition(IElement doc, ORWordInfo info, bool getSamples = true)
{
    var wordDefinition = new WordDefinition()
    {
        Word = new WordForm()
        {
            Word = info.Word.RemoveStressMarks(),
            StressedWord = info.StressedWord
        }
    };

    var translationSpan = doc.QuerySelector("div.translations span.editable");
    if (translationSpan != null)
    {
        wordDefinition.Translations.AddRange(
            translationSpan.TextContent.Replace(", ", ",").Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries));
    }

    var infoDiv = doc.QuerySelector("div.info");
    if (infoDiv == null)
    {
        throw new InvalidOperationException("Word info counld not be found");
    }

    GetRank(infoDiv, wordDefinition);
    ApplyWordDetails(infoDiv.InnerHtml, wordDefinition, info);

    // nouns and adjectives
    AddDeclensionForms(doc, wordDefinition);

    // adjectives
    AddFixedForms(doc, "div.shorts", wordDefinition,
        "short form masculine", "short form feminine", "short form neuter", "short form plural");

    // verbs
    AddFixedForms(doc, "div.imperative", wordDefinition,
        "imperative singular", "imperative plural");
    AddFixedForms(doc, "div.past", wordDefinition,
        "past masculine", "past feminine", "past neuter", "past plural");
    AddPresentFutureForms(doc, wordDefinition);

    if (getSamples)
    {
        wordDefinition = ScrapeAudio(wordDefinition, doc);
        await GetSamples(wordDefinition);
    }

    GetRelatedWords(doc, wordDefinition);
    return wordDefinition;
}

/// <summary>
/// Reads the comma-separated details that precede the first <c>&lt;br&gt;</c> of the info
/// section: the first entry becomes the word type, the remaining entries become tags.
/// </summary>
private void ApplyWordDetails(string infoHtml, WordDefinition wordDefinition, ORWordInfo info)
{
    var content = infoHtml.ToLowerInvariant();
    var breakIndex = content.IndexOf("<br>", StringComparison.Ordinal);
    if (breakIndex < 0)
    {
        // No details line: report it and leave type/tags untouched rather than relying on
        // Substring throwing for a negative length.
        _logger.LogError("Could not find detail information for word: {StressedWord} ", info.StressedWord);
        return;
    }

    var wordDetails = content.Substring(0, breakIndex).Trim().Split(',');
    wordDefinition.WordType = wordDetails[0].Trim();
    for (int i = 1; i < wordDetails.Length; i++)
    {
        wordDefinition.Tags.Add(wordDetails[i].Trim());
    }
}

/// <summary>Adds one form per cell of the <c>div.declension</c> table, described as "column-label row-label".</summary>
private static void AddDeclensionForms(IElement doc, WordDefinition wordDefinition)
{
    var declensionDiv = doc.QuerySelector("div.declension");
    if (declensionDiv == null)
    {
        return;
    }

    var formRows = declensionDiv.QuerySelectorAll("tr");
    int headerRowIndex = 0;
    for (int i = 0; i < formRows.Length; i++)
    {
        var row = formRows[i];
        if (row.QuerySelectorAll("th").Any() && row.ParentElement?.LocalName == "thead")
        {
            headerRowIndex = i;
            continue;
        }

        // Skip column spans above the header row. This can happen for adjectives.
        var firstDataCell = row.QuerySelector("td");
        if (firstDataCell != null && firstDataCell.HasAttribute("colspan"))
        {
            continue;
        }

        var cells = row.Children;
        if (cells.Length < 2)
        {
            continue;
        }

        var rowLabel = GetLongLabelOrText(cells[0]);
        var headerCells = formRows[headerRowIndex].Children;
        for (int j = 1; j < cells.Length; j++)
        {
            var cell = cells[j];
            if (cell.TextContent.Trim() == "-")
            {
                continue;
            }

            // A cell without an editable span carries no form; skip it instead of failing the whole word.
            var editable = cell.QuerySelector("span.editable");
            if (editable == null)
            {
                continue;
            }

            // If the header row is shorter than this row, describe the form by its row label alone.
            var formDescription = j < headerCells.Length
                ? GetLongLabelOrText(headerCells[j]) + " " + rowLabel
                : rowLabel;

            // Two variants of the same form are separated by <br> inside the span.
            var variants = editable.InnerHtml.Split(new[] { "<br>" }, StringSplitOptions.None);
            if (variants.Length > 1)
            {
                wordDefinition.WordForms.Add(BuildWordForm(variants[0].Trim(), formDescription));
                wordDefinition.WordForms.Add(BuildWordForm(variants[1].Trim(), formDescription + " (second form)"));
            }
            else
            {
                wordDefinition.WordForms.Add(BuildWordForm(editable.TextContent.Trim(), formDescription));
            }
        }
    }
}

/// <summary>
/// Adds the editable spans of the first element matching <paramref name="containerSelector"/> as forms,
/// but only when their count equals the number of supplied descriptions.
/// </summary>
private static void AddFixedForms(IElement doc, string containerSelector, WordDefinition wordDefinition, params string[] formDescriptions)
{
    var container = doc.QuerySelector(containerSelector);
    if (container == null)
    {
        return;
    }

    var formValues = container.QuerySelectorAll("span.editable");
    if (formValues.Length != formDescriptions.Length)
    {
        return;
    }

    for (int i = 0; i < formDescriptions.Length; i++)
    {
        wordDefinition.WordForms.Add(BuildWordForm(formValues[i].InnerHtml.Trim(), formDescriptions[i]));
    }
}

/// <summary>Adds one form per cell of the <c>div.presfut</c> table; the first row is treated as the header.</summary>
private void AddPresentFutureForms(IElement doc, WordDefinition wordDefinition)
{
    var presentFutureDiv = doc.QuerySelector("div.presfut");
    if (presentFutureDiv == null)
    {
        return;
    }

    var formRows = presentFutureDiv.QuerySelectorAll("tr");
    if (formRows.Length == 0)
    {
        return;
    }

    var headerCells = formRows[0].Children;
    for (int i = 1; i < formRows.Length; i++)
    {
        var cells = formRows[i].Children;
        for (int j = 1; j < cells.Length; j++)
        {
            // Trimmed before the placeholder check, matching the declension table handling.
            var stressedWord = cells[j].TextContent.Trim();
            if (stressedWord == "-")
            {
                continue;
            }

            var conjugation = MapConjegationType(cells[0].TextContent);
            var form = j < headerCells.Length
                ? headerCells[j].TextContent.Trim().ToLowerInvariant() + " " + conjugation
                : conjugation;
            wordDefinition.WordForms.Add(BuildWordForm(stressedWord, form));
        }
    }
}

/// <summary>Returns the lower-cased text of the cell's <c>span.long</c>, or of the cell itself when it has none.</summary>
private static string GetLongLabelOrText(IElement cell)
{
    var longLabel = cell.QuerySelector("span.long");
    return (longLabel ?? cell).TextContent.Trim().ToLowerInvariant();
}

/// <summary>Creates a form whose plain spelling is the stressed spelling with stress marks removed.</summary>
private static WordForm BuildWordForm(string stressedWord, string formDescription)
{
    return new WordForm
    {
        Word = stressedWord.RemoveStressMarks(),
        StressedWord = stressedWord,
        FormDescription = formDescription
    };
}