Ejemplo n.º 1
0
        /// <summary>
        /// Looks up <paramref name="word"/> on openrussian.org and builds one <see cref="WordDefinition"/>
        /// for every "div.version" element found on the word's page.
        /// </summary>
        /// <param name="word">
        /// Word to look up. It is trimmed, lower-cased and stripped of stress marks before being sent to the site.
        /// </param>
        /// <param name="getSamples">Forwarded unchanged to <c>CreateWordDefinition</c>.</param>
        /// <returns>
        /// The definitions that could be built. The list is empty when nothing was found or when any step failed;
        /// failures are logged and never rethrown.
        /// </returns>
        public async Task <IEnumerable <WordDefinition> > GetWords(string word, bool getSamples = true)
        {
            _logger.LogInformation($"Getting word information for word: {word}");
            // Invariant lower-casing keeps the normalisation independent of the current thread culture.
            word = word.Trim().ToLowerInvariant().RemoveStressMarks();

            var words = new List <WordDefinition>();

            // NOTE(review): a new HttpClient per call can exhaust sockets under load; consider a shared
            // instance or IHttpClientFactory at class level (not changeable from inside this method).
            using (var client = new HttpClient())
            {
                try
                {
                    _logger.LogInformation($"querying suggestings from openrussian.org for word: {word}");

                    string json;
                    // The word is user input: escape it so spaces, '&', '#', ... cannot corrupt the query string.
                    using (var response = await client.GetAsync($"https://en.openrussian.org/suggestions?q={Uri.EscapeDataString(word)}"))
                    {
                        response.EnsureSuccessStatusCode();
                        json = await response.Content.ReadAsStringAsync();
                    }

                    var term = JsonConvert.DeserializeObject <ORTerm>(json);
                    var info = new ORWordInfo();

                    // Exact match: take the spelling and the stressed spelling from the first suggestion.
                    var suggestedWords = term?.Words;
                    if (suggestedWords != null && suggestedWords.Length > 0 && word.IsSameWord(suggestedWords[0].Ru))
                    {
                        var orWord = suggestedWords[0];
                        info.Word         = orWord.Ru.Trim();
                        info.StressedWord = string.IsNullOrEmpty(orWord.RuAccented)
                            ? orWord.Ru.Trim()
                            : WebUtility.HtmlDecode(orWord.RuAccented).Trim();
                    }

                    // Derived form: the queried word is kept as-is and the page of its base form is looked up.
                    var derivates = term?.Derivates;
                    if (derivates != null && derivates.Length > 0)
                    {
                        info.Word         = word.Trim();
                        info.StressedWord = word.Trim();
                        info.Derivate     = derivates[0].BaseBare;
                    }

                    // Without a derivate the page key would be empty and the request would hit ".../ru/";
                    // fall back to the matched word (or the normalised query) instead.
                    var pageKey = info.Derivate;
                    if (string.IsNullOrEmpty(pageKey))
                    {
                        pageKey = string.IsNullOrEmpty(info.Word) ? word : info.Word;
                    }

                    _logger.LogInformation($"querying openrussian.org for information about word: {word}");
                    var contentSegment = _serviceProvider.GetRequiredService <ContentSegment>();
                    // NOTE(review): the page key is not escaped here because ContentSegment may encode the URL
                    // itself - confirm before adding Uri.EscapeDataString.
                    contentSegment.Url    = $"https://en.openrussian.org/ru/{pageKey}";
                    contentSegment.Select = "div.page";
                    var composition = new Composition {
                        Return = contentSegment
                    };
                    var doc = await composition.Return.DocumentElement();

                    // Prefer the stressed spelling shown on the page when one of its headers matches the query.
                    foreach (var header in doc.QuerySelectorAll("td > span.editable, h1"))
                    {
                        var wordVariant = header.TextContent.Trim();
                        if (word.IsSameWord(wordVariant))
                        {
                            info.StressedWord = wordVariant;
                            break;
                        }
                    }

                    // Each "div.version" is turned into one definition.
                    foreach (var wordVersion in doc.QuerySelectorAll("div.version"))
                    {
                        words.Add(await CreateWordDefinition(wordVersion, info, getSamples));
                    }
                }
                catch (Exception ex) {
                    // Best effort by design: log and return whatever was collected so far.
                    _logger.LogError(ex, $"Error thrown from word provider when attempting to query information from openrussian.org for word: {word}.  Stack trace: {ex.StackTrace}");
                }
                return(words);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a <see cref="WordDefinition"/> from one word-version element of an openrussian.org page:
        /// translations, rank, word type and tags, declension / short / imperative / past / present-future
        /// forms, optionally audio and samples, and finally related words.
        /// </summary>
        /// <param name="doc">Element that contains the markup of a single word version.</param>
        /// <param name="info">Spelling and stressed spelling resolved by the caller for this word.</param>
        /// <param name="getSamples">When true, <c>ScrapeAudio</c> and <c>GetSamples</c> are run as well.</param>
        /// <returns>The populated definition.</returns>
        /// <exception cref="InvalidOperationException">The element has no "div.info" child.</exception>
        private async Task <WordDefinition> CreateWordDefinition(IElement doc, ORWordInfo info, bool getSamples = true)
        {
            var wordDefinition = new WordDefinition()
            {
                Word = new WordForm()
                {
                    Word         = info.Word.RemoveStressMarks(),
                    StressedWord = info.StressedWord
                }
            };

            // Only the first translation span is used; its text is a comma separated list.
            var translationSpan = doc.QuerySelector("div.translations span.editable");
            if (translationSpan != null)
            {
                wordDefinition.Translations.AddRange(
                    translationSpan.TextContent.Replace(", ", ",").Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries));
            }

            var infoDiv = doc.QuerySelector("div.info");
            if (infoDiv == null)
            {
                throw new InvalidOperationException("Word info counld not be found");
            }

            GetRank(infoDiv, wordDefinition);
            ReadWordTypeAndTags(infoDiv, info, wordDefinition);

            // The order below fixes the order of wordDefinition.WordForms.
            AppendDeclensionForms(doc, wordDefinition);
            AppendFixedFormSet(doc, wordDefinition, "div.shorts",
                               new[] { "short form masculine", "short form feminine", "short form neuter", "short form plural" });
            AppendFixedFormSet(doc, wordDefinition, "div.imperative",
                               new[] { "imperative singular", "imperative plural" });
            AppendFixedFormSet(doc, wordDefinition, "div.past",
                               new[] { "past masculine", "past feminine", "past neuter", "past plural" });
            AppendPresentFutureForms(doc, wordDefinition);

            if (getSamples)
            {
                wordDefinition = ScrapeAudio(wordDefinition, doc);

                await GetSamples(wordDefinition);
            }

            GetRelatedWords(doc, wordDefinition);

            return(wordDefinition);
        }

        /// <summary>
        /// Reads the comma separated text that precedes the first "&lt;br&gt;" of the info element: the first
        /// entry becomes the word type, the remaining entries become tags. Failures are logged, not thrown.
        /// </summary>
        private void ReadWordTypeAndTags(IElement infoDiv, ORWordInfo info, WordDefinition wordDefinition)
        {
            var infoDivContent = infoDiv.InnerHtml.ToLowerInvariant();

            try
            {
                var lineBreakIndex = infoDivContent.IndexOf("<br>", StringComparison.Ordinal);
                if (lineBreakIndex < 0)
                {
                    // No details line present: report it explicitly instead of relying on Substring throwing.
                    _logger.LogError($"Could not find detail information for word: {info.StressedWord} ");
                    return;
                }

                var wordDetails = infoDivContent.Substring(0, lineBreakIndex).Trim().Split(',');

                wordDefinition.WordType = wordDetails[0].Trim();

                for (int i = 1; i < wordDetails.Length; i++)
                {
                    wordDefinition.Tags.Add(wordDetails[i].Trim());
                }
            }
            catch (Exception ex)
            {
                // Keep the best-effort behaviour, but do not lose the exception details.
                _logger.LogError(ex, $"Could not find detail information for word: {info.StressedWord} ");
            }
        }

        /// <summary>
        /// Adds the forms of the declension table (nouns and adjectives). Each form is described by the
        /// column header text followed by the row label text.
        /// </summary>
        private void AppendDeclensionForms(IElement doc, WordDefinition wordDefinition)
        {
            var declensionDiv = doc.QuerySelector("div.declension");
            if (declensionDiv == null)
            {
                return;
            }

            var formRows       = declensionDiv.QuerySelectorAll("tr");
            int headerRowIndex = 0;

            for (int i = 0; i < formRows.Length; i++)
            {
                var row = formRows[i];

                if (row.QuerySelector("th") != null && row.ParentElement?.LocalName == "thead")
                {
                    headerRowIndex = i;
                    continue;
                }

                // skip column spans above the header row.  This can happen for adjectives
                var firstDataCell = row.QuerySelector("td");
                if (firstDataCell != null && firstDataCell.HasAttribute("colspan"))
                {
                    continue;
                }

                var cells = row.Children;
                if (cells.Length < 2)
                {
                    // Nothing but (at most) a label cell: no forms to read.
                    continue;
                }

                var headerCells = formRows[headerRowIndex].Children;
                var rowLabel    = DescribeCell(cells[0]);

                for (int j = 1; j < cells.Length; j++)
                {
                    var cell = cells[j];

                    // "-" marks a form that does not exist.
                    if (cell.TextContent.Trim() == "-")
                    {
                        continue;
                    }

                    var editable = cell.QuerySelector("span.editable");
                    if (editable == null)
                    {
                        // Cell without a form value: skip it rather than failing the whole definition.
                        continue;
                    }

                    var headerCell      = j < headerCells.Length ? headerCells[j] : null;
                    var formDescription = DescribeCell(headerCell) + " " + rowLabel;

                    var html = editable.InnerHtml;
                    if (html.Contains("<br>"))
                    {
                        // Two alternative spellings separated by a line break.
                        var variants = html.Split(new[] { "<br>" }, StringSplitOptions.None);
                        AppendWordForm(wordDefinition, variants[0].Trim(), formDescription);
                        AppendWordForm(wordDefinition, variants[1].Trim(), formDescription + " (second form)");
                    }
                    else
                    {
                        AppendWordForm(wordDefinition, editable.TextContent.Trim(), formDescription);
                    }
                }
            }
        }

        /// <summary>
        /// Adds the forms of a container whose "span.editable" children map one-to-one, in order, onto
        /// <paramref name="formDescriptions"/>. Nothing is added when the container is missing or the
        /// number of spans differs from the number of descriptions.
        /// </summary>
        private void AppendFixedFormSet(IElement doc, WordDefinition wordDefinition, string containerSelector, string[] formDescriptions)
        {
            var container = doc.QuerySelector(containerSelector);
            if (container == null)
            {
                return;
            }

            var formValues = container.QuerySelectorAll("span.editable");
            if (formValues.Length != formDescriptions.Length)
            {
                return;
            }

            for (int i = 0; i < formDescriptions.Length; i++)
            {
                AppendWordForm(wordDefinition, formValues[i].InnerHtml.Trim(), formDescriptions[i]);
            }
        }

        /// <summary>
        /// Adds the forms of the present/future table. The first row is the header; each form is described
        /// by its column header text followed by the mapped text of the row's first cell.
        /// </summary>
        private void AppendPresentFutureForms(IElement doc, WordDefinition wordDefinition)
        {
            var presentFutureDiv = doc.QuerySelector("div.presfut");
            if (presentFutureDiv == null)
            {
                return;
            }

            var formRows = presentFutureDiv.QuerySelectorAll("tr");
            if (formRows.Length == 0)
            {
                return;
            }

            var headerCells = formRows[0].Children;

            for (int i = 1; i < formRows.Length; i++)
            {
                var cells = formRows[i].Children;
                if (cells.Length < 2)
                {
                    continue;
                }

                var conjugationType = MapConjegationType(cells[0].TextContent);

                for (int j = 1; j < cells.Length; j++)
                {
                    // Trim before comparing so a padded "-" placeholder is not stored as a word form
                    // (same rule as in the declension table).
                    var cellText = cells[j].TextContent.Trim();
                    if (cellText == "-")
                    {
                        continue;
                    }

                    var headerText = j < headerCells.Length
                        ? headerCells[j].TextContent.Trim().ToLowerInvariant()
                        : string.Empty;

                    AppendWordForm(wordDefinition, cellText, headerText + " " + conjugationType);
                }
            }
        }

        /// <summary>
        /// Returns the lower-cased, trimmed text of the cell's "span.long" child when present, otherwise of
        /// the cell itself; an empty string when the cell is missing.
        /// </summary>
        private static string DescribeCell(IElement cell)
        {
            if (cell == null)
            {
                return(string.Empty);
            }

            var source = cell.QuerySelector("span.long") ?? cell;
            return(source.TextContent.Trim().ToLowerInvariant());
        }

        /// <summary>
        /// Appends one form to the definition, storing both the stressed spelling and the spelling with
        /// stress marks removed.
        /// </summary>
        private static void AppendWordForm(WordDefinition wordDefinition, string stressedWord, string formDescription)
        {
            wordDefinition.WordForms.Add(new WordForm
            {
                Word            = stressedWord.RemoveStressMarks(),
                StressedWord    = stressedWord,
                FormDescription = formDescription
            });
        }