Example #1
0
        /// <summary>
        /// Searches depth-first for the target page, starting at <paramref name="checkedDocument"/>
        /// and following the links whose inner text matches keywords chosen by the page's type.
        /// </summary>
        /// <param name="checkedDocument">Page to classify and, when it is an intermediate page, to expand.</param>
        /// <returns>
        /// The first document classified as <see cref="PageTypes.TargetPage"/>, or <c>null</c> if the
        /// target page was not found from this page (this includes pages that were already visited).
        /// </returns>
        public async Task<HtmlUriDocument> SearchTargetPageFromThisPage(HtmlUriDocument checkedDocument)
        {
            // Was this document already checked? Then there is nothing new to find here.
            if (_history.Any(u => u == checkedDocument.Uri))
            {
                return null;
            }
            _history.Add(checkedDocument.Uri);

            var pageType = GetPageType(checkedDocument);
            _logger.Debug($"{checkedDocument.Uri} - {pageType}");

            // Pick the link-text keywords to follow; terminal page types return immediately.
            List<string> keywords;
            switch (pageType)
            {
                case PageTypes.TargetPage:
                    return checkedDocument;

                case PageTypes.DisambugationPage:
                case PageTypes.LinkToSpacePage:
                    keywords = new List<string> { "moon", "planet", "asteroid", "comet" };
                    break;

                case PageTypes.LinkToDisambugationPage:
                    keywords = new List<string> { "(disambiguation)" };
                    break;

                default:
                    // ResultNotFoundPage, UnknownPage, AnotherDomen and any page type this method
                    // does not know how to expand are dead ends. Returning here also guarantees
                    // that a null keyword list is never handed to the parser.
                    return null;
            }

            foreach (var uri in _parser.ExtractLinksByInnerText(checkedDocument.Document, keywords))
            {
                // The recursive call would reject a visited URI anyway; checking first avoids
                // downloading the page again just to throw it away.
                if (_history.Any(u => u == uri))
                {
                    continue;
                }

                var nextDocument = new HtmlUriDocument
                {
                    Uri = uri,
                    Document = await _parser.GetHtmlDocumentByUriAsync(uri)
                };

                var found = await SearchTargetPageFromThisPage(nextDocument);
                if (found != null)
                {
                    return found;
                }
            }

            return null;
        }
Example #2
0
        /// <summary>
        /// Builds the search URL for <paramref name="spaceObjectName"/> relative to the base URI
        /// and downloads the page it points to.
        /// </summary>
        /// <param name="spaceObjectName">
        /// Name to search for. It is percent-encoded before being placed in the query string;
        /// <c>null</c> is treated as an empty search term.
        /// </param>
        /// <returns>The search URI together with the HTML document fetched from it.</returns>
        public async Task<HtmlUriDocument> GetInitialPage(string spaceObjectName)
        {
            // Escape the name so characters such as '&', '#', '+' or '=' cannot terminate or
            // alter the "search" query parameter.
            var searchTerm = Uri.EscapeDataString(spaceObjectName ?? string.Empty);
            var uri = new Uri(_baseUri, $"/w/index.php?search={searchTerm}&title=Special%3ASearch&go=Go");

            return new HtmlUriDocument
            {
                Uri = uri,
                Document = await _parser.GetHtmlDocumentByUriAsync(uri)
            };
        }
Example #3
0
        //=============================================================================================================================================

        /// <summary>
        /// Reads the <c>src</c> attribute of an image nested as
        /// <c>td[@colspan='2']</c> → <c>a</c> → <c>img</c>, taking the first match at every step.
        /// </summary>
        /// <param name="document">Page whose parsed HTML is inspected.</param>
        /// <returns>
        /// The value of the <c>src</c> attribute, or <c>null</c> when the cell, the link, the image
        /// or the attribute is missing. A missing image is reported as <c>null</c>, not by throwing.
        /// </returns>
        public string GetSpaceObjectImageLink(HtmlUriDocument document)
        {
            var cell      = document.Document.DocumentNode.SelectSingleNode("//td[@colspan='2']");
            var link      = cell?.SelectSingleNode(".//a");
            var image     = link?.SelectSingleNode(".//img");
            var srcAttr   = image?.Attributes["src"];

            return srcAttr?.Value;
        }
Example #4
0
        /// <summary>
        /// Classifies a fetched page by probing its HTML for characteristic markers.
        /// The checks run in a fixed order and the first one that matches decides the result.
        /// </summary>
        /// <param name="checkedDocument">Page (URI plus parsed HTML) to classify.</param>
        /// <returns>
        /// <see cref="PageTypes.TargetPage"/> when enough target keywords or the category link are
        /// present; otherwise one of the intermediate or dead-end page types, falling back to
        /// <see cref="PageTypes.UnknownPage"/> when nothing matches.
        /// </returns>
        public PageTypes GetPageType(HtmlUriDocument checkedDocument)
        {
            // NOTE(review): every text probe below treats a non-null result of
            // FindNodesByTagAndInnerText as "found". If that helper returns an empty collection
            // instead of null for "no match", each probe would succeed - confirm its contract.

            // A page mentioning at least this many of the keywords is considered the target page.
            const int targetKeywordThreshold = 3;
            var targetKeywords = new[]
            {
                "Eccentricity", "Volume", "Mass", "Orbital", "Temperature", "Semi-major", "anomaly"
            };

            int matchedKeywords = 0;
            foreach (var keyword in targetKeywords)
            {
                if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, keyword) != null)
                {
                    matchedKeywords++;
                }
            }
            if (matchedKeywords >= targetKeywordThreshold)
            {
                return PageTypes.TargetPage;
            }

            // Look for the category link inside the "catlinks" block only. The whole path is one
            // XPath rooted at that div: a nested query starting with "//a" would be absolute and
            // match links anywhere in the document, not just category links.
            var categoryLink = checkedDocument.Document.DocumentNode.SelectSingleNode(
                "//div[@class='catlinks']//ul//li//a[text()[contains(., 'Astronomical objects')]]");
            if (categoryLink != null)
            {
                return PageTypes.TargetPage;
            }

            // Pages served from a different scheme/host/port than the base URI are not followed.
            if (checkedDocument.Uri.GetLeftPart(System.UriPartial.Authority) != _baseUri.GetLeftPart(System.UriPartial.Authority))
            {
                return PageTypes.AnotherDomen;
            }

            if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, "Disambiguation pages") != null)
            {
                return PageTypes.DisambugationPage;
            }

            if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, "(disambiguation)") != null)
            {
                return PageTypes.LinkToDisambugationPage;
            }

            // Any of these parenthesised qualifiers marks a page that links to a space-object page.
            var spacePageMarkers = new[] { "(moon)", "(planet)", "(asteroid)", "(comet)" };
            foreach (var marker in spacePageMarkers)
            {
                if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, marker) != null)
                {
                    return PageTypes.LinkToSpacePage;
                }
            }

            // The literal is Russian for "Unfortunately, nothing was found for your query...".
            // NOTE(review): this only matches a Russian-language "no results" page - confirm that
            // this is the text the site behind _baseUri actually emits.
            if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, "К сожалению, по вашему запросу ничего не найдено...") != null)
            {
                return PageTypes.ResultNotFoundPage;
            }

            return PageTypes.UnknownPage;
        }