/// <summary>
/// Recursively searches for the target page, starting at <paramref name="checkedDocument"/>
/// and following the links selected for the detected page type.
/// </summary>
/// <param name="checkedDocument">The page (URI plus parsed HTML) to inspect.</param>
/// <returns>
/// The first document classified as <see cref="PageTypes.TargetPage"/>, or <c>null</c> when the
/// target page was not found or this page has already been checked.
/// </returns>
public async Task<HtmlUriDocument> SearchTargetPageFromThisPage(HtmlUriDocument checkedDocument)
{
    // This document has already been checked - stop here.
    if (_history.Any(u => u == checkedDocument.Uri))
    {
        return null;
    }

    _history.Add(checkedDocument.Uri);

    var pageType = GetPageType(checkedDocument);
    _logger.Debug($"{checkedDocument.Uri} - {pageType}");

    List<string> keywords;
    switch (pageType)
    {
        case PageTypes.TargetPage:
            return checkedDocument;

        case PageTypes.LinkToDisambugationPage:
            keywords = new List<string> { "(disambiguation)" };
            break;

        case PageTypes.DisambugationPage:
        case PageTypes.LinkToSpacePage:
            keywords = new List<string> { "moon", "planet", "asteroid", "comet" };
            break;

        default:
            // ResultNotFoundPage, UnknownPage, AnotherDomen and any page type not listed above
            // are dead ends: there are no keywords to follow, so never pass null to the parser.
            return null;
    }

    foreach (var uri in _parser.ExtractLinksByInnerText(checkedDocument.Document, keywords))
    {
        // The recursive call would return null immediately for a visited URI,
        // so skip it before paying for the download.
        if (_history.Any(u => u == uri))
        {
            continue;
        }

        var nextDocument = new HtmlUriDocument
        {
            Uri = uri,
            Document = await _parser.GetHtmlDocumentByUriAsync(uri)
        };

        var resultDocument = await SearchTargetPageFromThisPage(nextDocument);
        if (resultDocument != null)
        {
            return resultDocument;
        }
    }

    return null;
}
/// <summary>
/// Downloads the search page for the given space object name, which is the starting point
/// of the page search.
/// </summary>
/// <param name="spaceObjectName">
/// Name to search for. It is percent-encoded before being placed in the query string;
/// <c>null</c> is treated as an empty search term.
/// </param>
/// <returns>The search URI together with the HTML document loaded from it.</returns>
public async Task<HtmlUriDocument> GetInitialPage(string spaceObjectName)
{
    // Escape the name so characters such as '&', '#', '+' or '=' cannot break or truncate
    // the query string.
    var escapedName = Uri.EscapeDataString(spaceObjectName ?? string.Empty);
    var uri = new Uri(_baseUri, $"/w/index.php?search={escapedName}&title=Special%3ASearch&go=Go");

    return new HtmlUriDocument
    {
        Uri = uri,
        Document = await _parser.GetHtmlDocumentByUriAsync(uri)
    };
}
//=============================================================================================================================================
/// <summary>
/// Reads the <c>src</c> attribute of the first image found inside the first link of the
/// first <c>td</c> element with <c>colspan='2'</c>.
/// </summary>
/// <param name="document">The page to look the image up in.</param>
/// <returns>
/// The value of the image's <c>src</c> attribute, or <c>null</c> when any step of the lookup
/// (cell, link, image, attribute) finds nothing.
/// </returns>
public string GetSpaceObjectImageLink(HtmlUriDocument document)
{
    var cell = document.Document.DocumentNode.SelectSingleNode("//td[@colspan='2']");
    var link = cell?.SelectSingleNode(".//a");
    var image = link?.SelectSingleNode(".//img");
    var source = image?.Attributes["src"];

    // A missing image is reported as null; a previously considered alternative was throwing
    // an exception saying that no image of the space object was found on the page.
    return source?.Value;
}
/// <summary>
/// Classifies a downloaded page so the search knows whether to stop or which links to follow.
/// </summary>
/// <remarks>
/// Checks run in this order and the first match wins:
/// target page (at least three characteristic keywords, or an "Astronomical objects" category link),
/// foreign domain, disambiguation page, link to a disambiguation page, link to a space object page,
/// "nothing found" page. Anything else is <see cref="PageTypes.UnknownPage"/>.
/// A parser lookup counts as a match when it returns a non-null result.
/// </remarks>
/// <param name="checkedDocument">The page (URI plus parsed HTML) to classify.</param>
/// <returns>The detected page type.</returns>
public PageTypes GetPageType(HtmlUriDocument checkedDocument)
{
    var targetKeywords = new[]
    {
        "Eccentricity", "Volume", "Mass", "Orbital", "Temperature", "Semi-major", "anomaly"
    };

    int matchedKeywords = targetKeywords.Count(
        keyword => _parser.FindNodesByTagAndInnerText(checkedDocument.Document, keyword) != null);

    if (matchedKeywords >= 3)
    {
        return PageTypes.TargetPage;
    }

    // Look for the category link only inside the category list items. The previous chain ended
    // with an absolute "//a" step, which searched the whole document instead of the list item.
    var astronomyCategoryLink = checkedDocument.Document.DocumentNode.SelectSingleNode(
        "//div[@class='catlinks']//ul//li//a[text()[contains(., 'Astronomical objects')]]");

    if (astronomyCategoryLink != null)
    {
        return PageTypes.TargetPage;
    }

    if (checkedDocument.Uri.GetLeftPart(System.UriPartial.Authority) != _baseUri.GetLeftPart(System.UriPartial.Authority))
    {
        return PageTypes.AnotherDomen;
    }

    if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, "Disambiguation pages") != null)
    {
        return PageTypes.DisambugationPage;
    }

    if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, "(disambiguation)") != null)
    {
        return PageTypes.LinkToDisambugationPage;
    }

    // Title suffixes that mark a link to a page about a concrete space object.
    var spaceObjectSuffixes = new[] { "(moon)", "(planet)", "(asteroid)", "(comet)" };
    foreach (var suffix in spaceObjectSuffixes)
    {
        if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, suffix) != null)
        {
            return PageTypes.LinkToSpacePage;
        }
    }

    // Runtime text of the "nothing found" message; it must stay exactly as the site renders it.
    if (_parser.FindNodesByTagAndInnerText(checkedDocument.Document, "К сожалению, по вашему запросу ничего не найдено...") != null)
    {
        return PageTypes.ResultNotFoundPage;
    }

    return PageTypes.UnknownPage;
}