/// <summary>
/// Produces a textual description of an HTML or XHTML link from the document's title.
/// </summary>
/// <param name="link">The fetched link, including its content type and raw response bytes.</param>
/// <returns>
/// <c>null</c> if the media type is neither <c>text/html</c> nor <c>application/xhtml+xml</c>;
/// otherwise a success result whose text comes from the first <c>title</c> element, or failing that
/// the first <c>h1</c> element, or a fixed placeholder when the document has neither.
/// </returns>
public LinkAndInfo ResolveLink(LinkToResolve link)
{
    var mediaType = link.ContentType.MediaType;
    bool isHtml = mediaType == "text/html" || mediaType == "application/xhtml+xml";
    if (!isHtml)
    {
        // not a document type handled here
        return null;
    }

    // decode the body and parse it as HTML
    var document = new HtmlDocument();
    document.LoadHtml(EncodingGuesser.GuessEncodingAndDecode(link.ResponseBytes, link.ContentType));

    // the title wins; the first-level heading is only looked up when there is no title
    var headingNode = document.DocumentNode.SelectSingleNode(".//title")
        ?? document.DocumentNode.SelectSingleNode(".//h1");
    if (headingNode == null)
    {
        return link.ToResult(FetchErrorLevel.Success, "(HTML without a title O_o)");
    }

    // resolve entities first, then normalize the whitespace and strip the ends
    string decodedText = HtmlEntity.DeEntitize(headingNode.InnerText);
    string description = FoldWhitespace(decodedText).Trim();
    return link.ToResult(FetchErrorLevel.Success, description);
}
/// <summary>
/// Downloads one page of the index and stores every entry row found on it in <c>EntryCache</c>.
/// </summary>
/// <remarks>
/// Rows are read from the element whose ID is <c>zebra</c>. A row is skipped unless it has exactly
/// five <c>th</c> child cells and its first cell parses as a digits-only integer ID. If the page has
/// no such element, or the element contains no rows, nothing is cached and the method returns
/// normally instead of failing on a null node collection.
/// </remarks>
/// <param name="page">Page number substituted into <c>Z0rIndexUriFormat</c>.</param>
void LoadFromPage(long page)
{
    var pageUri = new Uri(string.Format(CultureInfo.InvariantCulture, Z0rIndexUriFormat, page));

    // fetch the raw page
    byte[] indexPageBytes;
    using (var client = new HttpClient())
    using (var request = new HttpRequestMessage(HttpMethod.Get, pageUri))
    {
        client.Timeout = TimeSpan.FromSeconds(LinkInfoConfig.TimeoutSeconds);
        request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);

        using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
        {
            indexPageBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
        }
    }

    string indexPageString = EncodingGuesser.GuessEncodingAndDecode(indexPageBytes, null);
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(indexPageString);

    // GetElementbyId returns null when the table is missing, and SelectNodes returns null
    // (not an empty collection) when no row matches; treat both as "no rows".
    HtmlNode indexTable = htmlDoc.GetElementbyId("zebra");
    IEnumerable<HtmlNode> foundRows = indexTable?.SelectNodes(".//tr")?.OfType<HtmlNode>()
        ?? Enumerable.Empty<HtmlNode>();

    foreach (HtmlNode foundRow in foundRows)
    {
        // NOTE(review): only th cells are considered; confirm the data rows really use th and not td.
        List<HtmlNode> cells = foundRow
            .ChildNodes
            .OfType<HtmlNode>()
            .Where(n => n.Name == "th")
            .ToList();
        if (cells.Count != 5)
        {
            continue;
        }

        // rows whose first cell is not a plain number (e.g. a header row) are skipped
        string idString = TrimmedInnerTextOrNull(cells[0]);
        long id;
        if (!long.TryParse(idString, NumberStyles.None, CultureInfo.InvariantCulture, out id))
        {
            continue;
        }

        string artist = TrimmedInnerTextOrNull(cells[1]);
        string song = TrimmedInnerTextOrNull(cells[2]);
        string image = TrimmedInnerTextOrNull(cells[3]);
        string tag = TrimmedInnerTextOrNull(cells[4]);

        EntryCache[id] = new Z0rEntry(id, artist, song, image, tag);
    }
}
/// <summary>
/// Downloads the index homepage and determines the highest page number linked from it.
/// </summary>
/// <returns>
/// The largest page number among all anchors whose <c>href</c> matches <c>PageHrefPattern</c>, or
/// <c>null</c> if the page contains no anchors or none of them yields a parseable page number.
/// </returns>
long? ObtainMaxPageValue()
{
    // obtain the index homepage
    byte[] indexHomepageBytes;
    using (var client = new HttpClient())
    using (var request = new HttpRequestMessage(HttpMethod.Get, Z0rIndexHomepageUri))
    {
        client.Timeout = TimeSpan.FromSeconds(LinkInfoConfig.TimeoutSeconds);
        request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);

        using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
        {
            indexHomepageBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
        }
    }

    string indexHomepageString = EncodingGuesser.GuessEncodingAndDecode(indexHomepageBytes, null);

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(indexHomepageString);

    // SelectNodes returns null (not an empty collection) when the document has no anchors;
    // fall back to an empty sequence so the method reports "no maximum" instead of throwing.
    IEnumerable<HtmlNode> anchors = htmlDoc.DocumentNode
        .SelectNodes(".//a")
        ?.OfType<HtmlNode>()
        ?? Enumerable.Empty<HtmlNode>();

    IEnumerable<Match> foundLinkMatches = anchors
        .Select(n => PageHrefPattern.Match(n.GetAttributeValue("href", "")))
        .Where(m => m.Success);

    long? currentMaxPage = null;
    foreach (Match foundLinkMatch in foundLinkMatches)
    {
        long page;
        if (!long.TryParse(foundLinkMatch.Groups["page"].Value, NumberStyles.None, CultureInfo.InvariantCulture, out page))
        {
            // the captured text is not a plain digit sequence that fits into a long
            continue;
        }

        if (!currentMaxPage.HasValue || currentMaxPage.Value < page)
        {
            currentMaxPage = page;
        }
    }

    return currentMaxPage;
}
/// <summary>
/// Attempts to enrich a type description with a hint obtained from a reverse image search.
/// </summary>
/// <remarks>
/// The image search page is requested first and its response discarded, then the search-by-image
/// URL is fetched with the same client. The result page is scanned for elements carrying the class
/// <c>_hUb</c>; the text of the first descendant carrying the class <c>_gUb</c> is used as the hint.
/// Every failure is handled here: a timeout is ignored silently, any other exception is logged as a
/// warning, and in both cases the unmodified description is returned.
/// </remarks>
/// <param name="link">The link whose absolute URI is submitted to the image search.</param>
/// <param name="typeDescription">The base description to return, with or without a hint.</param>
/// <returns>
/// <c>"{typeDescription} ({hint})"</c> if a hint was found; otherwise <paramref name="typeDescription"/>.
/// </returns>
string ResolveLinkText(LinkToResolve link, string typeDescription)
{
    try
    {
        // the same client is used for both requests and is disposed when this block ends
        using (var client = new HttpClient())
        {
            client.Timeout = TimeSpan.FromSeconds(Config.ImageInfoTimeoutSeconds);

            var googleImageSearchUrl = new Uri(string.Format(GoogleImageSearchUrlPattern, LinkInfoConfig.GoogleDomain));

            // alibi-visit the image search page to get the cookies
            using (var request = new HttpRequestMessage(HttpMethod.Get, googleImageSearchUrl))
            {
                request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                request.Headers.Referrer = new Uri(string.Format(GoogleHomepageUrlPattern, LinkInfoConfig.GoogleDomain));

                using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                {
                    response.Content.ReadAsByteArrayAsync().SyncWait();
                }
            }

            // fetch the actual info
            // NOTE(review): the image URI is inserted as-is; confirm the pattern does not expect it escaped.
            var searchUrl = new Uri(string.Format(
                GoogleImageSearchByImageUrlPattern,
                LinkInfoConfig.GoogleDomain,
                link.Link.AbsoluteUri
            ));
            byte[] responseBytes;
            using (var request = new HttpRequestMessage(HttpMethod.Get, searchUrl))
            {
                request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                request.Headers.Referrer = googleImageSearchUrl;

                using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                {
                    responseBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
                }
            }

            var parseMe = EncodingGuesser.GuessEncodingAndDecode(responseBytes, null);

            // optionally keep the raw response on disk for debugging
            if (Config.DumpImageResultsFileName != null)
            {
                using (var dumpy = File.Open(Path.Combine(SharpIrcBotUtil.AppDirectory, Config.DumpImageResultsFileName), FileMode.Create, FileAccess.Write))
                {
                    dumpy.Write(responseBytes, 0, responseBytes.Length);
                }
            }

            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(parseMe);

            // SelectNodes returns null (not an empty collection) when nothing matches, so a missing
            // result is mapped to an empty sequence instead of surfacing as a logged exception.
            Func<HtmlNode, string, IEnumerable<HtmlNode>> descendantsWithClass = (root, className) =>
                (root.SelectNodes(".//*")?.OfType<HtmlNode>() ?? Enumerable.Empty<HtmlNode>())
                    .Where(n => n.GetAttributeValue("class", "").Split(' ').Contains(className));

            foreach (HtmlNode foundHub in descendantsWithClass(htmlDoc.DocumentNode, "_hUb"))
            {
                // the first matching descendant of the first hub that has one wins
                HtmlNode hint = descendantsWithClass(foundHub, "_gUb").FirstOrDefault();
                if (hint != null)
                {
                    return string.Format("{0} ({1})", typeDescription, HtmlEntity.DeEntitize(hint.InnerText));
                }
            }

            return typeDescription;
        }
    }
    catch (AggregateException ex) when (ex.InnerException is TaskCanceledException)
    {
        // timed out
        return typeDescription;
    }
    catch (Exception ex)
    {
        Logger.LogWarning("image info: {Exception}", ex);
        return typeDescription;
    }
}