/// <summary>
/// Describes an HTML or XHTML link using the text of the fetched document's
/// <c>title</c> element or, if there is none, of an <c>h1</c> element.
/// </summary>
/// <param name="link">The fetched link; its content type and response bytes are read.</param>
/// <returns>
/// <c>null</c> if the media type is neither <c>text/html</c> nor
/// <c>application/xhtml+xml</c>; otherwise a <see cref="FetchErrorLevel.Success"/> result
/// carrying the heading text, or a placeholder text when neither element exists.
/// </returns>
public LinkAndInfo ResolveLink(LinkToResolve link)
        {
            bool isHtml = link.ContentType.MediaType == "text/html"
                          || link.ContentType.MediaType == "application/xhtml+xml";
            if (!isHtml)
            {
                // not our kind of document
                return null;
            }

            // decode the body and hand it to the HTML parser
            var decodedBody = EncodingGuesser.GuessEncodingAndDecode(link.ResponseBytes, link.ContentType);
            var document = new HtmlDocument();
            document.LoadHtml(decodedBody);

            // candidate elements in order of preference; the first one present wins
            foreach (string xpath in new[] { ".//title", ".//h1" })
            {
                var heading = document.DocumentNode.SelectSingleNode(xpath);
                if (heading == null)
                {
                    continue;
                }

                var headingText = FoldWhitespace(HtmlEntity.DeEntitize(heading.InnerText)).Trim();
                return link.ToResult(FetchErrorLevel.Success, headingText);
            }

            return link.ToResult(FetchErrorLevel.Success, "(HTML without a title O_o)");
        }
Example #2
0
        /// <summary>
        /// Downloads the index page with the given number (URI built from
        /// <c>Z0rIndexUriFormat</c>), parses the table with the ID <c>zebra</c> and stores
        /// one <c>Z0rEntry</c> per recognized row in <c>EntryCache</c>, keyed by entry ID.
        /// </summary>
        /// <remarks>
        /// Rows that do not have exactly five <c>th</c> child cells, or whose first cell
        /// is not a plain sequence of digits, are skipped. If the page contains no
        /// <c>zebra</c> table or the table has no rows, nothing is cached.
        /// </remarks>
        /// <param name="page">The page number substituted into <c>Z0rIndexUriFormat</c>.</param>
        void LoadFromPage(long page)
        {
            var pageUri = new Uri(string.Format(CultureInfo.InvariantCulture, Z0rIndexUriFormat, page));

            // fetch the raw page
            byte[] indexPageBytes;
            using (var client = new HttpClient())
                using (var request = new HttpRequestMessage(HttpMethod.Get, pageUri))
                {
                    client.Timeout = TimeSpan.FromSeconds(LinkInfoConfig.TimeoutSeconds);
                    request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);

                    using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                    {
                        indexPageBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
                    }
                }
            string indexPageString = EncodingGuesser.GuessEncodingAndDecode(indexPageBytes, null);

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(indexPageString);

            HtmlNode indexTable = htmlDoc.GetElementbyId("zebra");
            if (indexTable == null)
            {
                // the page has no index table (e.g. an error page); nothing to load
                return;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches
            var rowNodes = indexTable.SelectNodes(".//tr");
            if (rowNodes == null)
            {
                return;
            }

            IEnumerable <HtmlNode> foundRows = rowNodes.OfType <HtmlNode>();

            foreach (HtmlNode foundRow in foundRows)
            {
                List <HtmlNode> cells = foundRow
                                        .ChildNodes
                                        .OfType <HtmlNode>()
                                        .Where(n => n.Name == "th")
                                        .ToList();
                if (cells.Count != 5)
                {
                    // not an entry row
                    continue;
                }

                string idString = TrimmedInnerTextOrNull(cells[0]);

                // NumberStyles.None: digits only, no sign or whitespace
                long id;
                if (!long.TryParse(idString, NumberStyles.None, CultureInfo.InvariantCulture, out id))
                {
                    continue;
                }

                string artist = TrimmedInnerTextOrNull(cells[1]);
                string song   = TrimmedInnerTextOrNull(cells[2]);
                string image  = TrimmedInnerTextOrNull(cells[3]);
                string tag    = TrimmedInnerTextOrNull(cells[4]);

                EntryCache[id] = new Z0rEntry(id, artist, song, image, tag);
            }
        }
Example #3
0
        /// <summary>
        /// Downloads the index homepage (<c>Z0rIndexHomepageUri</c>) and determines the
        /// highest page number referenced by any of its links.
        /// </summary>
        /// <returns>
        /// The largest value of the <c>page</c> group among all anchor <c>href</c>
        /// attributes matching <c>PageHrefPattern</c>, or <c>null</c> if the page
        /// contains no anchors or none of them yields a parsable page number.
        /// </returns>
        long? ObtainMaxPageValue()
        {
            // obtain the index homepage
            byte[] indexHomepageBytes;
            using (var client = new HttpClient())
                using (var request = new HttpRequestMessage(HttpMethod.Get, Z0rIndexHomepageUri))
                {
                    client.Timeout = TimeSpan.FromSeconds(LinkInfoConfig.TimeoutSeconds);
                    request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);

                    using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                    {
                        indexHomepageBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
                    }
                }
            string indexHomepageString = EncodingGuesser.GuessEncodingAndDecode(indexHomepageBytes, null);

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(indexHomepageString);

            // SelectNodes returns null (not an empty collection) when nothing matches
            var anchorNodes = htmlDoc.DocumentNode.SelectNodes(".//a");
            if (anchorNodes == null)
            {
                // no links at all, hence no known maximum page
                return null;
            }

            IEnumerable <Match> foundLinkMatches = anchorNodes
                                                   .OfType <HtmlNode>()
                                                   .Select(n => PageHrefPattern.Match(n.GetAttributeValue("href", "")))
                                                   .Where(m => m.Success);

            long? currentMaxPage = null;

            foreach (Match foundLinkMatch in foundLinkMatches)
            {
                // NumberStyles.None: digits only, no sign or whitespace
                long page;
                if (!long.TryParse(foundLinkMatch.Groups["page"].Value, NumberStyles.None, CultureInfo.InvariantCulture, out page))
                {
                    continue;
                }

                if (!currentMaxPage.HasValue || currentMaxPage.Value < page)
                {
                    currentMaxPage = page;
                }
            }
            return currentMaxPage;
        }
Example #4
0
        /// <summary>
        /// Tries to enrich a type description with a hint obtained from a Google
        /// reverse image search for the given link.
        /// </summary>
        /// <remarks>
        /// The image search page is visited first so that the actual search request
        /// carries a plausible referrer. If <c>Config.DumpImageResultsFileName</c> is set,
        /// the raw search response is written to that file in the application directory.
        /// This method does not throw: on a timeout, or on any other exception (which is
        /// logged as a warning), the unmodified <paramref name="typeDescription"/> is returned.
        /// </remarks>
        /// <param name="link">The link whose absolute URI is submitted to the image search.</param>
        /// <param name="typeDescription">The base description to return or to extend.</param>
        /// <returns>
        /// <c>"{typeDescription} ({hint})"</c> if the result page contains an element with
        /// the class <c>_gUb</c> nested inside an element with the class <c>_hUb</c>;
        /// otherwise <paramref name="typeDescription"/>.
        /// </returns>
        string ResolveLinkText(LinkToResolve link, string typeDescription)
        {
            try
            {
                // dispose the client once both requests are done; otherwise every
                // call leaks its handler and connections
                using (var client = new HttpClient())
                {
                    client.Timeout = TimeSpan.FromSeconds(Config.ImageInfoTimeoutSeconds);

                    var googleImageSearchUrl = new Uri(string.Format(GoogleImageSearchUrlPattern, LinkInfoConfig.GoogleDomain));

                    // alibi-visit the image search page to get the cookies
                    using (var request = new HttpRequestMessage(HttpMethod.Get, googleImageSearchUrl))
                    {
                        request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                        request.Headers.Referrer = new Uri(string.Format(GoogleHomepageUrlPattern, LinkInfoConfig.GoogleDomain));

                        using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                        {
                            // body is read and discarded; only the visit itself matters
                            response.Content.ReadAsByteArrayAsync().SyncWait();
                        }
                    }

                    // fetch the actual info
                    // NOTE(review): the link URI is inserted as-is; if the pattern places it
                    // in a query-string value it may need Uri.EscapeDataString -- confirm
                    // against GoogleImageSearchByImageUrlPattern
                    var searchUrl = new Uri(string.Format(
                                                GoogleImageSearchByImageUrlPattern,
                                                LinkInfoConfig.GoogleDomain,
                                                link.Link.AbsoluteUri
                                                ));
                    byte[] responseBytes;
                    using (var request = new HttpRequestMessage(HttpMethod.Get, searchUrl))
                    {
                        request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                        request.Headers.Referrer = googleImageSearchUrl;

                        using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                        {
                            responseBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
                        }
                    }
                    var parseMe = EncodingGuesser.GuessEncodingAndDecode(responseBytes, null);

                    if (Config.DumpImageResultsFileName != null)
                    {
                        using (var dumpy = File.Open(Path.Combine(SharpIrcBotUtil.AppDirectory, Config.DumpImageResultsFileName), FileMode.Create, FileAccess.Write))
                        {
                            dumpy.Write(responseBytes, 0, responseBytes.Length);
                        }
                    }

                    var htmlDoc = new HtmlDocument();
                    htmlDoc.LoadHtml(parseMe);

                    // SelectNodes returns null (not an empty collection) when nothing matches
                    var allElements = htmlDoc.DocumentNode.SelectNodes(".//*");
                    if (allElements == null)
                    {
                        return typeDescription;
                    }

                    IEnumerable <HtmlNode> foundHubs = allElements
                                                       .OfType <HtmlNode>()
                                                       .Where(n => n.GetAttributeValue("class", "").Split(' ').Contains("_hUb"));
                    foreach (HtmlNode foundHub in foundHubs)
                    {
                        var hubDescendants = foundHub.SelectNodes(".//*");
                        if (hubDescendants == null)
                        {
                            // a hub without child elements cannot contain a hint;
                            // keep looking in the remaining hubs instead of failing
                            continue;
                        }

                        HtmlNode hint = hubDescendants
                                        .OfType <HtmlNode>()
                                        .FirstOrDefault(n => n.GetAttributeValue("class", "").Split(' ').Contains("_gUb"));
                        if (hint != null)
                        {
                            return string.Format("{0} ({1})", typeDescription, HtmlEntity.DeEntitize(hint.InnerText));
                        }
                    }
                    return typeDescription;
                }
            }
            catch (AggregateException ex) when (ex.InnerException is TaskCanceledException)
            {
                // timed out
                return typeDescription;
            }
            catch (Exception ex)
            {
                Logger.LogWarning("image info: {Exception}", ex);
                return typeDescription;
            }
        }