/// <summary>
/// Describes a fetched HTML or XHTML document by the text of its <c>title</c> element or,
/// failing that, of its first <c>h1</c> element.
/// </summary>
/// <param name="link">The fetched link; its response bytes and content type are read.</param>
/// <returns>
/// <c>null</c> if the content type is missing or is neither <c>text/html</c> nor
/// <c>application/xhtml+xml</c>; otherwise a successful result carrying the entity-decoded,
/// whitespace-folded title text, or a placeholder if no non-empty title or heading exists.
/// </returns>
public LinkAndInfo ResolveLink(LinkToResolve link)
        {
            // a missing content type is treated like any other non-HTML type
            string mediaType = link.ContentType?.MediaType;

            // media type names are case-insensitive, so compare them that way
            bool isHtml =
                string.Equals(mediaType, "text/html", System.StringComparison.OrdinalIgnoreCase)
                || string.Equals(mediaType, "application/xhtml+xml", System.StringComparison.OrdinalIgnoreCase);
            if (!isHtml)
            {
                return null;
            }

            // HTML? parse it and get the title
            var respStr = EncodingGuesser.GuessEncodingAndDecode(link.ResponseBytes, link.ContentType);
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(respStr);

            // prefer <title>; fall back to the first <h1>
            foreach (string xpath in new[] { ".//title", ".//h1" })
            {
                var element = htmlDoc.DocumentNode.SelectSingleNode(xpath);
                if (element == null)
                {
                    continue;
                }

                string text = FoldWhitespace(HtmlEntity.DeEntitize(element.InnerText)).Trim();
                if (text.Length > 0)
                {
                    return link.ToResult(FetchErrorLevel.Success, text);
                }
                // an empty element carries no information; try the next candidate
            }

            return link.ToResult(FetchErrorLevel.Success, "(HTML without a title O_o)");
        }
Example #2
0
        /// <summary>
        /// Describes a link whose media type is listed in <c>DetectedMimeTypes</c>.
        /// </summary>
        /// <param name="link">The fetched link; its content type selects the description.</param>
        /// <returns>
        /// <c>null</c> if the link has no media type or the media type is not in
        /// <c>DetectedMimeTypes</c>; otherwise a successful result whose text comes from
        /// <c>ResolveLinkText</c>.
        /// </returns>
        public LinkAndInfo ResolveLink(LinkToResolve link)
        {
            string mediaType = link.ContentType?.MediaType;
            string typeDescription;

            // no media type at all, or one this resolver does not know: not ours to handle
            if (mediaType == null || !DetectedMimeTypes.TryGetValue(mediaType, out typeDescription))
            {
                return null;
            }

            return link.ToResult(FetchErrorLevel.Success, ResolveLinkText(link, typeDescription));
        }
Example #3
0
        /// <summary>
        /// Describes a link matching <c>Z0rUrlPattern</c> by looking up its numeric ID in
        /// <c>EntryCache</c>, loading the relevant index page first if necessary.
        /// </summary>
        /// <param name="link">The link to resolve; only its absolute URI is inspected.</param>
        /// <returns>
        /// <c>null</c> if the URI does not match the pattern or its ID cannot be parsed as a
        /// <see cref="long"/>; a successful result with the formatted entry when the entry is
        /// known or lies beyond the last index page; a transient-error result when the index
        /// page list or the entry itself could not be obtained.
        /// </returns>
        public LinkAndInfo ResolveLink(LinkToResolve link)
        {
            Match urlMatch = Z0rUrlPattern.Match(link.Link.AbsoluteUri);
            if (!urlMatch.Success)
            {
                // not a URL this resolver understands
                return null;
            }

            // extract the numeric ID; parsing fails e.g. when there are too many digits
            long id;
            bool idParsed = long.TryParse(
                urlMatch.Groups["id"].Value, NumberStyles.None, CultureInfo.InvariantCulture, out id
            );
            if (!idParsed)
            {
                return null;
            }

            // fast path: the entry is already cached
            Z0rEntry found;
            if (EntryCache.TryGetValue(id, out found))
            {
                return link.ToResult(FetchErrorLevel.Success, FormatEntry(found));
            }

            Z0rRange idRange = RangeForID(id);

            // the number of index pages is obtained on first use
            if (!MaxPage.HasValue)
            {
                MaxPage = ObtainMaxPageValue();
            }
            if (!MaxPage.HasValue)
            {
                string listFailure = string.Format(
                    CultureInfo.InvariantCulture, "z0r #{0}; fetching index page list failed", id
                );
                return link.ToResult(FetchErrorLevel.TransientError, listFailure);
            }

            if (idRange.Page > MaxPage)
            {
                // the index does not reach this page; report an entry without details
                var bareEntry = new Z0rEntry(id, null, null, null, null);
                return link.ToResult(FetchErrorLevel.Success, FormatEntry(bareEntry));
            }

            // load the index page, then retry the cache lookup
            LoadFromPage(idRange.Page);

            if (!EntryCache.TryGetValue(id, out found))
            {
                string fetchFailure = string.Format(
                    CultureInfo.InvariantCulture, "z0r #{0}; fetching failed", id
                );
                return link.ToResult(FetchErrorLevel.TransientError, fetchFailure);
            }

            return link.ToResult(FetchErrorLevel.Success, FormatEntry(found));
        }
Example #4
0
        /// <summary>
        /// Tries to enrich a type description with a hint obtained from a Google
        /// search-by-image query for the link.
        /// </summary>
        /// <param name="link">The link whose absolute URI is submitted to the image search.</param>
        /// <param name="typeDescription">The plain description of the link's media type.</param>
        /// <returns>
        /// <c>"{typeDescription} ({hint})"</c> when the result page contains an element with the
        /// class <c>_gUb</c> nested inside an element with the class <c>_hUb</c>; otherwise
        /// <paramref name="typeDescription"/> unchanged. Timeouts and all other exceptions also
        /// yield the unchanged description (non-timeout exceptions are logged as warnings).
        /// </returns>
        string ResolveLinkText(LinkToResolve link, string typeDescription)
        {
            try
            {
                byte[] responseBytes;

                // the client is created per call, so release it (and its connections) when done
                using (var client = new HttpClient { Timeout = TimeSpan.FromSeconds(Config.ImageInfoTimeoutSeconds) })
                {
                    var googleImageSearchUrl = new Uri(string.Format(GoogleImageSearchUrlPattern, LinkInfoConfig.GoogleDomain));

                    // alibi-visit the image search page to get the cookies
                    using (var request = new HttpRequestMessage(HttpMethod.Get, googleImageSearchUrl))
                    {
                        request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                        request.Headers.Referrer = new Uri(string.Format(GoogleHomepageUrlPattern, LinkInfoConfig.GoogleDomain));

                        using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                        {
                            // the body is read to completion but its content is not needed
                            response.Content.ReadAsByteArrayAsync().SyncWait();
                        }
                    }

                    // fetch the actual info
                    // NOTE(review): the link URI is inserted into the pattern unescaped; if the
                    // pattern places it in a query parameter it probably needs
                    // Uri.EscapeDataString -- confirm against the pattern's definition.
                    var searchUrl = new Uri(string.Format(
                                                GoogleImageSearchByImageUrlPattern,
                                                LinkInfoConfig.GoogleDomain,
                                                link.Link.AbsoluteUri
                                                ));
                    using (var request = new HttpRequestMessage(HttpMethod.Get, searchUrl))
                    {
                        request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                        request.Headers.Referrer = googleImageSearchUrl;

                        using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                        {
                            responseBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
                        }
                    }
                }

                var parseMe = EncodingGuesser.GuessEncodingAndDecode(responseBytes, null);

                // optionally keep the raw response around for debugging
                if (Config.DumpImageResultsFileName != null)
                {
                    using (var dumpy = File.Open(Path.Combine(SharpIrcBotUtil.AppDirectory, Config.DumpImageResultsFileName), FileMode.Create, FileAccess.Write))
                    {
                        dumpy.Write(responseBytes, 0, responseBytes.Length);
                    }
                }

                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(parseMe);

                // SelectNodes yields null rather than an empty collection when nothing matches
                IEnumerable<HtmlNode> allNodes = htmlDoc.DocumentNode.SelectNodes(".//*");
                if (allNodes == null)
                {
                    return typeDescription;
                }

                foreach (HtmlNode hub in allNodes.Where(n => HasCssClass(n, "_hUb")))
                {
                    IEnumerable<HtmlNode> hubDescendants = hub.SelectNodes(".//*");
                    if (hubDescendants == null)
                    {
                        // this hub has no descendants; another one might
                        continue;
                    }

                    // the first hint found wins
                    HtmlNode hint = hubDescendants.FirstOrDefault(n => HasCssClass(n, "_gUb"));
                    if (hint != null)
                    {
                        return string.Format("{0} ({1})", typeDescription, HtmlEntity.DeEntitize(hint.InnerText));
                    }
                }

                return typeDescription;
            }
            catch (AggregateException ex) when (ex.InnerException is TaskCanceledException)
            {
                // timed out
                return typeDescription;
            }
            catch (Exception ex)
            {
                Logger.LogWarning("image info: {Exception}", ex);
                return typeDescription;
            }
        }

        /// <summary>
        /// Checks whether the space-separated <c>class</c> attribute of a node contains the given class name.
        /// </summary>
        static bool HasCssClass(HtmlNode node, string className)
        {
            return node.GetAttributeValue("class", "").Split(' ').Contains(className);
        }
Example #5
0
        /// <summary>
        /// Describes a TISS course or education-details link by fetching the course's number,
        /// type and German title from the TISS courses API.
        /// </summary>
        /// <param name="link">The link to resolve; its original (pre-redirect) URI is inspected.</param>
        /// <returns>
        /// <c>null</c> if the URI is not an HTTP(S) link to <c>TissHostname</c> with one of the
        /// two supported paths, or if the course number (or a usable semester value) is missing
        /// from the query string; a successful result with the course description when the API
        /// lookup works; a transient-error result on timeout or any other exception.
        /// </returns>
        public LinkAndInfo ResolveLink(LinkToResolve link)
        {
            Uri theLink = link.OriginalLinkOrLink;

            if (theLink.Scheme != "http" && theLink.Scheme != "https")
            {
                return null;
            }

            if (theLink.Host != TissHostname)
            {
                return null;
            }

            if (theLink.AbsolutePath != EducationDetailsPath && theLink.AbsolutePath != CourseDetailsPath)
            {
                return null;
            }

            StringValues values;
            IDictionary<string, StringValues> queryValues = QueryHelpers.ParseQuery(theLink.Query);

            if (!queryValues.TryGetValue(SemesterVariable, out values))
            {
                // no semester in the URL: make a best guess according to the current date
                values = GuessSemester(DateTimeOffset.Now);
            }
            if (values.Count == 0)
            {
                return null;
            }
            string semester = values;

            if (!queryValues.TryGetValue(CourseNumberVariable, out values))
            {
                return null;
            }
            if (values.Count == 0)
            {
                return null;
            }
            string courseNr = values;

            Uri tissCoursesApiUri = new Uri(string.Format(
                                                TissCoursesApiPattern,
                                                Uri.EscapeDataString(courseNr),
                                                Uri.EscapeDataString(semester)
                                                ));

            XDocument doc;

            try
            {
                try
                {
                    // the client is created per call, so release it (and its connections) when done
                    using (var client = new HttpClient { Timeout = TimeSpan.FromSeconds(LinkInfoConfig.TimeoutSeconds) })
                    using (var request = new HttpRequestMessage(HttpMethod.Get, tissCoursesApiUri))
                    {
                        request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);

                        using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                        using (Stream responseStream = response.Content.ReadAsStreamAsync().SyncWait())
                        {
                            doc = XDocument.Load(responseStream);
                        }
                    }
                }
                catch (AggregateException ex) when (ex.InnerException is TaskCanceledException)
                {
                    // timed out
                    return new LinkAndInfo(theLink, "TISS course (detail fetching timed out)", FetchErrorLevel.TransientError);
                }

                // a missing element makes Element() return null; the resulting exception is
                // handled by the catch-all below
                XElement courseElement = doc
                                         .Element(CourseNamespace + "tuvienna")
                                         .Element(CourseNamespace + "course");
                string realCourseNumber = courseElement
                                          .Element(CourseNamespace + "courseNumber")
                                          .Value;
                string courseType = courseElement
                                    .Element(CourseNamespace + "courseType")
                                    .Value;
                string title = courseElement
                               .Element(CourseNamespace + "title")
                               .Element(LangNamespace + "de")
                               .Value;

                // insert a dot after the first three characters of the course number
                string formattedCourseNumber = realCourseNumber.Substring(0, 3) + "." + realCourseNumber.Substring(3);

                return new LinkAndInfo(
                           theLink, $"TISS course: {formattedCourseNumber} {courseType} {title}", FetchErrorLevel.Success
                           );
            }
            catch (Exception ex)
            {
                // log under this resolver's own name so that failures are attributed correctly
                Logger.LogWarning("TISS course info: {Exception}", ex);
                return new LinkAndInfo(theLink, "TISS course (exception thrown)", FetchErrorLevel.TransientError);
            }
        }

        /// <summary>
        /// Guesses the semester identifier (year followed by <c>W</c> or <c>S</c>) for a date.
        /// </summary>
        /// <remarks>
        /// Until mid-February it is the previous year's winter semester, until mid-August the
        /// current year's summer semester, and otherwise the current year's winter semester.
        /// </remarks>
        static string GuessSemester(DateTimeOffset now)
        {
            if (now.Month < 2 || (now.Month == 2 && now.Day < 15))
            {
                return $"{now.Year-1}W";
            }
            if (now.Month < 8 || (now.Month == 8 && now.Day < 15))
            {
                return $"{now.Year}S";
            }
            return $"{now.Year}W";
        }
Example #6
0
        /// <summary>
        /// Fetches a link with an HTTP GET request and produces a short description of what it
        /// points to, following redirects manually and consulting the registered resolver
        /// plugins before falling back to a description based on the content type.
        /// </summary>
        /// <param name="link">The link to fetch.</param>
        /// <param name="originalLink">
        /// The link the redirect chain started from; the recursive call passes
        /// <c>originalLink ?? link</c>, so it is <c>null</c> only while no redirect has been followed.
        /// </param>
        /// <param name="redirectCount">The number of redirects followed so far.</param>
        /// <returns>
        /// The first non-null plugin result, a content-type-based fallback description, or an
        /// error description (too many redirections, unresolvable or blacklisted host, HTTP
        /// error status, read timeout, oversized download).
        /// </returns>
        public virtual LinkAndInfo RealObtainLinkInfo([NotNull] Uri link, [CanBeNull] Uri originalLink = null, int redirectCount = 0)
        {
            // stop following redirects once the configured limit is exceeded
            if (redirectCount > Config.MaxRedirects)
            {
                return(new LinkAndInfo(link, "(too many redirections)", FetchErrorLevel.TransientError, originalLink));
            }

            var linkBuilder = new UriBuilder(link);

            // the fragment is dropped from the URI that is actually requested
            linkBuilder.Fragment = "";

            // check URL blacklist
            IPAddress[] addresses;
            try
            {
                // convert the host name to its ASCII form before resolving it
                linkBuilder.Host = IDNMapping.GetAscii(link.Host);
                addresses        = Dns.GetHostAddressesAsync(linkBuilder.Host).SyncWait();
            }
            // NOTE(review): whether SyncWait surfaces a SocketException directly or wrapped in
            // an AggregateException is not visible here -- confirm this handler is reachable.
            catch (SocketException se)
            {
                Logger.LogWarning("socket exception when resolving {Host}: {Exception}", linkBuilder.Host, se);
                return(new LinkAndInfo(link, "(cannot resolve)", FetchErrorLevel.TransientError, originalLink));
            }

            if (addresses.Length == 0)
            {
                Logger.LogWarning("no addresses found when resolving {Host}", linkBuilder.Host);
                return(new LinkAndInfo(link, "(cannot resolve)", FetchErrorLevel.TransientError, originalLink));
            }
            // a single blacklisted address among the resolved ones is enough to refuse
            if (addresses.Any(IPAddressBlacklist.IsIPAddressBlacklisted))
            {
                return(new LinkAndInfo(link, "(I refuse to access this IP address)", FetchErrorLevel.LastingError, originalLink));
            }

            // automatic redirects are disabled; each redirect target goes through this method
            // again and therefore through the checks above
            var httpClientHandler = new HttpClientHandler
            {
                AllowAutoRedirect = false
            };

            using (httpClientHandler)
                using (var httpClient = new HttpClient(httpClientHandler))
                    using (var request = new HttpRequestMessage(HttpMethod.Get, linkBuilder.Uri))
                        using (var respStore = new MemoryStream())
                        {
                            // used when the response does not declare a content type
                            var contentType = new MediaTypeHeaderValue("application/octet-stream");

                            httpClient.Timeout = TimeSpan.FromSeconds(Config.TimeoutSeconds);
                            request.Headers.UserAgent.TryParseAdd(Config.FakeUserAgent);
                            request.Headers.AcceptLanguage.TryParseAdd(Config.AcceptLanguage);

                            // only the headers are awaited here; the body is streamed below
                            // NOTE(review): this call sits outside the try block, so exceptions
                            // thrown while sending are not handled by the catch below.
                            using (var resp = httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                            {
                                try
                                {
                                    // redirect?
                                    Uri location = resp.Headers.Location;
                                    if (location != null)
                                    {
                                        // go there instead
                                        Logger.LogDebug(
                                            "{AbsoluteURI} (originally {OriginalAbsoluteURI}) redirects to {Location}",
                                            link.AbsoluteUri, originalLink?.AbsoluteUri ?? link.AbsoluteUri, location
                                            );
                                        // the location is resolved relative to the current link
                                        return(RealObtainLinkInfo(new Uri(link, location), originalLink ?? link, redirectCount + 1));
                                    }

                                    // success?
                                    if (!resp.IsSuccessStatusCode)
                                    {
                                        // caught below and reported with the HTTP status code
                                        throw new HttpRequestException("unsuccessful");
                                    }

                                    // find the content-type
                                    contentType = resp.Content.Headers.ContentType ?? contentType;

                                    // start timing
                                    var readTimeout = TimeSpan.FromSeconds(Config.TimeoutSeconds);
                                    var timer       = new Stopwatch();
                                    timer.Start();

                                    // copy
                                    var    buf            = new byte[DownloadBufferSize];
                                    Stream responseStream = resp.Content.ReadAsStreamAsync().SyncWait();
                                    if (responseStream.CanTimeout)
                                    {
                                        responseStream.ReadTimeout = (int)readTimeout.TotalMilliseconds;
                                    }
                                    long totalBytesRead = 0;
                                    for (;;)
                                    {
                                        int bytesRead = responseStream.Read(buf, 0, buf.Length);
                                        if (bytesRead == 0)
                                        {
                                            // end of the response body
                                            break;
                                        }
                                        totalBytesRead += bytesRead;
                                        // limit the total time spent reading the body
                                        if (timer.Elapsed > readTimeout)
                                        {
                                            return(new LinkAndInfo(link, "(reading timed out)", FetchErrorLevel.TransientError, originalLink));
                                        }
                                        // checked before the chunk is stored, so the stored
                                        // data never exceeds the limit
                                        if (totalBytesRead > Config.MaxDownloadSizeBytes)
                                        {
                                            return(new LinkAndInfo(link, "(file too large)", FetchErrorLevel.LastingError, originalLink));
                                        }
                                        respStore.Write(buf, 0, bytesRead);
                                    }
                                }
                                catch (HttpRequestException we)
                                {
                                    // NOTE(review): resp is the variable of the enclosing using
                                    // statement, so this test looks like it is always true and
                                    // the code after it unreachable -- confirm.
                                    if (resp != null)
                                    {
                                        return(new LinkAndInfo(link, $"(HTTP {resp.StatusCode})", FetchErrorLevel.TransientError, originalLink));
                                    }
                                    Logger.LogWarning("HTTP exception thrown: {Exception}", we);
                                    return(new LinkAndInfo(link, "(HTTP error)", FetchErrorLevel.TransientError, originalLink));
                                }
                            }

                            // offer the downloaded content to the plugins in order; the first
                            // plugin returning a non-null result decides the outcome
                            var linkToResolve = new LinkToResolve(link, originalLink, respStore.ToArray(), contentType);
                            foreach (ILinkResolverPlugin plugin in Plugins)
                            {
                                LinkAndInfo ret = plugin.ResolveLink(linkToResolve);
                                if (ret != null)
                                {
                                    return(ret);
                                }
                            }

                            // fallback: no plugin handled the link, so describe it by media type
                            switch (contentType.MediaType)
                            {
                            case "application/octet-stream":
                                return(new LinkAndInfo(link, "(can't figure out the content type, sorry)", FetchErrorLevel.LastingError, originalLink));

                            case "text/html":
                            case "application/xhtml+xml":
                                return(new LinkAndInfo(link, "HTML", FetchErrorLevel.Success, originalLink));

                            case "image/png":
                                return(new LinkAndInfo(link, "PNG image", FetchErrorLevel.Success, originalLink));

                            case "image/jpeg":
                                return(new LinkAndInfo(link, "JPEG image", FetchErrorLevel.Success, originalLink));

                            case "image/gif":
                                return(new LinkAndInfo(link, "GIF image", FetchErrorLevel.Success, originalLink));

                            case "application/json":
                                return(new LinkAndInfo(link, "JSON", FetchErrorLevel.Success, originalLink));

                            case "text/xml":
                            case "application/xml":
                                return(new LinkAndInfo(link, "XML", FetchErrorLevel.Success, originalLink));

                            default:
                                return(new LinkAndInfo(link, $"file of type {contentType.MediaType}", FetchErrorLevel.Success, originalLink));
                            }
                        }
        }