/// <summary>
/// Resolves an HTML or XHTML response to the text of its <c>title</c> element,
/// falling back to the first <c>h1</c> element if there is no title.
/// </summary>
/// <param name="link">The fetched link, including response bytes and content type.</param>
/// <returns>
/// <c>null</c> if the response is not (X)HTML (or carries no content type); otherwise a
/// successful result containing the title text, the heading text, or a placeholder string
/// if the document has neither.
/// </returns>
public LinkAndInfo ResolveLink(LinkToResolve link)
{
    // a response without a content type cannot be identified as HTML; decline instead of throwing
    string mediaType = link.ContentType?.MediaType;
    if (mediaType != "text/html" && mediaType != "application/xhtml+xml")
    {
        return null;
    }

    // HTML? decode and parse it
    var respStr = EncodingGuesser.GuessEncodingAndDecode(link.ResponseBytes, link.ContentType);
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(respStr);

    // prefer the <title> element; fall back to the first <h1>
    var captionElement = htmlDoc.DocumentNode.SelectSingleNode(".//title")
        ?? htmlDoc.DocumentNode.SelectSingleNode(".//h1");
    if (captionElement != null)
    {
        // decode HTML entities, then pass the text through FoldWhitespace and trim it
        string caption = FoldWhitespace(HtmlEntity.DeEntitize(captionElement.InnerText)).Trim();
        return link.ToResult(FetchErrorLevel.Success, caption);
    }

    return link.ToResult(FetchErrorLevel.Success, "(HTML without a title O_o)");
}
/// <summary>
/// Handles links whose media type is listed in <c>DetectedMimeTypes</c>.
/// </summary>
/// <param name="link">The fetched link, including its content type.</param>
/// <returns>
/// <c>null</c> if the content type is missing or not listed in <c>DetectedMimeTypes</c>;
/// otherwise a successful result whose text is produced by <c>ResolveLinkText</c>.
/// </returns>
public LinkAndInfo ResolveLink(LinkToResolve link)
{
    // no media type at all: nothing to look up
    if (link.ContentType?.MediaType == null)
    {
        return null;
    }

    string typeDescription;
    bool isKnownType = DetectedMimeTypes.TryGetValue(link.ContentType.MediaType, out typeDescription);
    if (!isKnownType)
    {
        // not a media type this resolver handles
        return null;
    }

    return link.ToResult(FetchErrorLevel.Success, ResolveLinkText(link, typeDescription));
}
/// <summary>
/// Resolves links matching <c>Z0rUrlPattern</c> to a formatted z0r entry.
/// </summary>
/// <param name="link">The link to resolve.</param>
/// <returns>
/// <c>null</c> if the URL does not match the pattern or its ID cannot be parsed as a
/// <c>long</c>; a successful result with the formatted entry if the entry is cached, lies
/// beyond the last index page, or can be loaded; otherwise a transient-error result.
/// </returns>
public LinkAndInfo ResolveLink(LinkToResolve link)
{
    Match urlMatch = Z0rUrlPattern.Match(link.Link.AbsoluteUri);
    if (!urlMatch.Success)
    {
        // not a URL this resolver handles
        return null;
    }

    // extract the numeric ID; digits only, culture-independent
    long z0rID;
    bool idParsed = long.TryParse(
        urlMatch.Groups["id"].Value,
        NumberStyles.None,
        CultureInfo.InvariantCulture,
        out z0rID
    );
    if (!idParsed)
    {
        // unparseable ID, probably too many digits
        return null;
    }

    // already cached? answer immediately
    Z0rEntry foundEntry;
    if (EntryCache.TryGetValue(z0rID, out foundEntry))
    {
        return link.ToResult(FetchErrorLevel.Success, FormatEntry(foundEntry));
    }

    Z0rRange idRange = RangeForID(z0rID);

    // make sure the number of the last index page is known
    if (!MaxPage.HasValue)
    {
        MaxPage = ObtainMaxPageValue();

        if (!MaxPage.HasValue)
        {
            string indexFailure = string.Format(
                CultureInfo.InvariantCulture,
                "z0r #{0}; fetching index page list failed",
                z0rID
            );
            return link.ToResult(FetchErrorLevel.TransientError, indexFailure);
        }
    }

    if (range_IsBeyondIndex(idRange))
    {
        // the index does not contain this page; format an entry that carries only the ID
        var bareEntry = new Z0rEntry(z0rID, null, null, null, null);
        return link.ToResult(FetchErrorLevel.Success, FormatEntry(bareEntry));
    }

    // load the index page covering this ID, then consult the cache again
    LoadFromPage(idRange.Page);
    if (EntryCache.TryGetValue(z0rID, out foundEntry))
    {
        return link.ToResult(FetchErrorLevel.Success, FormatEntry(foundEntry));
    }

    string fetchFailure = string.Format(
        CultureInfo.InvariantCulture,
        "z0r #{0}; fetching failed",
        z0rID
    );
    return link.ToResult(FetchErrorLevel.TransientError, fetchFailure);

    // true if the page holding this range lies past the last known index page
    bool range_IsBeyondIndex(Z0rRange range)
    {
        return range.Page > MaxPage;
    }
}
/// <summary>
/// Builds the description text for a link by querying Google's search-by-image page.
/// </summary>
/// <param name="link">The link whose absolute URI is passed to the image search.</param>
/// <param name="typeDescription">The base description; returned unchanged if no hint is found.</param>
/// <returns>
/// <c>"{typeDescription} ({hint})"</c>, where the hint is the entity-decoded inner text of the
/// first element with class <c>_gUb</c> nested inside an element with class <c>_hUb</c> in the
/// result page; or just <paramref name="typeDescription"/> if no such element exists, the
/// requests time out, or any other exception occurs (the latter is logged as a warning).
/// </returns>
string ResolveLinkText(LinkToResolve link, string typeDescription)
{
    try
    {
        byte[] responseBytes;

        // dispose the client once both requests are done instead of leaking it
        using (var client = new HttpClient { Timeout = TimeSpan.FromSeconds(Config.ImageInfoTimeoutSeconds) })
        {
            var googleImageSearchUrl = new Uri(string.Format(GoogleImageSearchUrlPattern, LinkInfoConfig.GoogleDomain));

            // alibi-visit the image search page to get the cookies
            using (var request = new HttpRequestMessage(HttpMethod.Get, googleImageSearchUrl))
            {
                request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                request.Headers.Referrer = new Uri(string.Format(GoogleHomepageUrlPattern, LinkInfoConfig.GoogleDomain));

                using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                {
                    // the body is read and discarded
                    response.Content.ReadAsByteArrayAsync().SyncWait();
                }
            }

            // fetch the actual info
            // NOTE(review): the image URL is inserted into the pattern unescaped -- confirm the
            // pattern expects a raw URL
            var searchUrl = new Uri(string.Format(
                GoogleImageSearchByImageUrlPattern,
                LinkInfoConfig.GoogleDomain,
                link.Link.AbsoluteUri
            ));
            using (var request = new HttpRequestMessage(HttpMethod.Get, searchUrl))
            {
                request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);
                request.Headers.Referrer = googleImageSearchUrl;

                using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                {
                    responseBytes = response.Content.ReadAsByteArrayAsync().SyncWait();
                }
            }
        }

        var parseMe = EncodingGuesser.GuessEncodingAndDecode(responseBytes, null);

        // optionally dump the raw response bytes to a file in the application directory
        if (Config.DumpImageResultsFileName != null)
        {
            using (var dumpy = File.Open(Path.Combine(SharpIrcBotUtil.AppDirectory, Config.DumpImageResultsFileName), FileMode.Create, FileAccess.Write))
            {
                dumpy.Write(responseBytes, 0, responseBytes.Length);
            }
        }

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(parseMe);

        // first "_gUb" element inside any "_hUb" element, in document order
        HtmlNode hint = DescendantElementsWithClass(htmlDoc.DocumentNode, "_hUb")
            .SelectMany(hub => DescendantElementsWithClass(hub, "_gUb"))
            .FirstOrDefault();
        if (hint != null)
        {
            return string.Format("{0} ({1})", typeDescription, HtmlEntity.DeEntitize(hint.InnerText));
        }

        return typeDescription;
    }
    catch (AggregateException ex) when (ex.InnerException is TaskCanceledException)
    {
        // timed out
        return typeDescription;
    }
    catch (Exception ex)
    {
        Logger.LogWarning("image info: {Exception}", ex);
        return typeDescription;
    }
}

/// <summary>
/// Returns the descendant elements of <paramref name="root"/> whose space-separated
/// <c>class</c> attribute contains <paramref name="className"/>; empty if there are none.
/// </summary>
private static IEnumerable<HtmlNode> DescendantElementsWithClass(HtmlNode root, string className)
{
    // SelectNodes returns null (not an empty collection) when nothing matches
    var descendants = root.SelectNodes(".//*");
    if (descendants == null)
    {
        return Enumerable.Empty<HtmlNode>();
    }

    return descendants
        .OfType<HtmlNode>()
        .Where(n => n.GetAttributeValue("class", "").Split(' ').Contains(className));
}
/// <summary>
/// Resolves TISS course links (education-details or course-details pages on
/// <c>TissHostname</c>) to the course number, type and German title obtained from the
/// TISS courses API.
/// </summary>
/// <param name="link">The link to resolve; its original (pre-redirect) URL is inspected if available.</param>
/// <returns>
/// <c>null</c> if the link is not an HTTP(S) TISS course link or lacks a course number;
/// a successful result describing the course; or a transient-error result if fetching
/// timed out or any other exception was thrown (the latter is logged as a warning).
/// </returns>
public LinkAndInfo ResolveLink(LinkToResolve link)
{
    Uri theLink = link.OriginalLinkOrLink;

    if (theLink.Scheme != "http" && theLink.Scheme != "https")
    {
        return null;
    }
    if (theLink.Host != TissHostname)
    {
        return null;
    }
    if (theLink.AbsolutePath != EducationDetailsPath && theLink.AbsolutePath != CourseDetailsPath)
    {
        return null;
    }

    IDictionary<string, StringValues> queryValues = QueryHelpers.ParseQuery(theLink.Query);

    // semester: taken from the query string, or guessed from today's date if absent
    StringValues values;
    if (!queryValues.TryGetValue(SemesterVariable, out values))
    {
        values = GuessSemester(DateTimeOffset.Now);
    }
    if (values.Count == 0)
    {
        return null;
    }
    string semester = values;

    // course number: mandatory
    if (!queryValues.TryGetValue(CourseNumberVariable, out values))
    {
        return null;
    }
    if (values.Count == 0)
    {
        return null;
    }
    string courseNr = values;

    Uri tissCoursesApiUri = new Uri(string.Format(
        TissCoursesApiPattern,
        Uri.EscapeDataString(courseNr),
        Uri.EscapeDataString(semester)
    ));

    try
    {
        XDocument doc;
        try
        {
            // dispose the client once the request is done instead of leaking it
            using (var client = new HttpClient { Timeout = TimeSpan.FromSeconds(LinkInfoConfig.TimeoutSeconds) })
            using (var request = new HttpRequestMessage(HttpMethod.Get, tissCoursesApiUri))
            {
                request.Headers.UserAgent.TryParseAdd(LinkInfoConfig.FakeUserAgent);

                using (var response = client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
                using (Stream responseStream = response.Content.ReadAsStreamAsync().SyncWait())
                {
                    doc = XDocument.Load(responseStream);
                }
            }
        }
        catch (AggregateException ex) when (ex.InnerException is TaskCanceledException)
        {
            // timed out
            return new LinkAndInfo(theLink, "TISS course (detail fetching timed out)", FetchErrorLevel.TransientError);
        }

        // a missing element below surfaces as an exception and is handled by the outer catch
        XElement courseElement = doc
            .Element(CourseNamespace + "tuvienna")
            .Element(CourseNamespace + "course");
        string realCourseNumber = courseElement
            .Element(CourseNamespace + "courseNumber")
            .Value;
        string courseType = courseElement
            .Element(CourseNamespace + "courseType")
            .Value;
        string title = courseElement
            .Element(CourseNamespace + "title")
            .Element(LangNamespace + "de")
            .Value;

        // insert a dot after the first three characters of the course number
        string formattedCourseNumber = realCourseNumber.Substring(0, 3) + "." + realCourseNumber.Substring(3);

        return new LinkAndInfo(
            theLink,
            $"TISS course: {formattedCourseNumber} {courseType} {title}",
            FetchErrorLevel.Success
        );
    }
    catch (Exception ex)
    {
        // the message prefix previously read "image info", a leftover from another resolver
        Logger.LogWarning("TISS course info: {Exception}", ex);
        return new LinkAndInfo(theLink, "TISS course (exception thrown)", FetchErrorLevel.TransientError);
    }
}

/// <summary>
/// Makes a best guess at the semester identifier for the given date:
/// until mid-February it is the previous year's winter semester (ends with January, next
/// starts in March); until mid-August it is the current year's summer semester (ends with
/// July, next starts in October); otherwise it is the current year's winter semester.
/// </summary>
private static string GuessSemester(DateTimeOffset now)
{
    if (now.Month < 2 || (now.Month == 2 && now.Day < 15))
    {
        return $"{now.Year-1}W";
    }
    if (now.Month < 8 || (now.Month == 8 && now.Day < 15))
    {
        return $"{now.Year}S";
    }
    return $"{now.Year}W";
}
/// <summary>
/// Fetches a link via HTTP GET and produces a description of what it points to.
/// Redirects are followed manually (by recursion) up to <c>Config.MaxRedirects</c> times,
/// hosts resolving to a blacklisted IP address are refused, and at most
/// <c>Config.MaxDownloadSizeBytes</c> bytes of the response are downloaded. The downloaded
/// response is offered to each plugin in <c>Plugins</c>; if none handles it, a generic
/// description based on the content type is returned.
/// </summary>
/// <param name="link">The URL to fetch.</param>
/// <param name="originalLink">
/// The URL that was originally requested if <paramref name="link"/> was reached through
/// redirects; <c>null</c> on the initial call.
/// </param>
/// <param name="redirectCount">How many redirects have been followed so far.</param>
/// <returns>The link together with its description and an error level.</returns>
public virtual LinkAndInfo RealObtainLinkInfo([NotNull] Uri link, [CanBeNull] Uri originalLink = null, int redirectCount = 0)
{
    // hyperrecursion? stop once more redirects were followed than the configuration allows
    if (redirectCount > Config.MaxRedirects)
    {
        return(new LinkAndInfo(link, "(too many redirections)", FetchErrorLevel.TransientError, originalLink));
    }

    // the request goes to a copy of the link with the fragment removed
    var linkBuilder = new UriBuilder(link);
    linkBuilder.Fragment = "";

    // check URL blacklist: resolve the (ASCII-converted) host name to its addresses
    IPAddress[] addresses;
    try
    {
        linkBuilder.Host = IDNMapping.GetAscii(link.Host);
        addresses = Dns.GetHostAddressesAsync(linkBuilder.Host).SyncWait();
    }
    catch (SocketException se)
    {
        // NOTE(review): this only fires if SyncWait surfaces the SocketException unwrapped -- confirm
        Logger.LogWarning("socket exception when resolving {Host}: {Exception}", linkBuilder.Host, se);
        return(new LinkAndInfo(link, "(cannot resolve)", FetchErrorLevel.TransientError, originalLink));
    }

    if (addresses.Length == 0)
    {
        Logger.LogWarning("no addresses found when resolving {Host}", linkBuilder.Host);
        return(new LinkAndInfo(link, "(cannot resolve)", FetchErrorLevel.TransientError, originalLink));
    }

    // a single blacklisted address among the resolved ones is enough to refuse
    if (addresses.Any(IPAddressBlacklist.IsIPAddressBlacklisted))
    {
        return(new LinkAndInfo(link, "(I refuse to access this IP address)", FetchErrorLevel.LastingError, originalLink));
    }

    // automatic redirects are disabled; redirects are handled by the recursive call below
    var httpClientHandler = new HttpClientHandler
    {
        AllowAutoRedirect = false
    };

    using (httpClientHandler)
    using (var httpClient = new HttpClient(httpClientHandler))
    using (var request = new HttpRequestMessage(HttpMethod.Get, linkBuilder.Uri))
    using (var respStore = new MemoryStream())
    {
        // default content type, used if the response does not declare one
        var contentType = new MediaTypeHeaderValue("application/octet-stream");

        httpClient.Timeout = TimeSpan.FromSeconds(Config.TimeoutSeconds);
        request.Headers.UserAgent.TryParseAdd(Config.FakeUserAgent);
        request.Headers.AcceptLanguage.TryParseAdd(Config.AcceptLanguage);

        // only the headers are awaited here; the body is streamed in the loop below.
        // NOTE(review): SendAsync is invoked outside the try block, so failures of the request
        // itself are not handled by its catch clause -- confirm the caller deals with them
        using (var resp = httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead).SyncWait())
        {
            try
            {
                // redirect? (any response carrying a Location header is treated as one)
                Uri location = resp.Headers.Location;
                if (location != null)
                {
                    // go there instead
                    Logger.LogDebug(
                        "{AbsoluteURI} (originally {OriginalAbsoluteURI}) redirects to {Location}",
                        link.AbsoluteUri, originalLink?.AbsoluteUri ?? link.AbsoluteUri, location
                    );
                    // the location is resolved relative to the current link; the first link in
                    // the chain is kept as the original link
                    return(RealObtainLinkInfo(new Uri(link, location), originalLink ?? link, redirectCount + 1));
                }

                // success? if not, throw so that the catch clause below reports the status code
                if (!resp.IsSuccessStatusCode)
                {
                    throw new HttpRequestException("unsuccessful");
                }

                // find the content-type
                contentType = resp.Content.Headers.ContentType ?? contentType;

                // start timing
                var readTimeout = TimeSpan.FromSeconds(Config.TimeoutSeconds);
                var timer = new Stopwatch();
                timer.Start();

                // copy the response body into respStore, chunk by chunk
                var buf = new byte[DownloadBufferSize];
                Stream responseStream = resp.Content.ReadAsStreamAsync().SyncWait();
                if (responseStream.CanTimeout)
                {
                    responseStream.ReadTimeout = (int)readTimeout.TotalMilliseconds;
                }
                long totalBytesRead = 0;
                for (;;)
                {
                    int bytesRead = responseStream.Read(buf, 0, buf.Length);
                    if (bytesRead == 0)
                    {
                        // end of stream
                        break;
                    }
                    totalBytesRead += bytesRead;

                    // the overall time limit is checked after each completed read
                    if (timer.Elapsed > readTimeout)
                    {
                        return(new LinkAndInfo(link, "(reading timed out)", FetchErrorLevel.TransientError, originalLink));
                    }

                    // the size limit is checked before the chunk is stored
                    if (totalBytesRead > Config.MaxDownloadSizeBytes)
                    {
                        return(new LinkAndInfo(link, "(file too large)", FetchErrorLevel.LastingError, originalLink));
                    }
                    respStore.Write(buf, 0, bytesRead);
                }
            }
            catch (HttpRequestException we)
            {
                // NOTE(review): resp comes from the enclosing using statement and is dereferenced
                // above, so this null check looks always true and the code after it unreachable
                if (resp != null)
                {
                    return(new LinkAndInfo(link, $"(HTTP {resp.StatusCode})", FetchErrorLevel.TransientError, originalLink));
                }
                Logger.LogWarning("HTTP exception thrown: {Exception}", we);
                return(new LinkAndInfo(link, "(HTTP error)", FetchErrorLevel.TransientError, originalLink));
            }
        }

        // offer the downloaded response to the plugins; the first non-null result wins
        var linkToResolve = new LinkToResolve(link, originalLink, respStore.ToArray(), contentType);
        foreach (ILinkResolverPlugin plugin in Plugins)
        {
            LinkAndInfo ret = plugin.ResolveLink(linkToResolve);
            if (ret != null)
            {
                return(ret);
            }
        }

        // fallback: describe the link by its media type alone
        switch (contentType.MediaType)
        {
            case "application/octet-stream":
                return(new LinkAndInfo(link, "(can't figure out the content type, sorry)", FetchErrorLevel.LastingError, originalLink));
            case "text/html":
            case "application/xhtml+xml":
                return(new LinkAndInfo(link, "HTML", FetchErrorLevel.Success, originalLink));
            case "image/png":
                return(new LinkAndInfo(link, "PNG image", FetchErrorLevel.Success, originalLink));
            case "image/jpeg":
                return(new LinkAndInfo(link, "JPEG image", FetchErrorLevel.Success, originalLink));
            case "image/gif":
                return(new LinkAndInfo(link, "GIF image", FetchErrorLevel.Success, originalLink));
            case "application/json":
                return(new LinkAndInfo(link, "JSON", FetchErrorLevel.Success, originalLink));
            case "text/xml":
            case "application/xml":
                return(new LinkAndInfo(link, "XML", FetchErrorLevel.Success, originalLink));
            default:
                return(new LinkAndInfo(link, $"file of type {contentType.MediaType}", FetchErrorLevel.Success, originalLink));
        }
    }
}