/// <summary>
/// Tries to enrich an image description with a hint scraped from the
/// search-by-image results page for the given image URL.
/// </summary>
/// <param name="url">URL of the image to look up.</param>
/// <param name="text">
/// Base description (for example "PNG image"). It is returned unchanged when no
/// hint is found or when anything goes wrong during the lookup.
/// </param>
/// <returns>
/// <paramref name="text"/> followed by the first hint in parentheses, or just
/// <paramref name="text"/> if there is no hint or an exception occurred.
/// </returns>
public static string ObtainImageInfo(Uri url, string text) {
    try {
        byte[] responseBytes;

        // The client is only needed for the two downloads; dispose it right
        // afterwards so its resources are not left to the finalizer.
        using (var client = new CookieWebClient()) {
            client.Headers[HttpRequestHeader.UserAgent] = FakeUserAgent;

            // alibi-visit the image search page to get the cookies
            client.Headers[HttpRequestHeader.Referer] = GoogleHomepageUrl.ToString();
            client.DownloadData(GoogleImageSearchUrl);

            // fetch the actual info
            var searchUrl = new Uri(string.Format(
                GoogleImageSearchByImageUrlPattern,
                Util.UrlEncode(url.ToString(), Util.Utf8NoBom, true)
            ));
            client.Headers[HttpRequestHeader.Referer] = GoogleImageSearchUrl.ToString();
            responseBytes = client.DownloadData(searchUrl);
        }

        // No charset hints are passed here; the decoder has to guess from the bytes.
        var parseMe = EncodingGuesser.GuessEncodingAndDecode(responseBytes, null, null);
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(parseMe);

        var foundHints = htmlDoc.DocumentNode.QuerySelectorAll(".qb-bmqc .qb-b");
        foreach (var hint in foundHints) {
            // only the first matching element is used
            return(string.Format("{0} ({1})", text, HtmlEntity.DeEntitize(hint.InnerText)));
        }

        return(text);
    } catch (Exception ex) {
        // Best effort: a failed lookup must not break the caller, so log it
        // and fall back to the plain description.
        Logger.Warn("image info", ex);
        return(text);
    }
}
/// <summary>
/// Fetches a link and returns a short, human-readable description of what it
/// points to: the page title (or first heading) for HTML, an image description
/// for PNG/JPEG/GIF, a fixed label for JSON/XML, or the raw media type otherwise.
/// </summary>
/// <param name="link">The link to inspect. Only HTTP and HTTPS URLs are fetched.</param>
/// <returns>
/// The description, or a parenthesized message explaining why the link was not
/// described (unsupported scheme, unresolvable host, blacklisted address,
/// HTTP error, unknown content type, HTML without a title).
/// </returns>
public static string RealObtainLinkInfo(Uri link) {
    // Ordinal, case-insensitive prefix check: avoids lowercasing the whole URL
    // and is not affected by the current culture.
    var linkString = link.ToString();
    if (!linkString.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            && !linkString.StartsWith("https://", StringComparison.OrdinalIgnoreCase)) {
        return("(I only access HTTP and HTTPS URLs)");
    }

    // check URL blacklist
    IPAddress[] addresses;
    try {
        addresses = Dns.GetHostAddresses(link.Host);
    } catch (System.Net.Sockets.SocketException) {
        // an unresolvable host is reported by exception, not by an empty array
        return("(cannot resolve)");
    }
    if (addresses.Length == 0) {
        return("(cannot resolve)");
    }
    if (addresses.Any(IPAddressBlacklist.IsIPAddressBlacklisted)) {
        return("(I refuse to access this IP address)");
    }

    // NOTE(review): the request below resolves the host again and may follow
    // redirects, so the blacklist check above does not cover the address that
    // is finally contacted; the response body is also buffered without a size
    // limit. Both deserve hardening if links come from untrusted users.
    var request = WebRequest.Create(link);
    request.Timeout = 5000;

    using (var respStore = new MemoryStream()) {
        var contentType = "application/octet-stream";
        string contentTypeHeader = null;
        string responseCharacterSet = null;

        try {
            // dispose the response (and its stream) so the connection is released
            using (var resp = request.GetResponse()) {
                // find the content-type
                contentTypeHeader = resp.Headers[HttpResponseHeader.ContentType];
                if (contentTypeHeader != null) {
                    // Media types are case-insensitive and may be followed by
                    // whitespace before the parameters; normalize before matching.
                    var mediaType = contentTypeHeader.Split(';')[0].Trim().ToLowerInvariant();
                    if (mediaType.Length > 0) {
                        contentType = mediaType;
                    }
                }

                var webResp = resp as HttpWebResponse;
                responseCharacterSet = (webResp != null) ? webResp.CharacterSet : null;

                // copy
                using (var respStream = resp.GetResponseStream()) {
                    respStream.CopyTo(respStore);
                }
            }
        } catch (WebException we) {
            // the error response holds a connection too; dispose it after reading the status
            using (var errorResponse = we.Response) {
                var httpResponse = errorResponse as HttpWebResponse;
                return(string.Format("(HTTP {0})", httpResponse != null ? httpResponse.StatusCode.ToString() : "error"));
            }
        }

        switch (contentType) {
            case "application/octet-stream":
                return("(can't figure out the content type, sorry)");
            case "text/html":
            case "application/xhtml+xml":
                // HTML? parse it and get the title
                var respStr = EncodingGuesser.GuessEncodingAndDecode(respStore.ToArray(), responseCharacterSet, contentTypeHeader);
                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(respStr);
                var titleElement = htmlDoc.DocumentNode.SelectSingleNode(".//title");
                if (titleElement != null) {
                    return(HtmlEntity.DeEntitize(titleElement.InnerText));
                }
                // no <title>: fall back to the first top-level heading
                var h1Element = htmlDoc.DocumentNode.SelectSingleNode(".//h1");
                if (h1Element != null) {
                    return(HtmlEntity.DeEntitize(h1Element.InnerText));
                }
                return("(HTML without a title O_o)");
            case "image/png":
                return(ObtainImageInfo(link, "PNG image"));
            case "image/jpeg":
                return(ObtainImageInfo(link, "JPEG image"));
            case "image/gif":
                return(ObtainImageInfo(link, "GIF image"));
            case "application/json":
                return("JSON");
            case "text/xml":
            case "application/xml":
                return("XML");
            default:
                return(string.Format("file of type {0}", contentType));
        }
    }
}