/// <summary>
/// Filters <paramref name="items"/> down to the elements whose image size is accepted by
/// <c>Helper.CheckImageSize</c> when compared against <paramref name="minSize"/>.
/// </summary>
/// <param name="items">Image elements to examine; their sizes are obtained via <c>GetImageSizes</c>.</param>
/// <param name="minSize">Size passed to <c>Helper.CheckImageSize</c> as the reference size.</param>
/// <returns>The accepted elements, in the enumeration order of the size dictionary.</returns>
internal static SomeNodeElement[] GetAllImagesUrlsWithMinSize(SomeNodeElement[] items, System.Drawing.Size minSize)
{
    Dictionary<SomeNodeElement, System.Drawing.Size> sizeByElement = GetImageSizes(items);

    var accepted = new List<SomeNodeElement>();
    foreach (KeyValuePair<SomeNodeElement, System.Drawing.Size> entry in sizeByElement)
    {
        // NOTE(review): the meaning of the trailing 'true' flag is defined by Helper.CheckImageSize - confirm there.
        if (Helper.CheckImageSize(entry.Value, minSize, true))
        {
            accepted.Add(entry.Key);
        }
    }

    return accepted.ToArray();
}
/// <summary>
/// Builds a map from each element in <paramref name="items"/> to its image size.
/// </summary>
/// <remarks>
/// If the element's node has both a "Width" and a "Height" attribute and both parse as integers,
/// the size is taken from those attributes. Otherwise it is obtained from
/// <c>Helper.GetImageSize</c> using the element's absolute URL.
/// An element that occurs more than once in <paramref name="items"/> is measured only on its
/// first occurrence; later occurrences are skipped instead of failing with a duplicate-key error.
/// </remarks>
/// <param name="items">Elements to measure.</param>
/// <returns>A dictionary keyed by element, holding the size determined for it.</returns>
/// <exception cref="ArgumentNullException"><paramref name="items"/> is null.</exception>
internal static Dictionary<SomeNodeElement, System.Drawing.Size> GetImageSizes(SomeNodeElement[] items)
{
    if (items == null)
    {
        throw new ArgumentNullException("items");
    }

    var result = new Dictionary<SomeNodeElement, System.Drawing.Size>(items.Length);
    foreach (var n in items)
    {
        // Dictionary.Add throws on a repeated key; skipping also avoids calling
        // Helper.GetImageSize a second time for the same element.
        if (result.ContainsKey(n))
        {
            continue;
        }

        System.Drawing.Size sz;
        int width, height;
        // Attribute values are machine-readable markup, so parse them culture-independently.
        if (n.Node.HasAttributes
            && n.Node.Attributes.Contains("Width")
            && n.Node.Attributes.Contains("Height")
            && int.TryParse(n.Node.Attributes["Width"].Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out width)
            && int.TryParse(n.Node.Attributes["Height"].Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out height))
        {
            sz = new System.Drawing.Size(width, height);
        }
        else
        {
            // Attributes missing or not plain integers (e.g. "50%"): ask the helper for the size.
            sz = Helper.GetImageSize(n.Url.AbsoluteUri, true);
        }

        result.Add(n, sz);
    }

    return result;
}
/// <summary>
/// Collects candidate image links from a parsed HTML document.
/// </summary>
/// <remarks>
/// Up to four sources are combined, then reduced to one element per distinct <c>Url</c>:
/// <list type="bullet">
/// <item><description><c>src</c> attributes of <c>img</c> elements (when <paramref name="collectIMGTags"/> is true);</description></item>
/// <item><description><c>href</c> attributes of <c>a</c> elements whose value matches one of the image extension patterns (when <paramref name="collectLINKTags"/> is true);</description></item>
/// <item><description>image-looking URLs found by a regular expression in the document's inner HTML (when <paramref name="collectMETATags"/> is true);</description></item>
/// <item><description>any attribute value accepted by <paramref name="additionalAttributeAsLink"/> (when it is not null).</description></item>
/// </list>
/// If nothing is found and the document has no inner text, a single element pointing at
/// <paramref name="responseUrl"/> is returned instead.
/// The whole operation runs inside a log session; any exception is logged and rethrown.
/// </remarks>
/// <param name="document">Parsed HTML document to scan. Must not be null.</param>
/// <param name="responseUrl">URL passed to <c>Helper.GetFullSourceLink</c> together with the document; also used for the fallback element.</param>
/// <param name="collectIMGTags">Whether to scan <c>img</c> elements.</param>
/// <param name="collectLINKTags">Whether to scan <c>a</c> elements.</param>
/// <param name="collectMETATags">Whether to scan the raw inner HTML with the regular expression.</param>
/// <param name="additionalAttributeAsLink">Optional predicate selecting extra attribute values to treat as links; it receives the HTML-decoded attribute value.</param>
/// <returns>The collected elements, one per distinct <c>Url</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
internal static SomeNodeElement[] GetAllImagesUrlsFromUrl(HtmlAgilityPack.HtmlDocument document, string responseUrl, bool collectIMGTags, bool collectLINKTags, bool collectMETATags, Func<string,bool> additionalAttributeAsLink)
{
    if (document == null)
    {
        // The single-string constructor interprets its argument as the parameter NAME,
        // so the (paramName, message) overload is required to carry a message.
        throw new ArgumentNullException("document", "В функции GetAllImagesUrlsFromUrl() не может отсутствовать обязательный параметр document");
    }

    // Decodes HTML character references (named and numeric) in a single pass, so a
    // link copied out of markup becomes the URL it actually denotes.
    var fixFunc = new Func<string, string>(url => System.Net.WebUtility.HtmlDecode(url));

    // Wildcard patterns (interpreted by Helper.StringLikes) for hrefs treated as image links.
    string[] imageLinkPatterns = new string[] { "*.jpeg", "*.jpg", "*.bmp", "*.gif", "*.png" };

    bool wasException = false;
    var logSession = Helpers.Old.Log.SessionStart("Additional.GetAllImagesUrlsFromUrl()", true);
    try
    {
        // Caller-selected attributes: every attribute of every node is offered to the predicate.
        var allAdditionalLinks = additionalAttributeAsLink == null
            ? new SomeNodeElement[] { }
            : document
                .DocumentNode
                .Descendants()
                .SelectMany(n => n.Attributes.Select(a => new { Node = n, AttributeValue = fixFunc(a.Value) }))
                .Where(i => additionalAttributeAsLink(i.AttributeValue))
                .Select(i => new SomeNodeElement() { Node = i.Node, Url = Helper.GetFullSourceLink(i.AttributeValue, document, responseUrl) })
                .ToArray();

        // <img src="..."> elements.
        var allIMGLinks = !collectIMGTags
            ? new SomeNodeElement[] { }
            : document
                .DocumentNode
                .Descendants("img")
                .Where(n => n.Attributes.Contains("src") && Helper.IsWellFormedUriString(n.Attributes["src"].Value, UriKind.RelativeOrAbsolute))
                .Select(n => new SomeNodeElement() { Node = n, Url = Helper.GetFullSourceLink(fixFunc(n.Attributes["src"].Value), document, responseUrl) })
                .ToArray();

        // <a href="..."> elements whose target looks like an image file.
        var allLINKLinks = !collectLINKTags
            ? new SomeNodeElement[] { }
            : document
                .DocumentNode
                .Descendants("a")
                .Where(n => n.Attributes.Contains("href"))
                .Where(n =>
                {
                    string href = n.Attributes["href"].Value;
                    return imageLinkPatterns.Any(p => Helper.StringLikes(href, p))
                        && Helper.IsWellFormedUriString(href, UriKind.RelativeOrAbsolute);
                })
                .Select(n => new SomeNodeElement() { Node = n, Url = Helper.GetFullSourceLink(fixFunc(n.Attributes["href"].Value), document, responseUrl) })
                .ToArray();

        // Image-looking URLs anywhere in the raw markup (meta tags, inline styles, scripts, ...).
        // Collected in a list; the Union chain below still applies set semantics to them.
        var allMETALinks = new List<SomeNodeElement>();
        if (collectMETATags && document.DocumentNode != null && document.DocumentNode.FirstChild != null)
        {
            string innerHtml = document.DocumentNode.InnerHtml;
            if (!string.IsNullOrWhiteSpace(innerHtml))
            {
                string regExString = "(https?:)?//?[^\'\"<>]+?\\.(jpg|jpeg|gif|png|bmp)";
                foreach (Match m in Regex.Matches(innerHtml, regExString))
                {
                    string value = m.Value;

                    // Keep only what follows the last '(' (e.g. the argument of a CSS url(...)).
                    value = value.Substring(value.LastIndexOf('(') + 1);

                    // Drop anything preceding an embedded scheme. The index is checked directly,
                    // so Substring can never be handed -1.
                    int httpIndex = value.IndexOf("http:", StringComparison.Ordinal);
                    if (httpIndex >= 0)
                    {
                        value = value.Substring(httpIndex);
                    }

                    int httpsIndex = value.IndexOf("https:", StringComparison.Ordinal);
                    if (httpsIndex >= 0)
                    {
                        value = value.Substring(httpsIndex);
                    }

                    // Undo JSON-style escaped slashes.
                    value = value.Replace(@"\/", "/");

                    if (!Helper.IsWellFormedUriString(value, UriKind.RelativeOrAbsolute))
                    {
                        continue;
                    }

                    try
                    {
                        Uri newUri = Helper.GetFullSourceLink(fixFunc(value), document, responseUrl);
                        allMETALinks.Add(new SomeNodeElement() { Node = document.DocumentNode, Url = newUri });
                    }
                    catch (Exception)
                    {
                        // Deliberate best-effort: the regular expression is only a heuristic, so a
                        // candidate that cannot be turned into a link is skipped rather than
                        // aborting the whole scan.
                    }
                }
            }
        }

        // Merge all sources and keep the first element seen for each distinct Url.
        var allLinks = allIMGLinks
            .Union(allLINKLinks)
            .Union(allMETALinks)
            .Union(allAdditionalLinks)
            .GroupBy(ne => ne.Url)
            .Select(neg => neg.First())
            .ToArray();

        // Nothing found and no text content: fall back to the response URL itself.
        if (allLinks.Length == 0 && string.IsNullOrWhiteSpace(document.DocumentNode.InnerText))
        {
            allLinks = new SomeNodeElement[] { new SomeNodeElement() { Node = document.DocumentNode, Url = new Uri(responseUrl) } };
        }

        return allLinks;
    }
    catch (Exception ex)
    {
        wasException = true;
        Helpers.Old.Log.Add(logSession, ex);
        // Rethrow without resetting the original stack trace.
        throw;
    }
    finally
    {
        Helpers.Old.Log.SessionEnd(logSession, wasException);
    }
}