Пример #1
0
 /// <summary>
 /// Returns the elements of <paramref name="items"/> whose image size is accepted by
 /// <c>Helper.CheckImageSize</c> when compared against <paramref name="minSize"/>.
 /// </summary>
 /// <param name="items">Elements to examine; their sizes are obtained through <c>GetImageSizes</c>.</param>
 /// <param name="minSize">Size passed to <c>Helper.CheckImageSize</c> as the reference value.</param>
 /// <returns>Array of the accepted elements (empty when none are accepted).</returns>
 internal static SomeNodeElement[] GetAllImagesUrlsWithMinSize(SomeNodeElement[] items, System.Drawing.Size minSize)
 {
     // Sizes are resolved eagerly for every element before any filtering happens.
     var sizesByElement = GetImageSizes(items);

     // NOTE(review): the meaning of the third argument (true) is defined by Helper.CheckImageSize - confirm there.
     var acceptedElements =
         from entry in sizesByElement
         where Helper.CheckImageSize(entry.Value, minSize, true)
         select entry.Key;

     return acceptedElements.ToArray();
 }
Пример #2
0
        /// <summary>
        /// Builds a map from each element of <paramref name="items"/> to its image size.
        /// The size comes from the node's "Width" and "Height" attributes when both are present
        /// and parse as integers; otherwise it is obtained from <c>Helper.GetImageSize</c>
        /// using the element's absolute URL.
        /// </summary>
        /// <param name="items">Elements to measure.</param>
        /// <returns>Dictionary keyed by element with the resolved size as value.</returns>
        /// <remarks>
        /// Entries are inserted with <c>Dictionary.Add</c>, so two elements that compare equal
        /// as dictionary keys cause an <see cref="ArgumentException"/>.
        /// </remarks>
        internal static Dictionary<SomeNodeElement, System.Drawing.Size> GetImageSizes(SomeNodeElement[] items)
        {
            var sizesByElement = new Dictionary<SomeNodeElement, System.Drawing.Size>();

            for (int index = 0; index < items.Length; index++)
            {
                SomeNodeElement element = items[index];
                var node = element.Node;

                int declaredWidth = 0;
                int declaredHeight = 0;

                // Short-circuit order matters: attributes are only read once the node reports having any,
                // and the values are only parsed once both attributes are known to exist.
                bool hasDeclaredSize =
                    node.HasAttributes
                    && node.Attributes.Contains("Width")
                    && node.Attributes.Contains("Height")
                    && int.TryParse(node.Attributes["Width"].Value, out declaredWidth)
                    && int.TryParse(node.Attributes["Height"].Value, out declaredHeight);

                System.Drawing.Size size;
                if (hasDeclaredSize)
                {
                    size = new System.Drawing.Size(declaredWidth, declaredHeight);
                }
                else
                {
                    // NOTE(review): how the size is obtained (and what the 'true' flag does) is defined by Helper.GetImageSize.
                    size = Helper.GetImageSize(element.Url.AbsoluteUri, true);
                }

                sizesByElement.Add(element, size);
            }

            return sizesByElement;
        }
Пример #3
0
        /// <summary>
        /// Collects candidate image links from an HTML document and returns one element per distinct URL.
        /// Sources, each optional: <c>src</c> of <c>img</c> tags, <c>href</c> of <c>a</c> tags that match an
        /// image file mask, image-looking URLs found by a regular expression over the whole markup, and any
        /// attribute value accepted by <paramref name="additionalAttributeAsLink"/>.
        /// </summary>
        /// <param name="document">Parsed document to scan. Must not be null.</param>
        /// <param name="responseUrl">
        /// URL passed to <c>Helper.GetFullSourceLink</c> together with the document for every found link.
        /// It is also returned as the only link when nothing was found and the document has no text.
        /// </param>
        /// <param name="collectIMGTags">When true, <c>img[src]</c> values are collected.</param>
        /// <param name="collectLINKTags">When true, <c>a[href]</c> values matching an image file mask are collected.</param>
        /// <param name="collectMETATags">When true, the document markup is scanned with a regular expression for image URLs.</param>
        /// <param name="additionalAttributeAsLink">
        /// Optional predicate; every attribute value in the document (after entity replacement) for which it
        /// returns true is treated as a link. Pass null to skip this source.
        /// </param>
        /// <returns>Found elements, de-duplicated by <c>Url</c> (the first element seen for a URL wins).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        /// <remarks>Any other exception is logged to the log session and rethrown.</remarks>
        internal static SomeNodeElement[] GetAllImagesUrlsFromUrl(HtmlAgilityPack.HtmlDocument document, string responseUrl, 
            bool collectIMGTags, bool collectLINKTags, bool collectMETATags,
            Func<string,bool> additionalAttributeAsLink)
        {
            if (document == null)
                throw new ArgumentNullException("document", "В функции GetAllImagesUrlsFromUrl() не может отсутствовать обязательный параметр document");

            // HTML entities that are replaced by their characters inside link values.
            // An ordered array is used on purpose: "&amp;" must be handled LAST, otherwise
            // an escaped entity such as "&amp;lt;" would be unescaped twice and become "<".
            var entityReplacements = new[]
            {
                new KeyValuePair<string, string>("&lt;", "<"),
                new KeyValuePair<string, string>("&gt;", ">"),
                new KeyValuePair<string, string>("&sect;", "§"),
                new KeyValuePair<string, string>("&copy;", "©"),
                new KeyValuePair<string, string>("&reg;", "®"),
                new KeyValuePair<string, string>("&deg;", "°"),
                new KeyValuePair<string, string>("&laquo;", "«"),
                new KeyValuePair<string, string>("&raquo;", "»"),
                new KeyValuePair<string, string>("&middot;", "·"),
                new KeyValuePair<string, string>("&trade;", "™"),
                new KeyValuePair<string, string>("&plusmn;", "±"),
                new KeyValuePair<string, string>("&amp;", "&"),
            };
            Func<string, string> fixFunc = url =>
            {
                foreach (var replacement in entityReplacements)
                    url = url.Replace(replacement.Key, replacement.Value);
                return url;
            };

            // Masks handed to Helper.StringLikes to recognise hrefs that point at image files.
            string[] imageMasks = { "*.jpeg", "*.jpg", "*.bmp", "*.gif", "*.png" };

            bool wasException = false;
            var logSession = Helpers.Old.Log.SessionStart("Additional.GetAllImagesUrlsFromUrl()", true);
            try
            {
                // Source 1: any attribute value the caller-supplied predicate accepts.
                var allAdditionalLinks = additionalAttributeAsLink == null
                    ? new SomeNodeElement[] { }
                    : document
                        .DocumentNode
                        .Descendants()
                        .SelectMany(n => n.Attributes.Select(a => new { Node = n, AttributeValue = fixFunc(a.Value) }))
                        .Where(i => additionalAttributeAsLink(i.AttributeValue))
                        .Select(i => new SomeNodeElement() { Node = i.Node, Url = Helper.GetFullSourceLink(i.AttributeValue, document, responseUrl) })
                        .ToArray();

                // Source 2: <img src="...">.
                var allIMGLinks = !collectIMGTags
                    ? new SomeNodeElement[] { }
                    : document
                        .DocumentNode
                        .Descendants("img")
                        .Where(n =>
                            n.Attributes.Contains("src") &&
                            Helper.IsWellFormedUriString(n.Attributes["src"].Value, UriKind.RelativeOrAbsolute))
                        .Select(n => new SomeNodeElement() { Node = n, Url = Helper.GetFullSourceLink(fixFunc(n.Attributes["src"].Value), document, responseUrl) })
                        .ToArray();

                // Source 3: <a href="img_source"/> whose href matches one of the image masks.
                var allLINKLinks = !collectLINKTags
                    ? new SomeNodeElement[] { }
                    : document
                        .DocumentNode
                        .Descendants("a")
                        .Where(n => n.Attributes.Contains("href"))
                        .Where(n =>
                        {
                            string href = n.Attributes["href"].Value;
                            return imageMasks.Any(mask => Helper.StringLikes(href, mask))
                                && Helper.IsWellFormedUriString(href, UriKind.RelativeOrAbsolute);
                        })
                        .Select(n => new SomeNodeElement()
                        {
                            Node = n,
                            Url = Helper.GetFullSourceLink(fixFunc(n.Attributes["href"].Value), document, responseUrl)
                        })
                        .ToArray();

                // Source 4: image-looking URLs anywhere in the markup (meta tags, inline styles, scripts, ...).
                var allMETALinks = new List<SomeNodeElement>();
                if (collectMETATags
                    && document.DocumentNode != null
                    && document.DocumentNode.FirstChild != null
                    && !string.IsNullOrWhiteSpace(document.DocumentNode.InnerHtml))
                {
                    // Optional "http:"/"https:", then "/" or "//", then the shortest run of characters
                    // without quotes or angle brackets that ends in a known image extension.
                    string regExString = "(https?:)?//?[^\'\"<>]+?\\.(jpg|jpeg|gif|png|bmp)";
                    foreach (Match m in Regex.Matches(document.DocumentNode.InnerHtml, regExString))
                    {
                        string value = m.Value;

                        // Keep only what follows the last "(" (e.g. the argument of a CSS url(...)).
                        int lastParenthesis = value.LastIndexOf('(');
                        if (lastParenthesis >= 0)
                            value = value.Substring(lastParenthesis + 1);

                        // Drop anything that precedes an embedded scheme. The index is re-checked because
                        // the project's Like() may match more loosely than an ordinal search.
                        string http = "http:";
                        if (value.Like("*" + http + "*"))
                        {
                            int httpIndex = value.IndexOf(http, StringComparison.Ordinal);
                            if (httpIndex >= 0)
                                value = value.Substring(httpIndex);
                        }

                        string https = "https:";
                        if (value.Like("*" + https + "*"))
                        {
                            int httpsIndex = value.IndexOf(https, StringComparison.Ordinal);
                            if (httpsIndex >= 0)
                                value = value.Substring(httpsIndex);
                        }

                        // URLs embedded in JSON/JavaScript have their slashes escaped as "\/".
                        value = value.Replace(@"\/", "/");

                        if (Helper.IsWellFormedUriString(value, UriKind.RelativeOrAbsolute))
                        {
                            try
                            {
                                Uri newUri = Helper.GetFullSourceLink(fixFunc(value), document, responseUrl);
                                allMETALinks.Add(new SomeNodeElement() { Node = document.DocumentNode, Url = newUri });
                            }
                            catch (Exception)
                            {
                                // Deliberately best-effort: a regex hit that cannot be turned into
                                // a full link is skipped so the remaining matches are still processed.
                            }
                        }
                    }
                }

                // Merge all sources and keep the first element seen for every URL.
                var allLinks =
                        allIMGLinks
                        .Union(allLINKLinks)
                        .Union(allMETALinks)
                        .Union(allAdditionalLinks)
                        .GroupBy(ne => ne.Url)
                        .Select(neg => neg.First())
                        .ToArray();

                // Nothing found and no text in the document: fall back to the response URL itself
                // (presumably the response was the image - TODO confirm with callers).
                if (allLinks.Length == 0 && string.IsNullOrWhiteSpace(document.DocumentNode.InnerText))
                    allLinks = new SomeNodeElement[] { new SomeNodeElement() { Node = document.DocumentNode, Url = new Uri(responseUrl) } };

                return allLinks;
            }
            catch (Exception ex)
            {
                wasException = true;
                Helpers.Old.Log.Add(logSession, ex);
                // Rethrow with "throw;" so the original stack trace is preserved.
                throw;
            }
            finally
            {
                Helpers.Old.Log.SessionEnd(logSession, wasException);
            }
        }