Beispiel #1
0
 /// <summary>
 /// Builds a parse rule for the given sample nodes by trying each strategy in turn
 /// and keeping the first one that yields a rule.
 /// </summary>
 /// <param name="nodes">Sample nodes (with their URLs) the rule is derived from.</param>
 /// <param name="label">Label passed through to the rule-building strategies.</param>
 /// <param name="minSize">Minimum image size used by the size-aware attempts.</param>
 /// <param name="collectIMGTags">Forwarded unchanged to every strategy.</param>
 /// <param name="collectLINKTags">Forwarded unchanged to every strategy.</param>
 /// <param name="collectMETATags">Forwarded unchanged to every strategy.</param>
 /// <returns>The first rule a strategy produces, or an empty <c>ParseRule</c>; never null.</returns>
 public static ParseRule GetRule(HtmlNodeWithUrl[] nodes, string label, System.Drawing.Size minSize, bool collectIMGTags, bool collectLINKTags, bool collectMETATags)
 {
     // Attempts without a size constraint come first: by link, then by XPath.
     ParseRule rule = GetRuleByLink(nodes, label, collectIMGTags, collectLINKTags, collectMETATags, null);
     rule = rule ?? GetRuleByXPath(nodes, label, collectIMGTags, collectLINKTags, collectMETATags, null);

     // Only when those fail are the same two strategies repeated with the size constraint.
     rule = rule ?? GetRuleByLink(nodes, label, collectIMGTags, collectLINKTags, collectMETATags, minSize);
     rule = rule ?? GetRuleByXPath(nodes, label, collectIMGTags, collectLINKTags, collectMETATags, minSize);

     // Callers always receive an instance, even when no strategy succeeded.
     return rule ?? new ParseRule();
 }
Beispiel #2
0
        /// <summary>
        /// Tries to derive a parse rule that locates the sample images through the XPath of their nodes.
        /// </summary>
        /// <remarks>
        /// Strategies, in order:
        /// <list type="number">
        /// <item>ByXPath: a mask built from all sample XPaths is accepted when it selects exactly one
        /// image in every sample document. The mask from <c>LongestMaskedStringBetween</c> is preferred
        /// over the one from <c>LongestMaskedPathBetween</c>.</item>
        /// <item>ByXPathAndIndex: the <c>LongestMaskedStringBetween</c> mask plus the position of the
        /// sample image among the images selected by that mask.</item>
        /// <item>Only when <paramref name="minSize"/> is set and nothing matched: the smallest width and
        /// height measured over the sample images are computed, and if they exceed the requested minimum
        /// in either dimension, <c>GetRuleByLink</c> is retried with that measured size.</item>
        /// </list>
        /// </remarks>
        /// <param name="nodesarr">Sample nodes with their URLs; null or empty yields null.</param>
        /// <param name="label">Value stored in the rule's <c>Label</c>.</param>
        /// <param name="collectIMGTags">Forwarded to <c>Helper.GetAllImagesUrlsFromUrl</c> and stored on the rule.</param>
        /// <param name="collectLINKTags">Forwarded to <c>Helper.GetAllImagesUrlsFromUrl</c> and stored on the rule.</param>
        /// <param name="collectMETATags">Forwarded to <c>Helper.GetAllImagesUrlsFromUrl</c> and stored on the rule.</param>
        /// <param name="minSize">Optional minimum image size; when set, candidates are passed through
        /// <c>Helper.GetAllImagesUrlsWithMinSize</c> and the rule is marked with <c>CheckImageSize</c>.</param>
        /// <returns>The derived rule, or null when no strategy produced one.</returns>
        private static ParseRule GetRuleByXPath(HtmlNodeWithUrl[] nodesarr, string label, bool collectIMGTags, bool collectLINKTags, bool collectMETATags, System.Drawing.Size? minSize = null)
        {
            // Without samples there is nothing to learn from. Returning here also keeps the
            // min-size fallback below from dereferencing a null array.
            if (nodesarr == null || nodesarr.Length == 0)
            {
                return null;
            }

            ParseRule result = null;

            string[] xpaths = nodesarr.Select(n => n.Node.XPath).ToArray();
            string pathMask = LongestMaskedPathBetween(xpaths);
            string stringMask = LongestMaskedStringBetween(xpaths);

            #region ByXPath
            // The string mask wins whenever it qualifies; the path mask is only a fallback.
            if (EveryNodeHasSingleImageForXPathMask(nodesarr, stringMask, collectIMGTags, collectLINKTags, collectMETATags, minSize))
            {
                result = new ParseRule()
                {
                    Label = label,
                    Condition = ParseFindRuleCondition.ByXPath,
                    Parameter = stringMask
                };
            }
            else if (EveryNodeHasSingleImageForXPathMask(nodesarr, pathMask, collectIMGTags, collectLINKTags, collectMETATags, minSize))
            {
                result = new ParseRule()
                {
                    Label = label,
                    Condition = ParseFindRuleCondition.ByXPath,
                    Parameter = pathMask
                };
            }
            #endregion

            #region ByXPathAndIndex
            if (result == null)
            {
                // The smallest index over all samples is used; a single sample whose image is not
                // found (-1) therefore rejects the rule.
                // NOTE(review): samples resolving to different non-negative indexes still produce a
                // rule with the smallest one - confirm whether all samples should be required to agree.
                int index = nodesarr
                    .Select(n => IndexOfNodeImageForXPathMask(n, stringMask, collectIMGTags, collectLINKTags, collectMETATags, minSize))
                    .Min();

                if (index != -1)
                {
                    result = new ParseRule()
                    {
                        Label = label,
                        Condition = ParseFindRuleCondition.ByXPathAndIndex,
                        Parameter = stringMask + ";" + index.ToString()
                    };
                }
            }
            #endregion

            if (result != null)
            {
                result.CheckImageSize = minSize != null;
                if (minSize != null)
                {
                    result.MinImageSize = minSize.Value;
                }
                result.CollectIMGTags = collectIMGTags;
                result.CollectLINKTags = collectLINKTags;
                result.CollectMETATags = collectMETATags;
                return result;
            }

            if (minSize == null)
            {
                return null;
            }

            // Find the smallest width and height among the sample images. The running minimum must
            // start at the top of the range: starting at 0 could never be lowered, which would make
            // the retry below unreachable.
            bool measuredAny = false;
            int smallestWidth = int.MaxValue;
            int smallestHeight = int.MaxValue;

            foreach (var sz in Helper.GetImageSizes(nodesarr.Select(n => new SomeNodeElement() { Node = n.Node, Url = n.Url }).ToArray()).Select(n => n.Value))
            {
                measuredAny = true;
                if (sz.Width < smallestWidth)
                {
                    smallestWidth = sz.Width;
                }
                if (sz.Height < smallestHeight)
                {
                    smallestHeight = sz.Height;
                }
            }

            if (!measuredAny)
            {
                return null;
            }

            System.Drawing.Size minCalcedSize = new System.Drawing.Size(smallestWidth, smallestHeight);

            // Retry only when the measured minimum is stricter than the requested one.
            // NOTE(review): the retry goes through GetRuleByLink, not GetRuleByXPath - confirm this is intended.
            if (minSize.Value.Height < minCalcedSize.Height || minSize.Value.Width < minCalcedSize.Width)
            {
                return GetRuleByLink(nodesarr, label, collectIMGTags, collectLINKTags, collectMETATags, minCalcedSize);
            }

            return null;
        }

        /// <summary>
        /// Checks whether <paramref name="mask"/> selects exactly one image in the document of every sample node.
        /// </summary>
        private static bool EveryNodeHasSingleImageForXPathMask(HtmlNodeWithUrl[] nodesarr, string mask, bool collectIMGTags, bool collectLINKTags, bool collectMETATags, System.Drawing.Size? minSize)
        {
            return nodesarr.All(node =>
            {
                var links = Helper
                    .GetAllImagesUrlsFromUrl(node.Node.OwnerDocument, node.Url.AbsoluteUri, collectIMGTags, collectLINKTags, collectMETATags, null)
                    .Where(image => Helper.StringLikes(image.Node.XPath, mask));

                int matches = minSize == null
                    ? links.Count()
                    : Helper.GetAllImagesUrlsWithMinSize(links.ToArray(), minSize.Value).Count();

                return matches == 1;
            });
        }

        /// <summary>
        /// Returns the position of the sample node's URL among the images that <paramref name="mask"/>
        /// selects in the node's document, or -1 when it is not among them.
        /// </summary>
        private static int IndexOfNodeImageForXPathMask(HtmlNodeWithUrl node, string mask, bool collectIMGTags, bool collectLINKTags, bool collectMETATags, System.Drawing.Size? minSize)
        {
            var links = Helper
                .GetAllImagesUrlsFromUrl(node.Node.OwnerDocument, node.Url.AbsoluteUri, collectIMGTags, collectLINKTags, collectMETATags, null)
                .Where(image => Helper.StringLikes(image.Node.XPath, mask));

            string[] images =
                (minSize == null
                    ? links.ToArray()
                    : Helper.GetAllImagesUrlsWithMinSize(links.ToArray(), minSize.Value)
                )
                .Select(image => image.Url.AbsoluteUri)
                .ToArray();

            string wanted = node.Url.AbsoluteUri;
            for (int i = 0; i < images.Length; i++)
            {
                // Ordinal, case-insensitive: URLs are identifiers, so culture-specific casing rules must not apply.
                if (string.Equals(images[i], wanted, System.StringComparison.OrdinalIgnoreCase))
                {
                    return i;
                }
            }

            return -1;
        }