コード例 #1
0
        /// <summary>
        /// Runs <c>GetTextRootProbability</c> on <paramref name="node"/> with a fresh
        /// <c>ParaClass</c> accumulator and returns a sub-path of the path it recorded.
        /// </summary>
        /// <param name="node">The HTML node to analyse.</param>
        /// <returns>The result of <c>XPath.SubXPath</c> applied to the recorded path with an argument of -2.</returns>
        public static string GetTextNode(this HtmlNode node)
        {
            var probe = new ParaClass();
            node.GetTextRootProbability(probe);

            // NOTE(review): -2 presumably trims the two trailing path segments - confirm against XPath.SubXPath.
            var trimmed = XPath.SubXPath(probe.Path, -2);
            return trimmed;
        }
コード例 #2
0
ファイル: XPathAnalyzer.cs プロジェクト: shoff/Hawk
        /// <summary>
        /// Runs <c>GetTextRootProbability</c> on <paramref name="node"/> with a fresh
        /// <c>ParaClass</c> accumulator, wraps the recorded path in an <c>XPath</c> and
        /// returns the string form of <c>SubXPath(0, Count - 2)</c>.
        /// </summary>
        /// <param name="node">The HTML node to analyse.</param>
        /// <returns>The string representation of the shortened path.</returns>
        public static string GetTextNode(this HtmlNode node)
        {
            var probe = new ParaClass();
            node.GetTextRootProbability(probe);

            var fullPath = new XPath(probe.Path);

            // NOTE(review): the second argument is Count - 2; whether it is a length or an end index
            // depends on XPath.SubXPath - confirm there.
            var shortened = fullPath.SubXPath(0, fullPath.Count - 2);
            return shortened.ToString();
        }
コード例 #3
0
        /// <summary>
        /// Lazily enumerates candidate <c>CrawTarget</c>s found in <paramref name="doc2"/>.
        /// </summary>
        /// <remarks>
        /// Because this is an iterator, nothing runs until the result is enumerated.
        /// <list type="bullet">
        /// <item>With a non-empty <paramref name="rootPath"/>, the items found by <c>GetDiffNodes</c>
        /// under that path produce at most one target, whose <c>RootXPath</c> is set to the path.</item>
        /// <item>With more than one existing item, their common XPath (<c>XPath.GetMaxCompareXPath</c>)
        /// is queried; no match ends the sequence, exactly one match falls through to the
        /// single-item mode, otherwise one target is built from the common path.</item>
        /// <item>With zero or one existing item (or the forced single-item mode), candidate paths are
        /// scored into a dictionary by <c>GetTableRootProbability</c> and visited in descending score order.</item>
        /// </list>
        /// Candidates whose XPath no longer resolves to a node, or for which <c>getCrawTarget</c>
        /// yields null, are skipped instead of raising a <see cref="NullReferenceException"/>.
        /// </remarks>
        /// <param name="doc2">The parsed HTML document to search.</param>
        /// <param name="existItems">Items already selected; null is treated as an empty list.</param>
        /// <param name="rootPath">Optional XPath of a known root; when null or empty the root is searched for.</param>
        /// <param name="isAttrEnabled">Forwarded unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>A lazily evaluated sequence of crawl targets.</returns>
        public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
                                                                    ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List<CrawlItem>();
            }
            var dict = new Dictionary<string, double>();

            if (string.IsNullOrEmpty(rootPath))
            {
                var isForceItemOne = false; // force the mode that works from a single property
                if (existItems.Count > 1)
                {
                    string shortv = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
                    var nodes = doc2.DocumentNode.SelectNodes(shortv);
                    if (nodes == null || nodes.Count == 0)
                    {
                        yield break;
                    }

                    if (nodes.Count == 1)
                    {
                        isForceItemOne = true;
                    }
                    else
                    {
                        var items  = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                        var target = getCrawTarget(items, shortv);
                        if (target != null)
                        {
                            yield return(target);
                        }
                    }
                }

                if (isForceItemOne || existItems.Count == 1)
                {
                    var realPath = existItems.First().XPath;
                    var items    = XPath.Split(realPath);
                    var array    =
                        items.SelectAll(d => true)
                        .Select(d => XPath.SubXPath(realPath, d)).ToList();
                    foreach (var item in array)
                    {
                        // A sub-path may not resolve to a node; skip it rather than dereference null.
                        var node = doc2.DocumentNode.SelectSingleNode(item);
                        if (node == null)
                        {
                            continue;
                        }

                        // Node names containing '#' are filtered out before scoring.
                        GetTableRootProbability(
                            node.ChildNodes.Where(d2 => d2.Name.Contains("#") == false)
                            .ToList(), dict, false);
                    }
                }
                else
                {
                    // NOTE(review): when existItems.Count > 1 this still fills dict although it is not
                    // read afterwards; kept in case GetTableRootProbability has other effects.
                    GetTableRootProbability(
                        doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
                }
                if (isForceItemOne || existItems.Count < 2)
                {
                    IEnumerable<KeyValuePair<string, double>> p = dict.OrderByDescending(d => d.Value);
                    foreach (var keyValuePair in p)
                    {
                        var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                        if (items.Count == 0)
                        {
                            continue;
                        }

                        // getCrawTarget is null-checked in the multi-item branch above, so guard here as well.
                        var target = getCrawTarget(items, keyValuePair.Key);
                        if (target == null)
                        {
                            continue;
                        }

                        var firstMatch = doc2.DocumentNode.SelectSingleNode(keyValuePair.Key);
                        if (firstMatch == null)
                        {
                            continue;
                        }

                        var rootNode = firstMatch.ParentNode;
                        if (rootNode == null)
                        {
                            continue;
                        }

                        // SelectNodes is treated as nullable above; count null as zero matches.
                        var matches = doc2.DocumentNode.SelectNodes(keyValuePair.Key);

                        target.Html        = rootNode.InnerHtml;
                        target.Text        = rootNode.InnerText;
                        target.NodeCount   = matches == null ? 0 : matches.Count;
                        target.Score       = keyValuePair.Value;
                        target.ColumnCount = items.Count;
                        yield return(target);
                    }
                }
            }
            else
            {
                var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
                if (items.Count > 0)
                {
                    var root = doc2.DocumentNode.SelectSingleNode(rootPath);
                    if (root == null)
                    {
                        yield break;
                    }

                    var xpath  = XPath.RemoveFinalNum(root.XPath);
                    var target = getCrawTarget(items, xpath);
                    if (target != null)
                    {
                        target.RootXPath = rootPath;
                        yield return(target);
                    }
                }
            }
        }
コード例 #4
0
ファイル: XPathAnalyzer.cs プロジェクト: zoomeye2048/Hawk
        /// <summary>
        /// Lazily enumerates candidate <c>CrawTarget</c>s found in <paramref name="doc2"/>.
        /// </summary>
        /// <remarks>
        /// Because this is an iterator, nothing runs until the result is enumerated.
        /// <list type="bullet">
        /// <item>With a non-empty <paramref name="rootPath"/>, the items found by <c>GetDiffNodes</c>
        /// have their XPath rewritten via <c>XPath.TakeOff</c> against the root node's XPath and
        /// produce at most one target.</item>
        /// <item>With more than one existing item, a single target is built from their common XPath
        /// (<c>XPath.GetMaxCompareXPath</c>) and the sequence ends.</item>
        /// <item>With zero or one existing item, candidate paths are scored into a dictionary by
        /// <c>GetTableRootProbability</c> and visited in descending score order.</item>
        /// </list>
        /// XPaths that do not resolve to a node are skipped instead of raising a
        /// <see cref="NullReferenceException"/>.
        /// </remarks>
        /// <param name="doc2">The parsed HTML document to search.</param>
        /// <param name="existItems">Items already selected; null is treated as an empty list.</param>
        /// <param name="rootPath">Optional XPath of a known root; when null or empty the root is searched for.</param>
        /// <param name="isAttrEnabled">Forwarded unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>A lazily evaluated sequence of crawl targets.</returns>
        public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
                                                                    ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List<CrawlItem>();
            }
            var dict = new Dictionary<string, double>();

            if (string.IsNullOrEmpty(rootPath))
            {
                if (existItems.Count > 1)
                {
                    string shortv = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));

                    var items  = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                    var target = getCrawTarget(items, shortv);
                    if (target != null)
                    {
                        yield return(target);
                    }
                    yield break;
                }

                if (existItems.Count == 1)
                {
                    var realPath = existItems.First().XPath;
                    var items    = XPath.Split(realPath);
                    var array    =
                        items.SelectAll(d => true)
                        .Select(d => XPath.SubXPath(realPath, d)).ToList();
                    foreach (var item in array)
                    {
                        // A sub-path may not resolve to a node; skip it rather than dereference null.
                        var node = doc2.DocumentNode.SelectSingleNode(item);
                        if (node == null)
                        {
                            continue;
                        }

                        // Node names containing '#' are filtered out before scoring.
                        GetTableRootProbability(
                            node.ChildNodes.Where(d2 => d2.Name.Contains("#") == false)
                            .ToList(), dict, false);
                    }
                }
                else
                {
                    GetTableRootProbability(
                        doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
                }

                // The Count > 1 case already ended the sequence above; the check is kept in case the
                // collection was modified between the two reads.
                if (existItems.Count < 2)
                {
                    IEnumerable<KeyValuePair<string, double>> p = dict.OrderByDescending(d => d.Value);
                    foreach (var keyValuePair in p)
                    {
                        var items  = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                        var target = getCrawTarget(items, keyValuePair.Key);
                        if (target != null)
                        {
                            yield return(target);
                        }
                    }
                }
            }
            else
            {
                var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
                if (items.Count > 0)
                {
                    var root = doc2.DocumentNode.SelectSingleNode(rootPath);
                    if (root == null)
                    {
                        // rootPath does not resolve to a node, so item paths cannot be rewritten against it.
                        yield break;
                    }

                    foreach (var crawlItem in items)
                    {
                        crawlItem.XPath = XPath.TakeOff(crawlItem.XPath, root.XPath);
                    }

                    var target = getCrawTarget(items, rootPath);
                    if (target != null)
                    {
                        yield return(target);
                    }
                }
            }
        }