Beispiel #1
0
        /// <summary>
        ///     Builds the longest shared leading path of the given XPaths.
        /// </summary>
        /// <remarks>
        ///     Segments are compared position by position, up to the length of the shortest path, until
        ///     some path disagrees with the first one. The first path is then cut with
        ///     <c>SubXPath(position + 1)</c> and <c>RemoveFinalNum()</c> is called on the result.
        ///     NOTE(review): presumably the "+ 1" keeps the first differing segment because it differs
        ///     only by its trailing index, which RemoveFinalNum strips - confirm against XPath.SubXPath.
        ///     When every compared segment matches, the position equals the shortest length, so the
        ///     argument is one past it; verify SubXPath tolerates that.
        /// </remarks>
        /// <param name="items">Paths to compare. An empty list makes the <c>Min</c> call throw.</param>
        /// <returns>The value returned by <c>SubXPath</c>, after <c>RemoveFinalNum()</c> was applied to it.</returns>
        public static XPath GetMaxCompareXPath(IList <XPath> items)
        {
            int shortest = items.Min(d => d.Count);
            int position = 0;
            bool diverged = false;

            while (position < shortest)
            {
                // The first path supplies the reference segment for this position.
                string expected = items[0][position];

                for (int other = 1; other < items.Count; other++)
                {
                    if (expected != items[other][position])
                    {
                        diverged = true;
                        break;
                    }
                }

                if (diverged)
                {
                    break;
                }
                position++;
            }

            XPath result = items.First().SubXPath(position + 1);
            result.RemoveFinalNum();
            return result;
        }
Beispiel #2
0
        /// <summary>
        ///     Scores how likely a group of sibling nodes is the set of repeated items of a list and
        ///     stores the score in <paramref name="dict"/>, keyed by the first node's XPath with its
        ///     final index removed.
        /// </summary>
        /// <param name="nodes">Sibling nodes to evaluate. Groups with fewer than three nodes are not scored.</param>
        /// <param name="dict">Receives the computed score for the group's XPath.</param>
        /// <param name="haschild">
        ///     When true, the non-"#" children of every node in <paramref name="nodes"/> are evaluated
        ///     recursively before the group itself is scored.
        /// </param>
        private static void GetTableRootProbability(IList <HtmlNode> nodes, Dictionary <string, double> dict,
                                                    bool haschild)
        {
            if (nodes.Count == 0)
            {
                return;
            }

            var xpath = XPath.RemoveFinalNum(nodes[0].XPath);

            if (haschild)
            {
                foreach (var htmlNode in nodes)
                {
                    // Children whose name contains "#" are skipped, as in the original filter.
                    GetTableRootProbability(htmlNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict,
                                            haschild);
                }
            }

            if (nodes.Count < 3)
            {
                return;
            }

            var childCount  = (double)nodes.Count;
            var childCounts = nodes.Select(d => (double)d.ChildNodes.Count).ToArray();
            var v           = childCounts.Variance();

            // TODO: a better criterion is needed here, because the valid nodes are often interleaved
            // with other nodes.
            if (v > 100)
            {
                return;
            }

            // Only the last node is sampled for the leaf count.
            var leafCount = nodes.Last().GetLeafNodeCount();

            // Uniform groups (variance 0) get weight 2. Otherwise the weight is Log10((100 - v) / 100),
            // which is not positive for 0 < v <= 100.
            // NOTE(review): at v == 100 exactly the weight is Log10(0) = -Infinity - confirm this is intended.
            var value = (childCount * PM25 + leafCount) * (v == 0 ? 2 : (Math.Log10((100 - v) / 100)));

            dict.SetValue(xpath, value);
        }
Beispiel #3
0
        /// <summary>
        ///     Lazily yields candidate crawl targets found in <paramref name="doc2"/>.
        /// </summary>
        /// <remarks>
        ///     Three modes, chosen from the arguments:
        ///     <list type="bullet">
        ///         <item>
        ///             <paramref name="rootPath"/> given: at most one target built for that root.
        ///         </item>
        ///         <item>
        ///             Two or more <paramref name="existItems"/> whose common path matches more than
        ///             one node: at most one target built for that common path.
        ///         </item>
        ///         <item>
        ///             Otherwise (no items, one item, or a common path matching a single node):
        ///             candidate paths are scored with GetTableRootProbability and targets are yielded
        ///             in descending score order.
        ///         </item>
        ///     </list>
        ///     Candidates whose nodes or target cannot be resolved are skipped instead of throwing.
        /// </remarks>
        /// <param name="doc2">Document to search.</param>
        /// <param name="existItems">Already known items; null is treated as an empty collection.</param>
        /// <param name="rootPath">Optional XPath of the list root; when set, no scoring is done.</param>
        /// <param name="isAttrEnabled">Passed through unchanged to GetDiffNodes.</param>
        /// <returns>A deferred sequence of targets; nothing runs until it is enumerated.</returns>
        public static IEnumerable <CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
                                                                     ICollection <CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List <CrawlItem>();
            }

            if (!string.IsNullOrEmpty(rootPath))
            {
                var rootItems = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List <CrawlItem>());
                if (rootItems.Count > 0)
                {
                    // SelectSingleNode returns null when the path matches nothing.
                    var root = doc2.DocumentNode.SelectSingleNode(rootPath);
                    if (root != null)
                    {
                        var rootTarget = getCrawTarget(rootItems, XPath.RemoveFinalNum(root.XPath));

                        // The multi-item path below already treats a null target as possible, so guard here too.
                        if (rootTarget != null)
                        {
                            rootTarget.RootXPath = rootPath;
                            yield return rootTarget;
                        }
                    }
                }
                yield break;
            }

            var dict           = new Dictionary <string, double>();
            var isForceItemOne = false; // forces the single-item mode

            if (existItems.Count > 1)
            {
                string shortv = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
                var sharedNodes = doc2.DocumentNode.SelectNodes(shortv);
                if (sharedNodes == null || sharedNodes.Count == 0)
                {
                    yield break;
                }

                if (sharedNodes.Count == 1)
                {
                    isForceItemOne = true;
                }
                else
                {
                    var sharedItems  = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                    var sharedTarget = getCrawTarget(sharedItems, shortv);
                    if (sharedTarget != null)
                    {
                        yield return sharedTarget;
                    }

                    // Nothing else is produced in this mode. Stopping here avoids a recursive scan of
                    // the whole document whose scores were never read.
                    yield break;
                }
            }

            if (isForceItemOne || existItems.Count == 1)
            {
                // Score the children of every sub-path of the first known item.
                var realPath = existItems.First().XPath;
                var segments = XPath.Split(realPath);
                var subPaths =
                    segments.SelectAll(d => true)
                    .Select(d => XPath.SubXPath(realPath, d)).ToList();
                foreach (var subPath in subPaths)
                {
                    var ancestor = doc2.DocumentNode.SelectSingleNode(subPath);
                    if (ancestor == null)
                    {
                        continue;
                    }
                    GetTableRootProbability(
                        ancestor.ChildNodes.Where(d2 => d2.Name.Contains("#") == false).ToList(),
                        dict, false);
                }
            }
            else
            {
                // No known items: score the whole document recursively.
                GetTableRootProbability(
                    doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
            }

            foreach (var candidate in dict.OrderByDescending(d => d.Value))
            {
                var columns = GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems, 4);
                if (columns.Count == 0)
                {
                    continue;
                }

                var target = getCrawTarget(columns, candidate.Key);
                if (target == null)
                {
                    continue;
                }

                var firstMatch = doc2.DocumentNode.SelectSingleNode(candidate.Key);
                if (firstMatch == null || firstMatch.ParentNode == null)
                {
                    continue;
                }
                var rootNode = firstMatch.ParentNode;

                var matches = doc2.DocumentNode.SelectNodes(candidate.Key);
                if (matches == null)
                {
                    continue;
                }

                target.Html        = rootNode.InnerHtml;
                target.Text        = rootNode.InnerText;
                target.NodeCount   = matches.Count;
                target.Score       = candidate.Value;
                target.ColumnCount = columns.Count;
                yield return target;
            }
        }