Exemplo n.º 1
0
        /// <summary>
        /// Builds a rule with one extract block (a "#main" selector plus the
        /// "time", "author" and "content" metas), stores it through
        /// RuleLiteDb.AddOrUpdate and asserts that the rule ends up with a positive Id.
        /// </summary>
        public void TestCreateRule()
        {
            var rule = new RuleModel
            {
                Url = "http://www.ruijihg.com/2018/05/24/json-net-%e5%8f%8d%e5%ba%8f%e5%88%97%e5%8c%96%e6%8e%a5%e5%8f%a3%e9%9b%86%e5%90%88/"
            };

            // The domain is derived from the URL that was just stored on the rule.
            rule.Domain     = new Uri(rule.Url).GetDomain();
            rule.Expression = "http://www.ruijihg.com/????/??/??/*";

            // Block that selects the inner HTML of "#main".
            var extractBlock = new ExtractBlock();
            extractBlock.Selectors = new List<ISelector>
            {
                new CssSelector("#main", CssTypeEnum.INNERHTML)
            };
            extractBlock.Metas = new ExtractMetaCollection();

            // One single-selector meta per field of interest.
            var timeSelectors = new List<ISelector> { new CssSelector("time", CssTypeEnum.TEXT) };
            extractBlock.Metas.AddMeta("time", timeSelectors);

            var authorSelectors = new List<ISelector> { new CssSelector(".author", CssTypeEnum.TEXT) };
            extractBlock.Metas.AddMeta("author", authorSelectors);

            var contentSelectors = new List<ISelector> { new CssSelector(".entry-content", CssTypeEnum.INNERHTML) };
            extractBlock.Metas.AddMeta("content", contentSelectors);

            // The block is stored on the rule as its JSON serialization.
            rule.BlockExpression = JsonConvert.SerializeObject(extractBlock);

            RuleLiteDb.AddOrUpdate(rule);

            Assert.True(rule.Id > 0);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts the distinct addresses found in a snapshot's content.
        /// </summary>
        /// <remarks>
        /// When <c>snapshot.Expression</c> is null or empty, the tile selector is a
        /// single CSS selector for the "href" attribute of "a" elements; otherwise the
        /// selectors come from parsing the expression with
        /// <c>RuiJiBlockParser.ParserBase</c>. Every extracted tile's content is
        /// stripped of its "#..." fragment, and values that are well-formed relative
        /// URIs are resolved against <c>snapshot.RequestUrl</c>.
        /// </remarks>
        /// <param name="snapshot">Snapshot supplying Content, Expression and RequestUrl.</param>
        /// <returns>The extracted addresses with duplicates removed.</returns>
        protected List<string> ExtractFeedAddress(Snapshot snapshot)
        {
            var block = new ExtractBlock();

            if (string.IsNullOrEmpty(snapshot.Expression))
            {
                // No custom expression: fall back to every anchor's href.
                block.TileSelector.Selectors.Add(new CssSelector("a", "href"));
            }
            else
            {
                // A custom expression replaces whatever selectors are present.
                block.TileSelector.Selectors.Clear();
                block.TileSelector.Selectors.AddRange(RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors);
            }

            var result  = RuiJiExtractor.Extract(snapshot.Content, block);
            var results = new List<string>();

            if (result.Tiles == null)
            {
                return results;
            }

            // Built lazily so RequestUrl is only parsed when a relative link shows up.
            Uri baseUri = null;

            foreach (var item in result.Tiles)
            {
                var href = item.Content.ToString();

                // Drop the fragment so links differing only by "#..." collapse together.
                var fragmentIndex = href.IndexOf('#');
                if (fragmentIndex >= 0)
                {
                    href = href.Substring(0, fragmentIndex);
                }

                if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                {
                    if (baseUri == null)
                    {
                        baseUri = new Uri(snapshot.RequestUrl);
                    }

                    href = new Uri(baseUri, href).AbsoluteUri;
                }

                results.Add(href);
            }

            return results.Distinct().ToList();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Parses a tile expression into an ExtractTile.
        /// </summary>
        /// <remarks>
        /// The trimmed expression is handed to Split with the "[meta]" and "[paging]"
        /// section patterns. The unnamed section provides the tile's name and
        /// selectors, "[meta]" provides its metas, and "[paging]" is parsed
        /// recursively as the tile selector of a block named "_paging".
        /// </remarks>
        /// <param name="expression">tile expression</param>
        /// <returns>the tile built from the expression</returns>
        public static ExtractTile ParserTile(string expression)
        {
            var sections = Split(expression.Trim(), new string[] { @"\s+\[meta\]", @"\s+\[paging\]" });
            var tile     = new ExtractTile();

            foreach (var header in sections.Keys)
            {
                var section = header.Trim();

                if (section == "")
                {
                    // Leading, unnamed section: the tile's own name and selectors.
                    var parsed = ParserBase(sections[header]);
                    tile.Name      = parsed.Name;
                    tile.Selectors = parsed.Selectors;
                }
                else if (section == "[meta]")
                {
                    tile.Metas = ParserMeta(sections[header]);
                }
                else if (section == "[paging]")
                {
                    // Paging is wrapped in a block whose tile selector is the parsed section.
                    var paging = new ExtractBlock("_paging");
                    paging.TileSelector = ParserTile(sections[header]);
                    tile.Paging         = paging;
                }
            }

            return tile;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Parses a block expression into a single ExtractBlock.
        /// </summary>
        /// <remarks>
        /// Works in two passes. First, every sub-expression returned by ParserBlocks
        /// is split into sections and turned into its own block. Second, each
        /// child block of a parsed block is replaced by the parsed block that
        /// carries the same name (if any), and blocks consumed that way are dropped
        /// from the result list. The first remaining block is returned.
        /// </remarks>
        /// <param name="expression">block expression</param>
        /// <returns>the first block left after nested references are linked</returns>
        public static ExtractBlock ParserBlock(string expression)
        {
            var normalized  = expression.Replace("\r\n", "\n").Trim();
            var expressions = ParserBlocks(normalized);
            var parsed      = new List<ExtractBlock>();

            // Pass 1: build one block per sub-expression.
            foreach (var raw in expressions)
            {
                var current  = new ExtractBlock();
                var body     = raw.Replace("\r\n", "\n").Trim();
                var sections = Split(body, new string[] { @"\[block\]", @"\[blocks\]", @"\[tile\]", @"\[meta\]", @"\[paging\]" });

                foreach (var header in sections.Keys)
                {
                    switch (header)
                    {
                    case "":
                    case "[block]":
                    {
                        var parsedBase = ParserBase(sections[header]);
                        current.Name      = parsedBase.Name;
                        current.Selectors = parsedBase.Selectors;
                        break;
                    }

                    case "[storage]":
                    {
                        // Recognised but intentionally ignored.
                        break;
                    }

                    case "[blocks]":
                    {
                        // Each "@name" line becomes a named placeholder child block.
                        var entries = sections[header].Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries);
                        foreach (var entry in entries)
                        {
                            var reference = entry.Trim();
                            if (reference.StartsWith("@"))
                            {
                                current.Blocks.Add(new ExtractBlock(reference.TrimStart('@')));
                            }
                        }
                        break;
                    }

                    case "[tile]":
                    {
                        current.TileSelector = ParserTile(sections[header]);
                        break;
                    }

                    case "[meta]":
                    {
                        current.Metas = ParserMeta(sections[header]);
                        break;
                    }

                    case "[paging]":
                    {
                        // Block-level paging is stored both as Paging and as a child block.
                        var paging = new ExtractBlock("_paging");
                        paging.TileSelector = ParserTile(sections[header]);
                        current.Paging      = paging;
                        current.Blocks.Add(paging);
                        break;
                    }
                    }
                }

                // Paging declared on the tile replaces any "_paging" child added above.
                if (current.TileSelector.Paging != null)
                {
                    current.Blocks.RemoveAll(child => child.Name == "_paging");
                    current.Blocks.Add(current.TileSelector.Paging);
                }

                parsed.Add(current);
            }

            // Pass 2: swap child blocks for the parsed block of the same name.
            var consumed = new List<ExtractBlock>();

            foreach (var owner in parsed)
            {
                for (var index = 0; index < owner.Blocks.Count; index++)
                {
                    var wanted = owner.Blocks[index].Name;
                    var match  = parsed.Where(candidate => candidate != null).SingleOrDefault(candidate => candidate.Name == wanted);

                    if (match != null)
                    {
                        owner.Blocks[index] = match;
                        consumed.Add(match);
                    }
                }
            }

            // Blocks that were nested into another block are no longer top-level results.
            parsed.RemoveAll(candidate => consumed.Contains(candidate));

            return parsed.First();
        }
Exemplo n.º 5
0
        /// <summary>
        /// Renders a block as expression text.
        /// </summary>
        /// <remarks>
        /// Sections are written in this order, each only when it has content:
        /// "[block]", "[blocks]" (one "@name" line per child not named "_paging"),
        /// "[tile]" with its tab-prefixed "[meta]" and "[paging]" sub-sections,
        /// "[meta]", "[paging]", and finally the recursive rendering of every child
        /// block not named "_paging". Lines are joined with "\r\n".
        /// </remarks>
        /// <param name="block">block to render</param>
        /// <returns>the expression text for the block and its child blocks</returns>
        public static string ToExpression(ExtractBlock block)
        {
            var lines = new List<string>();

            if (block.Selectors.Count > 0)
            {
                lines.Add("[block]");
                lines.AddRange(GetSelectorsExpression(block));
            }

            if (block.Blocks.Any(child => child.Name != "_paging"))
            {
                lines.Add("[blocks]");
                foreach (var child in block.Blocks.Where(child => child.Name != "_paging"))
                {
                    lines.Add("@" + child.Name);
                }

                // Extra line break after the last reference closes the section.
                lines[lines.Count - 1] += "\r\n";
            }

            var tile = block.TileSelector;
            if (tile.Selectors.Count > 0)
            {
                lines.Add("[tile]");
                lines.AddRange(GetSelectorsExpression(tile));

                if (tile.Metas.Count > 0)
                {
                    lines.Add("\t[meta]");
                    foreach (var tileMeta in tile.Metas)
                    {
                        lines.AddRange(GetSelectorsExpression(tileMeta.Value, 1));
                    }
                }

                if (tile.Paging != null)
                {
                    lines.Add("\t[paging]");
                    lines.AddRange(GetSelectorsExpression(tile.Paging.TileSelector, 1));
                }
            }

            if (block.Metas.Count > 0)
            {
                lines.Add("[meta]");
                foreach (var blockMeta in block.Metas)
                {
                    lines.AddRange(GetSelectorsExpression(blockMeta.Value));
                }
            }

            if (block.Paging != null)
            {
                lines.Add("[paging]");
                lines.AddRange(GetSelectorsExpression(block.Paging.TileSelector));
            }

            // Child blocks are rendered after the parent; "_paging" children are skipped.
            foreach (var child in block.Blocks.Where(child => child.Name != "_paging"))
            {
                lines.Add(ToExpression(child));
            }

            return string.Join("\r\n", lines);
        }