/// <summary>
/// Builds a rule for a sample article URL, attaches a serialized extract block
/// (one "#main" selector plus "time", "author" and "content" metas), stores it
/// through RuleLiteDb.AddOrUpdate and asserts that the rule ends up with a positive id.
/// </summary>
public void TestCreateRule()
{
    const string url = "http://www.ruijihg.com/2018/05/24/json-net-%e5%8f%8d%e5%ba%8f%e5%88%97%e5%8c%96%e6%8e%a5%e5%8f%a3%e9%9b%86%e5%90%88/";

    var rule = new RuleModel
    {
        Url = url,
        Domain = new Uri(url).GetDomain(),
        Expression = "http://www.ruijihg.com/????/??/??/*"
    };

    // Block-level selector: inner HTML of the "#main" element.
    var mainSelectors = new List<ISelector>
    {
        new CssSelector("#main", CssTypeEnum.INNERHTML)
    };
    var block = new ExtractBlock
    {
        Selectors = mainSelectors,
        Metas = new ExtractMetaCollection()
    };

    // Metas are registered on the block's own collection, in the same order as before.
    var metas = block.Metas;
    metas.AddMeta("time", new List<ISelector> { new CssSelector("time", CssTypeEnum.TEXT) });
    metas.AddMeta("author", new List<ISelector> { new CssSelector(".author", CssTypeEnum.TEXT) });
    metas.AddMeta("content", new List<ISelector> { new CssSelector(".entry-content", CssTypeEnum.INNERHTML) });

    // Serialize only after every meta has been added.
    rule.BlockExpression = JsonConvert.SerializeObject(block);

    RuleLiteDb.AddOrUpdate(rule);

    // The test expects the store to have assigned an id to the rule.
    Assert.True(rule.Id > 0);
}
/// <summary>
/// Extracts the distinct link addresses found in a snapshot's content.
/// </summary>
/// <remarks>
/// By default every <c>a</c> element's <c>href</c> is selected. When
/// <c>snapshot.Expression</c> is not empty, the selectors parsed from it by
/// <c>RuiJiBlockParser.ParserBase</c> replace that default. Each extracted value has
/// its fragment (everything from the first '#') removed, and values that are
/// well-formed relative URI strings are resolved against <c>snapshot.RequestUrl</c>.
/// </remarks>
/// <param name="snapshot">snapshot supplying Content, Expression and RequestUrl</param>
/// <returns>the extracted addresses with duplicates removed; empty when no tiles were extracted</returns>
protected List<string> ExtractFeedAddress(Snapshot snapshot)
{
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (!string.IsNullOrEmpty(snapshot.Expression))
    {
        // A custom expression replaces the default "a[href]" selector entirely.
        block.TileSelector.Selectors.Clear();

        var customSelectors = RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors;
        block.TileSelector.Selectors.AddRange(customSelectors);
    }

    var result = RuiJiExtractor.Extract(snapshot.Content, block);
    var results = new List<string>();

    if (result.Tiles == null)
    {
        return results;
    }

    // Created on first use only, so an invalid RequestUrl still throws only
    // when a relative link actually has to be resolved.
    Uri baseUri = null;

    foreach (var item in result.Tiles)
    {
        var href = item.Content.ToString();

        // Drop the fragment so links that differ only by "#..." collapse to one address.
        var fragmentStart = href.IndexOf('#');
        if (fragmentStart >= 0)
        {
            href = href.Substring(0, fragmentStart);
        }

        if (Uri.IsWellFormedUriString(href, UriKind.Relative))
        {
            if (baseUri == null)
            {
                baseUri = new Uri(snapshot.RequestUrl);
            }

            href = new Uri(baseUri, href).AbsoluteUri;
        }

        results.Add(href);
    }

    return results.Distinct().ToList();
}
/// <summary>
/// Parses a tile expression into an <see cref="ExtractTile"/>.
/// The trimmed expression is split on the "[meta]" and "[paging]" section headers;
/// the unnamed leading section supplies the tile's name and selectors, "[meta]"
/// supplies its metas and "[paging]" becomes a nested "_paging" block whose tile
/// selector is parsed recursively.
/// </summary>
/// <param name="expression">tile expression</param>
/// <returns>extract tile</returns>
public static ExtractTile ParserTile(string expression)
{
    var sectionHeaders = new string[] { @"\s+\[meta\]", @"\s+\[paging\]" };
    var sections = Split(expression.Trim(), sectionHeaders);

    var tile = new ExtractTile();

    foreach (var key in sections.Keys)
    {
        // Keys are compared after trimming; the section text is looked up by the raw key.
        var header = key.Trim();

        if (header == "")
        {
            var head = ParserBase(sections[key]);
            tile.Name = head.Name;
            tile.Selectors = head.Selectors;
        }
        else if (header == "[meta]")
        {
            tile.Metas = ParserMeta(sections[key]);
        }
        else if (header == "[paging]")
        {
            tile.Paging = new ExtractBlock("_paging")
            {
                TileSelector = ParserTile(sections[key])
            };
        }
    }

    return tile;
}
/// <summary>
/// Parses a block expression into an <see cref="ExtractBlock"/>.
/// The text is normalized to "\n" line endings, cut into individual block
/// expressions by ParserBlocks, and each of those is parsed section by section.
/// Afterwards every sub-block entry whose name matches a parsed block is replaced
/// by that parsed block, and the first block that was not used as a sub-block
/// is returned.
/// </summary>
/// <param name="expression">block expression</param>
/// <returns>extract block</returns>
public static ExtractBlock ParserBlock(string expression)
{
    expression = expression.Replace("\r\n", "\n").Trim();

    var blockExps = ParserBlocks(expression);
    var results = new List <ExtractBlock>();

    // Pass 1: parse every block expression on its own.
    foreach (var exp in blockExps)
    {
        var block = new ExtractBlock();
        var blockExp = exp.Replace("\r\n", "\n").Trim();

        // Sections are looked up below by header text ("" is the unnamed leading section).
        var lines = Split(blockExp, new string[] { @"\[block\]", @"\[blocks\]", @"\[tile\]", @"\[meta\]", @"\[paging\]" });

        foreach (var key in lines.Keys)
        {
            switch (key)
            {
                case "":
                case "[block]":
                {
                    // Name and selectors of the block itself.
                    var b = ParserBase(lines[key]);
                    block.Name = b.Name;
                    block.Selectors = b.Selectors;
                    break;
                }
                case "[storage]":
                {
                    // Intentionally a no-op. NOTE(review): "[storage]" is not among the
                    // header patterns passed to Split above — confirm whether this key can occur.
                    break;
                }
                case "[blocks]":
                {
                    // One sub-block reference per non-empty line; only lines starting with "@" count.
                    var bs = lines[key].Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var b in bs)
                    {
                        if (b.Trim().StartsWith("@"))
                        {
                            // Placeholder built from the referenced name (presumably stored as Name);
                            // it is swapped for the parsed block in pass 2.
                            block.Blocks.Add(new ExtractBlock(b.Trim().TrimStart('@')));
                        }
                    }
                    break;
                }
                case "[tile]":
                {
                    block.TileSelector = ParserTile(lines[key]);
                    break;
                }
                case "[meta]":
                {
                    block.Metas = ParserMeta(lines[key]);
                    break;
                }
                case "[paging]":
                {
                    // Block-level paging is kept both as block.Paging and as a "_paging" sub-block.
                    var b = new ExtractBlock("_paging");
                    b.TileSelector = ParserTile(lines[key]);
                    block.Paging = b;
                    block.Blocks.Add(b);
                    break;
                }
            }
        }

        // Paging declared on the tile replaces any "_paging" sub-block added above.
        if (block.TileSelector.Paging != null)
        {
            block.Blocks.RemoveAll(m => m.Name == "_paging");
            block.Blocks.Add(block.TileSelector.Paging);
        }

        results.Add(block);
    }

    // Pass 2: replace sub-block entries with the parsed block of the same name.
    var removes = new List <ExtractBlock>();
    foreach (var result in results)
    {
        for (int j = 0; j < result.Blocks.Count; j++)
        {
            // SingleOrDefault throws when more than one parsed block carries this name.
            var sub = results.Where(m => m != null).SingleOrDefault(m => m.Name == result.Blocks[j].Name);
            if (sub != null)
            {
                result.Blocks[j] = sub;
                removes.Add(sub);
            }
        }
    }

    // Blocks that were linked into another block are dropped from the top-level list.
    results.RemoveAll(m => removes.Contains(m));

    // First() throws when no block is left at this point.
    return(results.First());
}
/// <summary>
/// Renders a block as expression text.
/// Sections are emitted in this order, each only when it has content:
/// "[block]", "[blocks]" (one "@name" line per sub-block other than "_paging"),
/// "[tile]" (with tab-prefixed "[meta]" and "[paging]" sub-sections), "[meta]",
/// "[paging]", followed by the rendered text of every sub-block other than "_paging".
/// All pieces are joined with "\r\n".
/// </summary>
/// <param name="block">block to render</param>
/// <returns>the expression text</returns>
public static string ToExpression(ExtractBlock block)
{
    var parts = new List<string>();

    if (block.Selectors.Count > 0)
    {
        parts.Add("[block]");
        parts.AddRange(GetSelectorsExpression(block));
    }

    // References to sub-blocks; the internal "_paging" block is never listed.
    var childRefs = block.Blocks
        .Where(child => child.Name != "_paging")
        .Select(child => "@" + child.Name)
        .ToList();
    if (childRefs.Count > 0)
    {
        // The last reference carries an extra line break to close the section.
        childRefs[childRefs.Count - 1] += "\r\n";

        parts.Add("[blocks]");
        parts.AddRange(childRefs);
    }

    var tile = block.TileSelector;
    if (tile.Selectors.Count > 0)
    {
        parts.Add("[tile]");
        parts.AddRange(GetSelectorsExpression(tile));

        if (tile.Metas.Count > 0)
        {
            parts.Add("\t[meta]");
            foreach (var tileMeta in tile.Metas)
            {
                parts.AddRange(GetSelectorsExpression(tileMeta.Value, 1));
            }
        }

        var tilePaging = tile.Paging;
        if (tilePaging != null)
        {
            parts.Add("\t[paging]");
            parts.AddRange(GetSelectorsExpression(tilePaging.TileSelector, 1));
        }
    }

    if (block.Metas.Count > 0)
    {
        parts.Add("[meta]");
        foreach (var blockMeta in block.Metas)
        {
            parts.AddRange(GetSelectorsExpression(blockMeta.Value));
        }
    }

    var blockPaging = block.Paging;
    if (blockPaging != null)
    {
        parts.Add("[paging]");
        parts.AddRange(GetSelectorsExpression(blockPaging.TileSelector));
    }

    // Sub-blocks are rendered recursively after the block's own sections.
    foreach (var child in block.Blocks.Where(candidate => candidate.Name != "_paging"))
    {
        parts.Add(ToExpression(child));
    }

    return string.Join("\r\n", parts);
}