/// <summary>
/// Stores a newly posted <see cref="CrawlItem"/> and answers with a "created" result.
/// </summary>
/// <param name="crawlItem">The item bound from the submitted form data.</param>
/// <returns>
/// A <c>CreatedAtAction</c> result that points at the <c>GetCrawlItem</c> action for the
/// new item's id and carries the item itself as the payload.
/// </returns>
public async Task<ActionResult<CrawlItem>> PostCrawlItem([FromForm] CrawlItem crawlItem)
{
    _context.CrawlItems.Add(crawlItem);
    await _context.SaveChangesAsync();

    // Route values are built after saving, so the id is read once the context has processed the item.
    var routeValues = new { id = crawlItem.Id };
    return CreatedAtAction("GetCrawlItem", routeValues, crawlItem);
}
/// <summary>
/// Creates a <c>CrawlItem</c> from the current selection (<c>SelectXPath</c>,
/// <c>SelectName</c>, <c>SelectText</c>) and appends it to <c>CrawlItems</c>.
/// </summary>
/// <remarks>
/// When <c>RootXPath</c> is set, the selected node has to pass <c>IsAncestor</c> against the
/// parent of the root node; the stored XPath is then reduced by that parent's XPath via
/// <c>XPath.TakeOff</c>.
/// </remarks>
/// <param name="isAlert">When true, problems are reported to the user through a message box.</param>
private void AddNewItem(bool isAlert = true)
{
    var path = SelectXPath;
    if (!string.IsNullOrEmpty(RootXPath))
    {
        // SelectSingleNode yields null when the expression matches nothing, so both lookups
        // are guarded before being dereferenced.
        var root = HtmlDoc.DocumentNode.SelectSingleNode(RootXPath)?.ParentNode;
        var node = HtmlDoc.DocumentNode.SelectSingleNode(path);
        if (root == null || node == null || !node.IsAncestor(root))
        {
            if (isAlert)
            {
                MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath");
            }
            return;
        }

        path = new XPath(node.XPath).TakeOff(root.XPath).ToString();
    }

    // The item captures the name as it is right now, i.e. before the rename below.
    var item = new CrawlItem { XPath = path, Name = SelectName, SampleData1 = SelectText };
    if (CrawlItems.Any(d => d.Name == SelectName))
    {
        // NOTE(review): SelectName is replaced by a generated name, but `item` keeps the
        // duplicate name and the message shows the generated one — confirm this is intended.
        SelectName = "属性" + CrawlItems.Count;
        if (isAlert)
        {
            MessageBox.Show($"已存在名称为{SelectName}的属性,不能重复添加");
            return;
        }
    }

    CrawlItems.Add(item);
    SelectXPath = "";
}
/// <summary>
/// Replaces the stored <see cref="CrawlItem"/> identified by <paramref name="id"/>.
/// </summary>
/// <param name="id">Identifier supplied with the form; must equal <c>crawlItem.Id</c>.</param>
/// <param name="crawlItem">The updated item bound from the submitted form data.</param>
/// <returns>
/// <c>BadRequest</c> when the two ids differ, <c>NotFound</c> when saving hits a concurrency
/// conflict and <c>CrawlItemExists(id)</c> is false, otherwise <c>NoContent</c>.
/// </returns>
public async Task<IActionResult> PutCrawlItem([FromForm] int id, [FromForm] CrawlItem crawlItem)
{
    if (id != crawlItem.Id)
    {
        return BadRequest();
    }

    var entry = _context.Entry(crawlItem);
    entry.State = EntityState.Modified;

    try
    {
        await _context.SaveChangesAsync();
    }
    catch (DbUpdateConcurrencyException)
    {
        // A conflict on an item that still exists is not handled here: let it propagate.
        if (CrawlItemExists(id))
        {
            throw;
        }

        return NotFound();
    }

    return NoContent();
}
/// <summary>
/// Restores this object's settings from a key/value dictionary.
/// </summary>
/// <remarks>
/// After the base implementation runs, the scalar settings are read through <c>dicts.Set</c>
/// (presumably "stored value or the supplied current value" — confirm against the helper).
/// A nested "HttpSet" dictionary, when present, is forwarded to <c>Http</c>. When
/// <paramref name="dicts"/> is a <c>FreeDocument</c> with children, each child is
/// deserialized into a new <c>CrawlItem</c> that is appended to <c>CrawlItems</c>.
/// </remarks>
/// <param name="dicts">Source dictionary.</param>
/// <param name="scenario">Serialization scenario, passed to the base implementation.</param>
public override void DictDeserialize(IDictionary<string, object> dicts, Scenario scenario = Scenario.Database)
{
    base.DictDeserialize(dicts, scenario);

    URL = dicts.Set("URL", URL);
    RootXPath = dicts.Set("RootXPath", RootXPath);
    Remark = dicts.Set("Remark", Remark);
    RootFormat = dicts.Set("RootFormat", RootFormat);
    ShareCookie.SelectItem = dicts.Set("ShareCookie", ShareCookie.SelectItem);
    IsMultiData = dicts.Set("IsMultiData", IsMultiData);
    IsSuperMode = dicts.Set("IsSuperMode", IsSuperMode);

    // Single lookup instead of ContainsKey + indexer; the nested value is only forwarded
    // when it really is a dictionary, so a malformed entry cannot be passed on as null.
    object httpSet;
    if (dicts.TryGetValue("HttpSet", out httpSet))
    {
        var httpSettings = httpSet as IDictionary<string, object>;
        if (httpSettings != null)
        {
            Http.UnsafeDictDeserialize(httpSettings);
        }
    }

    var doc = dicts as FreeDocument;
    if (doc?.Children != null)
    {
        foreach (var child in doc.Children)
        {
            var item = new CrawlItem();
            item.DictDeserialize(child);
            CrawlItems.Add(item);
        }
    }
}
/// <summary>
/// Validates the current selection and, if accepted, appends a new <c>CrawlItem</c> built
/// from <c>SelectXPath</c>, <c>SelectName</c> and <c>SelectText</c> to <c>CrawlItems</c>.
/// </summary>
/// <remarks>
/// With a non-empty <c>RootXPath</c> the root node and the selected node are both looked up
/// in <c>HtmlDoc</c>; the method returns early when a <c>SafeCheck</c> / <c>SafeInvoke</c>
/// call reports failure, and otherwise stores the result of <c>XPath.TakeOff</c> against the
/// root node's parent. A name that is already present only blocks the add when the user
/// does not confirm.
/// </remarks>
/// <param name="isAlert">
/// When true, the user is asked to confirm adding a node for which <c>IsAncestor</c> is false.
/// </param>
private void AddNewItem(bool isAlert = true)
{
    var itemPath = SelectXPath;

    if (!string.IsNullOrEmpty(RootXPath))
    {
        // TODO: exceptions caused by a malformed XPath expression need to be caught.
        HtmlNode rootNode = null;
        try
        {
            rootNode = HtmlDoc.DocumentNode.SelectSingleNode(RootXPath);
        }
        catch (Exception)
        {
            XLogSys.Print.Error($"{RootXPath} 不能被识别为正确的XPath表达式,请检查");
        }

        var rootFound = rootNode != null;
        if (!rootFound.SafeCheck("使用当前父节点XPath,在文档中找不到任何父节点"))
        {
            return;
        }

        // From here on the reference point is the PARENT of the node matched by RootXPath.
        rootNode = HtmlDoc.DocumentNode.SelectSingleNode(RootXPath)?.ParentNode;

        HtmlNode selectedNode = null;
        var lookupSucceeded = ControlExtended.SafeInvoke(
            () => HtmlDoc.DocumentNode.SelectSingleNode(itemPath),
            ref selectedNode,
            LogType.Info,
            "检查子节点XPath正确性",
            true);
        if (!lookupSucceeded)
        {
            return;
        }

        var nodeFound = selectedNode != null;
        if (!nodeFound.SafeCheck("使用当前子节点XPath,在文档中找不到任何子节点"))
        {
            return;
        }

        if (!selectedNode.IsAncestor(rootNode)
            && isAlert
            && MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath,是否依然要添加?", "提示信息",
                   MessageBoxButton.YesNo) == MessageBoxResult.No)
        {
            return;
        }

        itemPath = XPath.TakeOff(selectedNode.XPath, rootNode.XPath);
    }

    // A duplicate name is only a blocker when the user declines the confirmation.
    var nameAlreadyUsed = CrawlItems.FirstOrDefault(d => d.Name == SelectName) != null;
    if (nameAlreadyUsed
        && MessageBox.Show("已经存在同名的属性,是否依然添加?", "提示信息", MessageBoxButton.OKCancel) !=
        MessageBoxResult.OK)
    {
        return;
    }

    CrawlItems.Add(new CrawlItem { XPath = itemPath, Name = SelectName, SampleData1 = SelectText });
    SelectXPath = "";
    SelectName = "";
    XLogSys.Print.Info("成功添加属性");
}
/// <summary>
/// Restores this object's settings from a key/value dictionary.
/// </summary>
/// <remarks>
/// After the base implementation runs, the scalar settings are read through <c>dicts.Set</c>
/// (presumably "stored value or the supplied current value" — confirm against the helper).
/// A nested "HttpSet" dictionary, when present, is forwarded to <c>Http</c>. When
/// <paramref name="dicts"/> is a <c>FreeDocument</c> with children, each child is
/// deserialized into a new <c>CrawlItem</c> that is appended to <c>CrawlItems</c>.
/// </remarks>
/// <param name="dicts">Source dictionary.</param>
/// <param name="scenario">Serialization scenario, passed to the base implementation.</param>
public override void DictDeserialize(IDictionary<string, object> dicts, Scenario scenario = Scenario.Database)
{
    base.DictDeserialize(dicts, scenario);

    URL = dicts.Set("URL", URL);
    RootXPath = dicts.Set("RootXPath", RootXPath);
    IsMultiData = dicts.Set("IsMultiData", IsMultiData);
    IsJson2xml = dicts.Set("IsJson2xml", IsJson2xml);
    Crawler = dicts.Set("Crawler", Crawler);

    // Single lookup instead of ContainsKey + indexer; the nested value is only forwarded
    // when it really is a dictionary, so a malformed entry cannot be passed on as null.
    object httpSet;
    if (dicts.TryGetValue("HttpSet", out httpSet))
    {
        var httpSettings = httpSet as IDictionary<string, object>;
        if (httpSettings != null)
        {
            Http.UnsafeDictDeserialize(httpSettings);
        }
    }

    var doc = dicts as FreeDocument;
    if (doc?.Children != null)
    {
        foreach (var child in doc.Children)
        {
            var item = new CrawlItem();
            item.DictDeserialize(child);
            CrawlItems.Add(item);
        }
    }
}
/// <summary>
/// Fills every crawl result with values extracted from the page at its <c>Url</c>.
/// </summary>
/// <remarks>
/// For each configured entry in <c>CrawlItems</c> a fresh <c>CrawlItem</c> is created with the
/// same name, selector and attribute. Its value is the matched element's inner HTML when no
/// attribute is configured, otherwise the value of that attribute; it stays null when the
/// selector matches nothing. The page is fetched by blocking on <c>PageCrawler.GetPage</c>.
/// </remarks>
/// <param name="results">Results to enrich; the list and its elements are modified in place.</param>
/// <returns>The same list instance that was passed in.</returns>
public override List<CrawlResult> Process(List<CrawlResult> results)
{
    foreach (var crawlResult in results)
    {
        var page = PageCrawler.GetPage(crawlResult.Url).Result;

        foreach (var template in this.CrawlItems)
        {
            var extracted = new CrawlItem
            {
                Name = template.Name,
                Selector = template.Selector,
                Attr = template.Attr
            };

            var element = page.QuerySelector(extracted.Selector);
            extracted.Value = string.IsNullOrWhiteSpace(extracted.Attr)
                ? element?.InnerHtml
                : element?.GetAttribute(extracted.Attr);

            crawlResult.CrawlItems.Add(extracted);
        }
    }

    return results;
}
/// <summary>
/// Downloads the listing page at <c>http://xiaohua.zol.com.cn/new/1.html</c>, follows every
/// article link not yet recorded in <c>caches</c>, and converts each article into a
/// <c>CrawlItem</c>.
/// </summary>
/// <remarks>
/// <c>caches</c> is cleared once it holds more than 100 entries. Pages are decoded as GBK.
/// An article is only kept when both its title and its text node are found; titles shorter
/// than 8 characters are also used as the single tag.
/// </remarks>
/// <returns>
/// The collected items, or <c>null</c> when the listing page contains no
/// <c>li.article-summary</c> nodes.
/// </returns>
public List<CrawlItem> Execute()
{
    if (caches.Count > 100)
    {
        caches.Clear();
    }

    Encoding encoding = Encoding.GetEncoding("GBK");
    HtmlDocument document = HtmlAdapter.LoadDocument("http://xiaohua.zol.com.cn/new/1.html", encoding);
    var nodes = document.DocumentNode.SelectNodes(".//li[@class='article-summary']");
    if (nodes == null)
    {
        return null;
    }

    List<CrawlItem> list = new List<CrawlItem>();
    foreach (var item in nodes)
    {
        // A summary entry without the expected anchor is skipped instead of crashing.
        var aNode = item.SelectSingleNode("span[2]/a");
        string href = aNode?.GetAttributeValue("href", string.Empty);
        if (string.IsNullOrWhiteSpace(href) || caches.Contains(href))
        {
            continue;
        }

        caches.Add(href);
        HtmlDocument document1 = HtmlAdapter.LoadDocument("http://xiaohua.zol.com.cn" + href, encoding);
        var titleNode = document1.DocumentNode.SelectSingleNode(".//h1[@class='article-title']");
        var contentNode = document1.DocumentNode.SelectSingleNode(".//div[@class='article-text']");
        if (titleNode != null && contentNode != null)
        {
            var model = new CrawlItem
            {
                Title = titleNode.InnerText,
                Contents = contentNode.InnerHtml,
                CatalogId = 5,
                AccountId = 1,
            };
            if (model.Title.Length < 8)
            {
                model.Tags = new List<string>() { model.Title };
            }

            list.Add(model);
        }

        // Pause 5 seconds, but only after an article page was actually requested;
        // skipped entries cause no traffic and therefore need no delay.
        Thread.Sleep(5000);
    }

    return list;
}
/// <summary>
/// Validates the current selection and appends a new <c>CrawlItem</c> built from
/// <c>SelectXPath</c>, <c>SelectName</c> and <c>SelectText</c> to <c>CrawlItems</c>.
/// </summary>
/// <remarks>
/// With a non-empty <c>RootXPath</c>, both the root node and the selected node must be found
/// (checked through <c>SafeCheck</c>). The stored XPath is then reduced by the XPath of the
/// root node's parent via <c>XPath.TakeOff</c>. If <c>IsAncestor</c> is false for the selected
/// node, the user is asked (when <paramref name="isAlert"/> is true) whether to add it anyway.
/// </remarks>
/// <param name="isAlert">When true, confirmations and errors are shown in message boxes.</param>
private void AddNewItem(bool isAlert = true)
{
    var path = SelectXPath;
    if (!string.IsNullOrEmpty(RootXPath))
    {
        // TODO: exceptions caused by a malformed XPath expression need to be caught.
        var root = HtmlDoc.DocumentNode.SelectSingleNode(RootXPath);
        if (!(root != null).SafeCheck("使用当前父节点XPath,在文档中找不到任何父节点"))
        {
            return;
        }

        // The reference point is the parent of the matched root node; the node found above
        // is reused rather than evaluating the same XPath a second time.
        root = root?.ParentNode;

        var node = HtmlDoc.DocumentNode.SelectSingleNode(path);
        if (!(node != null).SafeCheck("使用当前子节点XPath,在文档中找不到任何子节点"))
        {
            return;
        }

        var isDescendant = node.IsAncestor(root);
        if (!isDescendant && isAlert
            && MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath,是否依然要添加?", "提示信息",
                   MessageBoxButton.YesNo) != MessageBoxResult.Yes)
        {
            return;
        }

        // The path is made relative for regular descendants as well as for nodes the user
        // explicitly confirmed. Previously only the confirmed non-descendant case was
        // converted, so ordinary descendants kept their absolute path. A non-descendant
        // with alerts disabled keeps its path untouched, as before.
        if (isDescendant || isAlert)
        {
            path = new XPath(node.XPath).TakeOff(root.XPath).ToString();
        }
    }

    // The item captures the name as it is right now, i.e. before the rename below.
    var item = new CrawlItem { XPath = path, Name = SelectName, SampleData1 = SelectText };
    if (CrawlItems.Any(d => d.Name == SelectName))
    {
        // NOTE(review): SelectName is replaced by a generated name, but `item` keeps the
        // duplicate name and the message shows the generated one — confirm this is intended.
        SelectName = "属性" + CrawlItems.Count;
        if (isAlert)
        {
            MessageBox.Show($"已存在名称为{SelectName}的属性,不能重复添加");
            return;
        }
    }

    CrawlItems.Add(item);
    SelectXPath = "";
}
/// <summary>
/// Appends a <c>CrawlItem</c> built from <c>SelectXPath</c>, <c>SelectName</c> and
/// <c>SelectText</c> to <c>CrawlItems</c> and clears <c>SelectXPath</c>.
/// </summary>
/// <remarks>
/// If an item with the current <c>SelectName</c> already exists, <c>SelectName</c> is replaced
/// by a generated name. With <paramref name="isAlert"/> set, a message is shown and nothing
/// is added; without it, the item (still carrying the name it was created with) is added.
/// </remarks>
/// <param name="isAlert">When true, a duplicate name is reported and aborts the add.</param>
private void AddNewItem(bool isAlert = true)
{
    // Built first on purpose: the item keeps the name that was selected at this moment.
    var candidate = new CrawlItem
    {
        XPath = SelectXPath,
        Name = SelectName,
        SampleData1 = SelectText
    };

    var nameTaken = CrawlItems.Any(existing => existing.Name == SelectName);
    if (nameTaken)
    {
        SelectName = "属性" + CrawlItems.Count;

        if (isAlert)
        {
            MessageBox.Show($"已存在名称为{SelectName}的属性,不能重复添加");
            return;
        }
    }

    CrawlItems.Add(candidate);
    SelectXPath = "";
}
/// <summary>
/// Validates the current selection and, if accepted, appends a new <c>CrawlItem</c> built
/// from <c>SelectXPath</c>, <c>SelectName</c>, <c>SelectText</c> and <c>SearchFormat</c> to
/// <c>CrawlItems</c>.
/// </summary>
/// <remarks>
/// With a non-empty <c>RootXPath</c> the root node (using <c>RootFormat</c>) and the selected
/// node (using <c>SearchFormat</c>) are looked up in <c>HtmlDoc</c>; the method returns early
/// when a <c>SafeCheck</c> / <c>SafeInvoke</c> call reports failure. For
/// <c>SelectorFormat.XPath</c> the stored path becomes the result of <c>XPath.TakeOffPlus</c>
/// against the root node's parent, with an attribute step appended when
/// <c>XPathAnalyzer.GetAttribute</c> yields a non-empty attribute. User-facing texts come
/// from <c>GlobalHelper.Get</c>.
/// </remarks>
/// <param name="isAlert">
/// When true, the user is asked to confirm adding a node for which <c>IsAncestor</c> is false.
/// </param>
private void AddNewItem(bool isAlert = true)
{
    var itemPath = SelectXPath;
    var rootSelector = RootXPath;

    if (!string.IsNullOrEmpty(rootSelector))
    {
        // TODO: exceptions caused by a malformed XPath expression need to be caught.
        HtmlNode rootNode = null;
        try
        {
            rootNode = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootSelector, RootFormat);
        }
        catch (Exception)
        {
            XLogSys.Print.Error(string.Format(GlobalHelper.Get("key_662"), RootXPath, RootFormat));
        }

        var rootFound = rootNode != null;
        if (!rootFound.SafeCheck(string.Format(GlobalHelper.Get("key_663"), RootFormat, RootXPath)))
        {
            return;
        }

        // From here on the reference point is the PARENT of the node matched by the root selector.
        rootNode = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootSelector, RootFormat)?.ParentNode;

        HtmlNode selectedNode = null;
        var lookupSucceeded = ControlExtended.SafeInvoke(
            () => HtmlDoc.DocumentNode.SelectSingleNodePlus(itemPath, SearchFormat),
            ref selectedNode,
            LogType.Info,
            GlobalHelper.Get("key_664"),
            true);
        if (!lookupSucceeded)
        {
            return;
        }

        var nodeFound = selectedNode != null;
        if (!nodeFound.SafeCheck(GlobalHelper.Get("key_665")))
        {
            return;
        }

        if (!selectedNode.IsAncestor(rootNode)
            && isAlert
            && MessageBox.Show(GlobalHelper.Get("key_666"), GlobalHelper.Get("key_99"),
                   MessageBoxButton.YesNo) == MessageBoxResult.No)
        {
            return;
        }

        // The attribute is parsed from the path as selected, before the path is rewritten.
        string attributeName;
        string attributeValue;
        XPathAnalyzer.GetAttribute(itemPath, out attributeName, out attributeValue);

        if (SearchFormat == SelectorFormat.XPath)
        {
            itemPath = XPath.TakeOffPlus(selectedNode.XPath, rootNode.XPath);
            if (attributeName != "")
            {
                itemPath = string.Concat(itemPath, "/@", attributeName, "[1]");
            }
        }
    }

    // A duplicate name is only a blocker when the user declines the confirmation.
    var nameAlreadyUsed = CrawlItems.FirstOrDefault(d => d.Name == SelectName) != null;
    if (nameAlreadyUsed
        && MessageBox.Show(GlobalHelper.Get("add_column_sure"), GlobalHelper.Get("key_99"),
            MessageBoxButton.OKCancel) != MessageBoxResult.OK)
    {
        return;
    }

    var newItem = new CrawlItem { XPath = itemPath, Name = SelectName, SampleData1 = SelectText };
    newItem.Format = SearchFormat;
    CrawlItems.Add(newItem);

    SelectXPath = "";
    SelectName = "";
    XLogSys.Print.Info(GlobalHelper.Get("key_668"));
}
/// <summary>
/// Validates the current selection and, if accepted, appends a new <c>CrawlItem</c> built
/// from <c>SelectXPath</c>, <c>SelectName</c>, <c>SelectText</c> and <c>SearchFormat</c> to
/// <c>CrawlItems</c>.
/// </summary>
/// <remarks>
/// With a non-empty <c>RootXPath</c> the root node (using <c>RootFormat</c>) and the selected
/// node (using <c>SearchFormat</c>) are looked up in <c>HtmlDoc</c>; the method returns early
/// when a <c>SafeCheck</c> / <c>SafeInvoke</c> call reports failure. For
/// <c>SelectorFormat.XPath</c> the stored path becomes the result of <c>XPath.TakeOffPlus</c>
/// against the root node's parent, with an attribute step appended when
/// <c>XPathAnalyzer.GetAttribute</c> yields a non-empty attribute.
/// </remarks>
/// <param name="isAlert">
/// When true, the user is asked to confirm adding a node for which <c>IsAncestor</c> is false.
/// </param>
private void AddNewItem(bool isAlert = true)
{
    var itemPath = SelectXPath;
    var rootSelector = RootXPath;

    if (!string.IsNullOrEmpty(rootSelector))
    {
        // TODO: exceptions caused by a malformed XPath expression need to be caught.
        HtmlNode rootNode = null;
        try
        {
            rootNode = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootSelector, RootFormat);
        }
        catch (Exception)
        {
            XLogSys.Print.Error($"{RootXPath} 不能被识别为正确的{RootFormat}表达式,请检查");
        }

        var rootFound = rootNode != null;
        if (!rootFound.SafeCheck($"使用当前父节点{RootFormat} {RootXPath},在文档中找不到任何父节点"))
        {
            return;
        }

        // From here on the reference point is the PARENT of the node matched by the root selector.
        rootNode = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootSelector, RootFormat)?.ParentNode;

        HtmlNode selectedNode = null;
        var lookupSucceeded = ControlExtended.SafeInvoke(
            () => HtmlDoc.DocumentNode.SelectSingleNodePlus(itemPath, SearchFormat),
            ref selectedNode,
            LogType.Info,
            "检查子节点XPath正确性",
            true);
        if (!lookupSucceeded)
        {
            return;
        }

        var nodeFound = selectedNode != null;
        if (!nodeFound.SafeCheck("使用当前子节点XPath,在文档中找不到任何子节点"))
        {
            return;
        }

        if (!selectedNode.IsAncestor(rootNode)
            && isAlert
            && MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath,是否依然要添加?", "提示信息",
                   MessageBoxButton.YesNo) == MessageBoxResult.No)
        {
            return;
        }

        // The attribute is parsed from the path as selected, before the path is rewritten.
        string attributeName;
        string attributeValue;
        XPathAnalyzer.GetAttribute(itemPath, out attributeName, out attributeValue);

        if (SearchFormat == SelectorFormat.XPath)
        {
            itemPath = XPath.TakeOffPlus(selectedNode.XPath, rootNode.XPath);
            if (attributeName != "")
            {
                itemPath = string.Concat(itemPath, "/@", attributeName, "[1]");
            }
        }
    }

    // A duplicate name is only a blocker when the user declines the confirmation.
    var nameAlreadyUsed = CrawlItems.FirstOrDefault(d => d.Name == SelectName) != null;
    if (nameAlreadyUsed
        && MessageBox.Show("已经存在同名的属性,是否依然添加?", "提示信息", MessageBoxButton.OKCancel) !=
        MessageBoxResult.OK)
    {
        return;
    }

    var newItem = new CrawlItem { XPath = itemPath, Name = SelectName, SampleData1 = SelectText };
    newItem.Format = SearchFormat;
    CrawlItems.Add(newItem);

    SelectXPath = "";
    SelectName = "";
    XLogSys.Print.Info("成功添加属性");
}