/// <summary>
/// Fetches the content for <paramref name="url"/> through <c>helper.GetHtml</c> and passes
/// it through <c>JavaScriptAnalyzer.Json2XML</c> before returning it.
/// </summary>
/// <remarks>
/// Before the request is made, every match of the <c>extract</c> pattern inside
/// <paramref name="url"/> is looked up (by capture group 1) in the parameter dictionary
/// produced by <c>XPathAnalyzer.ParseUrl(URL)</c>; when the key exists, the whole match
/// is replaced by the stored value. Note that the dictionary is built from the member
/// <c>URL</c>, not from the <paramref name="url"/> argument.
/// </remarks>
/// <param name="url">Address to request; may contain placeholders matched by <c>extract</c>.</param>
/// <param name="code">Receives the HTTP status code reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Optional payload forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>The value returned by <c>JavaScriptAnalyzer.Json2XML</c> for the fetched content.</returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    // The parameter dictionary is built lazily: only once a placeholder is actually found.
    Dictionary<string, string> urlParameters = null;
    foreach (Match placeholder in extract.Matches(url))
    {
        if (urlParameters == null)
        {
            urlParameters = XPathAnalyzer.ParseUrl(URL);
            if (urlParameters == null)
            {
                // Nothing to substitute with; leave the remaining placeholders untouched.
                break;
            }
        }

        string replacement;
        if (urlParameters.TryGetValue(placeholder.Groups[1].Value, out replacement))
        {
            url = url.Replace(placeholder.Groups[0].Value, replacement);
        }
    }

    WebHeaderCollection responseHeaders;
    var body = helper.GetHtml(Http, out responseHeaders, out code, url, post);

    // The "is JSON" flag is required by the signature but not needed by this method.
    bool isJson;
    return JavaScriptAnalyzer.Json2XML(body, out isJson, IsSuperMode);
}
/// <summary>
/// Loads the most recently seen JSON sample (<c>lastData</c>), converted by
/// <c>JavaScriptAnalyzer.Json2XML</c>, into the crawler named by <c>CrawlerSelector</c>,
/// offering to create that crawler when <c>GetCrawler</c> does not find it.
/// </summary>
/// <remarks>
/// The method returns without doing anything when the crawler name is blank, when the
/// sample is not reported as real JSON, or when the user declines to create a missing
/// crawler. NOTE(review): this relies on <c>SafeCheck</c> returning the checked condition
/// (as the JSON check below already assumes) — confirm against its definition.
/// </remarks>
private void Design()
{
    // The result of this check used to be ignored, so a blank name fell through to the
    // lookup / "create crawler" prompt below.
    if (!(!string.IsNullOrWhiteSpace(CrawlerSelector)).SafeCheck("采集器名称不能为空"))
    {
        return;
    }

    bool isRealJson;
    var newhtml = JavaScriptAnalyzer.Json2XML(lastData, out isRealJson, true);
    if (!isRealJson.SafeCheck("只有标准json格式才能启用采集器设计"))
    {
        return;
    }

    var selector = GetCrawler(CrawlerSelector);
    if (selector == null)
    {
        var answer = MessageBox.Show($"是否要创建名为{CrawlerSelector}的网页采集器?", "提示信息", MessageBoxButton.OKCancel);
        if (answer != MessageBoxResult.OK)
        {
            return;
        }

        var crawler = new SmartCrawler();
        crawler.Name = CrawlerSelector;
        processManager.CurrentProcessCollections.Add(crawler);
        selector = crawler;
    }

    // Bringing the crawler's view to the front is skipped when the main frame does not
    // implement IDockableManager, instead of failing with a NullReferenceException.
    var dockManager = MainDescription.MainFrm as IDockableManager;
    if (dockManager != null)
    {
        dockManager.ActiveThisContent(CrawlerSelector);
    }

    selector.URLHTML = newhtml;
    selector.HtmlDoc.LoadHtml(newhtml);
    selector.enableRefresh = false;
}
/// <summary>
/// Reads the text stored in <c>Column</c> of <paramref name="datas"/> and writes the
/// parsed result back into the same document.
/// </summary>
/// <remarks>
/// <para>When <c>lastData</c> is still unset, the first value that
/// <c>JavaScriptAnalyzer.Json2XML</c> reports as real JSON is remembered in it.</para>
/// <para>When <c>crawlerEnabled</c> is set, the converted text is handed to
/// <c>selector.CrawlHtmlData</c> and the first resulting document (if any) is copied
/// into <paramref name="datas"/>.</para>
/// <para>Otherwise the text is deserialized with <c>serialier</c>; on failure the exception
/// message is stored via <c>SetValue</c>. In <c>ScriptWorkMode.单文档</c> the deserialized
/// object is converted with <c>ScriptHelper.ToDocument</c> and copied into
/// <paramref name="datas"/>; in any other mode it is stored via <c>SetValue</c>.</para>
/// </remarks>
/// <param name="datas">Document to read from and write the result into.</param>
/// <returns>Always <c>null</c>; results are written into <paramref name="datas"/>.</returns>
public override object TransformData(IFreeDocument datas)
{
    var item = datas[Column];
    var itemstr = item == null ? null : item.ToString();
    if (string.IsNullOrWhiteSpace(itemstr))
    {
        return null;
    }

    bool isrealjson;
    if (lastData == null)
    {
        // Only the "is real JSON" flag matters here; the converted text is not needed.
        JavaScriptAnalyzer.Json2XML(itemstr, out isrealjson, true);
        if (isrealjson)
        {
            lastData = itemstr;
        }
    }

    if (crawlerEnabled)
    {
        var html = JavaScriptAnalyzer.Json2XML(itemstr, out isrealjson, true);
        if (isrealjson)
        {
            HtmlDocument htmldoc;
            var doc = selector.CrawlHtmlData(html, out htmldoc).FirstOrDefault();

            // FirstOrDefault yields null when the crawler extracted nothing.
            if (doc != null)
            {
                doc.DictCopyTo(datas);
            }
        }

        return null;
    }

    dynamic d = null;
    try
    {
        d = serialier.DeserializeObject(itemstr);
    }
    catch (Exception ex)
    {
        // Best effort: surface the parse error in the output column instead of failing.
        SetValue(datas, ex.Message);
        return null;
    }

    if (ScriptWorkMode == ScriptWorkMode.单文档)
    {
        // The conversion result may not be a FreeDocument; in that case nothing is copied.
        var newdoc = ScriptHelper.ToDocument(d) as FreeDocument;
        if (newdoc != null)
        {
            newdoc.DictCopyTo(datas);
        }
    }
    else
    {
        SetValue(datas, d);
    }

    return null;
}
/// <summary>
/// Expands each input document into zero or more output documents parsed from the text
/// stored in its <c>Column</c> value.
/// </summary>
/// <remarks>
/// <para>Documents whose column value is null or an empty string are skipped. Every other
/// value is remembered in <c>lastData</c>.</para>
/// <para>When <c>crawlerEnabled</c> is set and <c>JavaScriptAnalyzer.Json2XML</c> reports
/// real JSON, each document produced by <c>selector.CrawlHtmlData</c> is merged with the
/// input document via <c>MergeQuery</c> and yielded.</para>
/// <para>Otherwise the text is deserialized with <c>serialier</c>; inputs that fail to
/// deserialize are skipped. Each <c>FreeDocument</c> produced by
/// <c>ScriptHelper.ToDocuments</c> is merged with the input document and yielded.</para>
/// </remarks>
/// <param name="datas">Input documents to expand.</param>
/// <param name="analyzer">Not used by this method.</param>
/// <returns>A lazily evaluated sequence of merged documents.</returns>
public override IEnumerable<IFreeDocument> TransformManyData(IEnumerable<IFreeDocument> datas, AnalyzeItem analyzer)
{
    foreach (var data in datas)
    {
        // A missing column value is treated like an empty one instead of throwing.
        var raw = data[Column];
        var itemstr = raw == null ? null : raw.ToString();
        if (string.IsNullOrEmpty(itemstr))
        {
            continue;
        }

        lastData = itemstr;

        if (crawlerEnabled)
        {
            bool isrealjson;
            var html = JavaScriptAnalyzer.Json2XML(itemstr, out isrealjson, true);
            if (isrealjson)
            {
                HtmlDocument htmldoc;
                foreach (var crawled in selector.CrawlHtmlData(html, out htmldoc))
                {
                    yield return crawled.MergeQuery(data, NewColumn);
                }
            }

            continue;
        }

        dynamic d = null;
        try
        {
            d = serialier.DeserializeObject(itemstr);
        }
        catch (Exception)
        {
            // Best effort: an input that cannot be deserialized produces no output.
            continue;
        }

        foreach (var entry in ScriptHelper.ToDocuments(d))
        {
            // Entries that are not FreeDocument instances cannot be merged and are skipped.
            var parsed = entry as FreeDocument;
            if (parsed == null)
            {
                continue;
            }

            yield return parsed.MergeQuery(data, NewColumn);
        }
    }
}