/// <summary>
/// Downloads a listing page, extracts its <c>li.clearfix</c> entries, then follows the
/// paging links found in the <c>div.review-menu.clearfix</c> block (indexes 1 to 3 only)
/// and extracts the same kind of entries from each of those pages.
/// </summary>
/// <remarks>
/// The extracted entries are currently discarded: the saveMessage calls that would
/// consume them are disabled.
/// </remarks>
/// <param name="node">Not read by this method.</param>
/// <param name="url">Address of the first page to download.</param>
/// <param name="routeId">Only referenced by the disabled saveMessage calls.</param>
private void paserData(INode node, string url, int routeId)
{
    const string SiteRoot = "http://www.yododo.com";
    const string Encoding = "utf-8";
    const int LastPagerIndex = 3;

    Parser pageParser = Parser.CreateParser(GetHtmlStr(url), Encoding);

    NodeFilter entryFilter = new AndFilter(
        new TagNameFilter("li"),
        new HasAttributeFilter("class", "clearfix"));
    NodeList entries = pageParser.Parse(entryFilter);
    // saveMessage(entries, routeId) is disabled, so the entries are not persisted.

    // Re-read the same page, this time looking for the pager block.
    pageParser.Reset();
    NodeFilter pagerFilter = new AndFilter(
        new TagNameFilter("div"),
        new HasAttributeFilter("class", "review-menu clearfix"));
    NodeList pagerBlocks = pageParser.Parse(pagerFilter);

    // Parse the pager markup on its own to collect its anchors.
    pageParser = Parser.CreateParser(pagerBlocks.ToHtml(), Encoding);
    NodeFilter anchorFilter = new NodeClassFilter(typeof(ATag));
    NodeList pagerLinks = pageParser.Parse(anchorFilter);

    // Index 0 is skipped and nothing beyond index 3 is visited.
    for (int linkIndex = 1; linkIndex <= LastPagerIndex && linkIndex < pagerLinks.Count; linkIndex++)
    {
        string nextPageUrl = SiteRoot + getTag(pagerLinks[linkIndex]).GetAttribute("href");
        pageParser = Parser.CreateParser(GetHtmlStr(nextPageUrl), Encoding);
        entries = pageParser.Parse(entryFilter);
        // saveMessage(entries, routeId) is disabled here as well.
    }
}
/// <summary>
/// Rebuilds <c>m_Links</c> with one <c>LinkData</c> entry per anchor tag found by the parser.
/// </summary>
/// <returns>
/// <c>LinkStatus.Broken</c> when the parser's page has no source; otherwise <c>LinkStatus.Ok</c>
/// (also when the parse yields no node list, in which case <c>m_Links</c> stays empty).
/// </returns>
/// <exception cref="ArgumentException">No URL has been specified (null or empty).</exception>
public LinkStatus ExtractLinks()
{
    // A null URL is as unusable as an empty one, so reject both before creating the parser.
    if (String.IsNullOrEmpty(m_strUrl))
    {
        throw new ArgumentException("No URL specified");
    }

    m_Links = new LinkDataCollection();

    // NOTE(review): presumably (re)initialises m_obParser for m_strUrl - confirm against CreateParser.
    CreateParser();
    if (m_obParser.Lexer.Page.mSource == null)
    {
        return LinkStatus.Broken;
    }

    NodeFilter obFilter = new NodeClassFilter(typeof(ATag));
    NodeList collNodes = m_obParser.Parse(obFilter);
    if (null != collNodes)
    {
        // Every link found on this page shares the same page descriptor.
        PageData obPageData = new PageData();
        obPageData.m_strUrl = m_obParser.URL;
        obPageData.m_iDepth = m_iLevel;

        for (Int32 i = 0; i < collNodes.Count; i++)
        {
            INode obNode = collNodes[i];
            LinkData obLinkData = new LinkData(obPageData, obNode as ATag);
            m_Links.Add(obLinkData);
        }
    }

    return LinkStatus.Ok;
}
/// <summary>Searches the given node and gathers every node of the requested type.</summary>
/// <param name="node">The node to search.</param>
/// <param name="type">The node class to look for.</param>
/// <returns>An array holding the matching nodes.</returns>
public static INode[] FindTypeInNode(INode node, System.Type type)
{
    NodeList matches = new NodeList();
    INodeFilter typeFilter = new NodeClassFilter(type);

    node.CollectInto(matches, typeFilter);

    return matches.ToNodeArray();
}
/// <summary>
/// Builds the service-layer browse request from this instance.
/// </summary>
/// <returns>
/// A new <c>BrowseRequestModel</c> carrying this instance's values; the view and header are
/// converted with their own <c>ToServiceModel</c> and the node class filter is copied to a
/// list, each only when present.
/// </returns>
public BrowseRequestModel ToServiceModel()
{
    var model = new BrowseRequestModel();

    model.NodeIdsOnly = NodeIdsOnly;
    model.NodeId = NodeId;
    model.MaxReferencesToReturn = MaxReferencesToReturn;
    model.Direction = Direction;
    model.View = View?.ToServiceModel();
    model.NodeClassFilter = NodeClassFilter?.ToList();
    model.ReferenceTypeId = ReferenceTypeId;
    model.TargetNodesOnly = TargetNodesOnly;
    model.ReadVariableValues = ReadVariableValues;
    model.NoSubtypes = NoSubtypes;
    model.Header = Header?.ToServiceModel();

    return model;
}
/// <summary>
/// Initialises the static node filters used to pick the individual pieces of a listing
/// out of the parsed page.
/// </summary>
private static void MakeFilters()
{
    // Listing container: definition lists that sit below the element with id "searchList".
    NodeClassFilter definitionLists = new NodeClassFilter(typeof(DefinitionList));
    HasAttributeFilter searchListElement = new HasAttributeFilter("id", "searchList");
    poiListFilter = new AndFilter(new HasParentFilter(searchListElement, false), definitionLists);

    // A single listing entry.
    poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));

    // Score elements, selected by CSS class.
    tasteFilter = new HasAttributeFilter("class", "score1");
    environmentFilter = new HasAttributeFilter("class", "score2");
    serviceFilter = new HasAttributeFilter("class", "score3");
    averageFilter = new HasAttributeFilter("class", "average");

    // Review element: class "B" combined with module "list-readreview".
    commentFilter = new AndFilter(
        new HasAttributeFilter("class", "B"),
        new HasAttributeFilter("module", "list-readreview"));

    // Name element: class "BL" below an element of class "shopname".
    HasAttributeFilter shopNameElement = new HasAttributeFilter("class", "shopname");
    nameFilter = new AndFilter(
        new HasParentFilter(shopNameElement, false),
        new HasAttributeFilter("class", "BL"));

    addressFilter = new HasAttributeFilter("class", "address");
    tagsFilter = new HasAttributeFilter("class", "tags");
}
/// <summary>
/// Click handler: takes the HTML of the page currently shown in <c>webBox</c> (or the canned
/// <c>s_htmlFake</c> when <c>IsOffline</c> is "1"), runs every table row through
/// <c>parserTR</c> and shows the accumulated <c>parseResult</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void BtnDownload_Click(object sender, RoutedEventArgs e)
{
    this.parseResult = "";

    // The HTML is read from the WebBrowser control rather than downloaded separately,
    // so any authorisation has already been performed by the user inside the browser.
    string html;
    if (string.Equals(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase))
    {
        html = s_htmlFake;
    }
    else
    {
        mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)webBox.Document;

        // No document or body yet (nothing navigated / still loading): treat as "no HTML"
        // instead of failing with a NullReferenceException.
        html = (doc2 != null && doc2.body != null) ? doc2.body.innerHTML : null;
    }

    Debug.WriteLine(html);

    // Nothing to parse means no matching rows; report it the same way as an empty parse result.
    if (string.IsNullOrEmpty(html))
    {
        MessageBox.Show("没有符合要求的节点");
        return;
    }

    // Extract every table row with HtmlParser.
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter filter = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
    NodeList nodeList = parser.Parse(filter);

    if (nodeList.Count == 0)
    {
        MessageBox.Show("没有符合要求的节点");
        return;
    }

    for (int i = 0; i < nodeList.Count; i++)
    {
        parserTR(nodeList[i]);
    }

    MessageBox.Show(parseResult);
}
/// <summary>
/// Collects every link tag contained in the page.
/// </summary>
/// <returns>The nodes produced by parsing with a <c>LinkTag</c> class filter.</returns>
public NodeList GetAllOutLinks()
{
    INodeFilter linkTagFilter = new NodeClassFilter(typeof(LinkTag));
    NodeList outLinks = this.Parse(linkTagFilter);
    return outLinks;
}
/// <summary>
/// Parses the invoice table rows out of the page HTML, converts the valid rows into
/// <c>ImportInvoiceDTO</c> items and hands them to <c>CallWS</c>.
/// </summary>
/// <remarks>
/// This method has no browser document to read from, so only offline mode
/// (<c>IsOffline</c> equal to "1", which uses <c>s_htmlFake</c>) can work. Previously any
/// other setting dereferenced a null document and failed with a NullReferenceException;
/// it now fails with an explicit <see cref="InvalidOperationException"/>.
/// </remarks>
/// <exception cref="InvalidOperationException"><c>IsOffline</c> is not "1".</exception>
public void StartCrawl()
{
    List<ImportInvoiceDTO> list = new List<ImportInvoiceDTO>();
    List<hParser.Tags.TableRow> validRowList = new List<hParser.Tags.TableRow>();
    this.parseResult = "";

    // Never read afterwards, but constructing it rejects a null or malformed TargetUri up front.
    Uri uri = new Uri(this.TargetUri);

    if (!string.Equals(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase))
    {
        throw new InvalidOperationException(
            "StartCrawl has no browser document to read from; only offline mode (IsOffline == \"1\") is supported.");
    }

    string html = s_htmlFake;
    Debug.WriteLine(html);

    // Extract every table row with HtmlParser.
    Lexer lexer = new Lexer(html);
    hParser.Parser parser = new hParser.Parser(lexer);
    hParser.NodeFilter filter = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
    NodeList nodeList = parser.Parse(filter);

    if (nodeList.Count == 0)
    {
        MessageBox.Show("没有符合要求的节点");
    }
    else
    {
        for (int i = 0; i < nodeList.Count; i++)
        {
            // Scrape one row; rows for which parserTR returns null are skipped.
            var tagTR = parserTR(nodeList[i]);
            if (tagTR != null)
            {
                validRowList.Add(tagTR);
            }
        }

        parserValidTR(validRowList, ref list);
    }

    // Nothing usable on this page: tell the user and stop before calling the service.
    if (list == null || list.Count == 0)
    {
        MessageBox.Show("该页面上没有检测到预期数据");
        return;
    }

    ImportInvoiceListDTO soap = new ImportInvoiceListDTO
    {
        List = list,
        Result = new ImportInvoiceResultDTO
        {
            Message = "CALLBACK",
            Status = 9
        }
    };

    CallWS(soap);
    MessageBox.Show("本页已同步完成,请点击下一页继续同步");
}
/// <summary>
/// Walks the area navigation of the yododo.com question list: for every area link in the
/// first navigation entry (except "全部" and "中国") it opens the area page, and for every
/// sub-link there whose text matches a level-3 route class it calls <c>paserData</c> on the
/// sub-link's "s1" listing. Prints "success" when the walk completes.
/// </summary>
/// <param name="context">HTTP context passed through to <c>Print</c>.</param>
private void GrapBaiduMsg(HttpContext context)
{
    const string SiteRoot = "http://www.yododo.com";
    const string Encoding = "utf-8";

    string listPageHtml = GetHtmlStr("http://www.yododo.com/ask/list/");

    // Route classes that sub-link captions are matched against.
    ClassLibrary.BLL.RouteClass routeClassBll = new ClassLibrary.BLL.RouteClass();
    List<ClassLibrary.Model.RouteClass> routeClasses = routeClassBll.GetModelList("classLevel = 3");

    // Navigation list of the landing page.
    Parser parser = Parser.CreateParser(listPageHtml, Encoding);
    NodeFilter areaListFilter = new AndFilter(
        new TagNameFilter("ul"),
        new HasAttributeFilter("class", "miniarea-list clearfix"));
    NodeList areaLists = parser.Parse(areaListFilter);
    string areaListHtml = areaLists[0].ToHtml();

    // Only the first <li> of that list is processed.
    parser = Parser.CreateParser(areaListHtml, Encoding);
    NodeFilter listItemFilter = new TagNameFilter("li");
    NodeList listItems = parser.Parse(listItemFilter);
    string firstItemHtml = listItems[0].ToHtml();

    NodeFilter anchorFilter = new NodeClassFilter(typeof(ATag));

    parser = Parser.CreateParser(firstItemHtml, Encoding);
    NodeList areaAnchors = parser.Parse(anchorFilter);

    for (int areaIndex = 0; areaIndex < areaAnchors.Count; areaIndex++)
    {
        ITag areaTag = getTag(areaAnchors[areaIndex]);
        string areaUrl = SiteRoot + areaTag.GetAttribute("href");
        string areaName = areaTag.ToPlainTextString();

        // The "all" and "China" entries are not crawled.
        if (areaName != "全部" && areaName != "中国")
        {
            // The area page carries its own navigation list holding the sub-links.
            parser = Parser.CreateParser(GetHtmlStr(areaUrl), Encoding);
            NodeList subLists = parser.Parse(areaListFilter);
            string subListHtml = subLists[0].ToHtml();

            parser = Parser.CreateParser(subListHtml, Encoding);
            NodeList subAnchors = parser.Parse(anchorFilter);

            for (int subIndex = 0; subIndex < subAnchors.Count; subIndex++)
            {
                ITag subTag = getTag(subAnchors[subIndex]);
                // The "s1" suffix selects the listing of resolved questions.
                string listingUrl = SiteRoot + subTag.GetAttribute("href") + "s1";
                string subName = subTag.ToPlainTextString();

                if (subName != "全部")
                {
                    ClassLibrary.Model.RouteClass matchedClass = routeClasses.Find(
                        delegate(ClassLibrary.Model.RouteClass candidate)
                        {
                            return candidate.ClassName == subName;
                        });

                    // Sub-links without a matching route class are ignored.
                    if (matchedClass != null)
                    {
                        paserData(subAnchors[subIndex], listingUrl, matchedClass.ID);
                    }
                }
            }
        }
    }

    Print(context, "success");
}
/// <summary>
/// Scrapes the invoice rows of the page currently shown in the browser, converts the valid
/// rows into <c>ImportInvoiceDTO</c> items, optionally logs them (<c>IfLog</c> == "1") and
/// optionally sends them to the web service (<c>IfCallWS</c> == "1").
/// </summary>
/// <remarks>
/// Online, the rows come from the body section of the table with id "example1" (when
/// <paramref name="IsUnConfirmChecked"/> is true) or "example". Offline there is no browser
/// document, so the canned <c>s_htmlFake</c> is parsed directly; previously offline mode
/// dereferenced a null document and always failed with a NullReferenceException.
/// </remarks>
/// <param name="wb">Browser whose current document is scraped; not touched when offline.</param>
/// <param name="isOffline">True to use the canned HTML instead of the browser document.</param>
/// <param name="IsUnConfirmChecked">Selects the source table and the log line layout; also passed to <c>parserValidTR</c>.</param>
/// <param name="hasValidData">Set to false when the page yields no usable data; never set to true here.</param>
private void CrawlCurrentPage(WebBrowser wb, bool isOffline, bool IsUnConfirmChecked, ref bool hasValidData)
{
    List<ImportInvoiceDTO> list = new List<ImportInvoiceDTO>();
    List<hParser.Tags.TableRow> validRowList = new List<hParser.Tags.TableRow>();

    // HTML that is handed to HtmlParser for row extraction.
    string rowsHtml;

    if (isOffline)
    {
        // No DOM is available offline: parse the canned page as a whole.
        rowsHtml = s_htmlFake;
        Debug.WriteLine(rowsHtml);
    }
    else
    {
        mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)wb.Document;

        // Nothing loaded yet (no document or no body): there is no data to scrape.
        if (doc2 == null || doc2.body == null)
        {
            hasValidData = false;
            return;
        }

        Debug.WriteLine(doc2.body.innerHTML);

        // Locate the result table through the browser DOM.
        mshtml.HTMLTableClass table =
            (mshtml.HTMLTableClass)doc2.all.item(IsUnConfirmChecked ? "example1" : "example", 0);
        if (table == null)
        {
            hasValidData = false;
            return;
        }

        mshtml.HTMLTableSectionClass tbody = (mshtml.HTMLTableSectionClass)table.lastChild;
        if (tbody == null)
        {
            hasValidData = false;
            return;
        }

        var tbodyHtml = tbody.innerHTML;

        // The page shows this placeholder text when the table has no records.
        if (0 == string.Compare(tbody.innerText, "没找到记录", StringComparison.InvariantCultureIgnoreCase))
        {
            hasValidData = false;
            return;
        }

#if RESEARCH
        // Research notes on driving DOM elements from the WPF WebBrowser; compiled only when
        // RESEARCH is defined, in which case the method returns at the end of this section.

        // Read / write elements.
        mshtml.IHTMLElement login_pass = (mshtml.IHTMLElement)doc2.all.item("login_pass", 0);
        mshtml.IHTMLElement password = (mshtml.IHTMLElement)doc2.all.item("password", 0);
        password.setAttribute("value", "12345678");
        login_pass.setAttribute("style", "");
        mshtml.IHTMLElement login_pass1 = (mshtml.IHTMLElement)doc2.all.item("login_pass1", 0);
        mshtml.IHTMLElement password1 = (mshtml.IHTMLElement)doc2.all.item("password1", 0);
        login_pass1.setAttribute("style", "display:none;");

        // Trigger an event: click the confirm button.
        loginBT.click();

        // Run JavaScript inside the page.
        mshtml.IHTMLWindow2 win = (mshtml.IHTMLWindow2)doc2.parentWindow;
        win.execScript("Login('12345678', '', 1)", "javascript");
        return;
#endif

        rowsHtml = tbodyHtml;
    }

    // Extract the table rows with HtmlParser.
    Lexer lexer = new Lexer(rowsHtml);
    hParser.Parser parser = new hParser.Parser(lexer);
    hParser.NodeFilter filter = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
    NodeList nodeList = parser.Parse(filter);

    if (nodeList.Count == 0)
    {
        hasValidData = false;
        MessageBox.Show("没有符合要求的节点");
    }
    else
    {
        for (int i = 0; i < nodeList.Count; i++)
        {
            // Scrape one row; rows for which parserTR returns null are skipped.
            var tagTR = parserTR(nodeList[i]);
            if (tagTR != null)
            {
                validRowList.Add(tagTR);
            }
        }

        parserValidTR(validRowList, IsUnConfirmChecked, ref list);
    }

    // Logging, export and persistence.
    // NOTE(review): unlike StartCrawl, an empty result does not return here, so the steps
    // below still run with an empty list - confirm that this is intended.
    if (list == null || list.Count == 0)
    {
        MessageBox.Show("该页面上没有检测到预期数据");
        hasValidData = false;
    }

    ImportInvoiceListDTO soap = new ImportInvoiceListDTO
    {
        List = list,
        Result = new ImportInvoiceResultDTO
        {
            Message = "CALLBACK",
            Status = 9
        }
    };
    Debug.Write(soap);

    if (this.IfLog == "1")
    {
        soap.List.ForEach(impinfo =>
        {
            if (IsUnConfirmChecked)
            {
                LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 勾选标志{8} 操作时间{9}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag, impinfo.ChosenTime));
            }
            else
            {
                LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 确认月份{8}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag));
            }
        });
    }

    if (this.IfCallWS == "1")
    {
        CallWS(soap);
    }

    Debug.Write("本页已同步完成,请点击下一页继续同步");
}