Ejemplo n.º 1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and parses its "clearfix" list items, then
        /// follows at most three of the pager links found inside the "review-menu clearfix" block
        /// and parses each of those pages with the same filter.
        /// </summary>
        /// <param name="node">Not read by this method.</param>
        /// <param name="url">Address of the first page to download.</param>
        /// <param name="routeId">Only referenced by the currently disabled saveMessage calls.</param>
        private void paserData(INode node, string url, int routeId)
        {
            const string siteRoot = "http://www.yododo.com";

            // First page: collect the <li class="clearfix"> entries.
            Parser     parser     = Parser.CreateParser(GetHtmlStr(url), "utf-8");
            NodeFilter itemFilter = new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "clearfix"));
            NodeList   items      = parser.Parse(itemFilter);
            // saveMessage(items, routeId);  -- persisting the entries is switched off

            // Re-scan the same page for the pager container.
            parser.Reset();
            NodeFilter pagerFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "review-menu clearfix"));
            NodeList   pagerBlocks = parser.Parse(pagerFilter);

            // Pull every anchor out of the pager markup.
            parser = Parser.CreateParser(pagerBlocks.ToHtml(), "utf-8");
            NodeFilter anchorFilter = new NodeClassFilter(typeof(ATag));
            NodeList   pagerLinks   = parser.Parse(anchorFilter);

            // The anchor at index 0 is skipped; indexes 1..3 are visited when present.
            for (int page = 1; page < pagerLinks.Count && page <= 3; page++)
            {
                string pageUrl = siteRoot + getTag(pagerLinks[page]).GetAttribute("href");

                parser = Parser.CreateParser(GetHtmlStr(pageUrl), "utf-8");
                items  = parser.Parse(itemFilter);
                // saveMessage(items, routeId);  -- persisting the entries is switched off
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses the configured page and stores one <c>LinkData</c> per anchor tag in <c>m_Links</c>.
        /// </summary>
        /// <returns>
        /// <c>LinkStatus.Broken</c> when the parser's page has no source after <c>CreateParser()</c>;
        /// otherwise <c>LinkStatus.Ok</c>.
        /// </returns>
        /// <exception cref="ArgumentException">The URL field is null or empty.</exception>
        public LinkStatus ExtractLinks()
        {
            // A null URL is as unusable as an empty one, so both are rejected before a parser is built.
            if (String.IsNullOrEmpty(m_strUrl))
            {
                throw new ArgumentException("No URL specified");
            }

            m_Links = new LinkDataCollection();
            CreateParser();
            if (m_obParser.Lexer.Page.mSource == null)
            {
                return(LinkStatus.Broken);
            }

            NodeFilter obFilter  = new NodeClassFilter(typeof(ATag));
            NodeList   collNodes = m_obParser.Parse(obFilter);

            if (null != collNodes)
            {
                // Every link found on this page shares the same page descriptor.
                PageData obPageData = new PageData();
                obPageData.m_strUrl = m_obParser.URL;
                obPageData.m_iDepth = m_iLevel;
                for (Int32 i = 0; i < collNodes.Count; i++)
                {
                    INode    obNode     = collNodes[i];
                    LinkData obLinkData = new LinkData(obPageData, obNode as ATag);
                    m_Links.Add(obLinkData);
                }
            }
            return(LinkStatus.Ok);
        }
Ejemplo n.º 3
0
        /// <summary>Collects the nodes of a given class that are found by searching a node.</summary>
        /// <param name="node">The node to search.</param>
        /// <param name="type">The node class to match.</param>
        /// <returns>An array holding the nodes that were collected.</returns>
        public static INode[] FindTypeInNode(INode node, System.Type type)
        {
            NodeList    matches     = new NodeList();
            INodeFilter classFilter = new NodeClassFilter(type);

            // CollectInto fills 'matches' with whatever the filter accepts.
            node.CollectInto(matches, classFilter);

            INode[] result = matches.ToNodeArray();
            return result;
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Builds the service-layer <see cref="BrowseRequestModel"/> equivalent of this instance.
 /// </summary>
 /// <remarks>
 /// Scalar properties are copied as-is. <c>View</c> and <c>Header</c> are converted through
 /// their own <c>ToServiceModel()</c> when non-null, and <c>NodeClassFilter</c> is copied
 /// into a new list when non-null.
 /// </remarks>
 /// <returns>A new service model populated from this instance.</returns>
 public BrowseRequestModel ToServiceModel() =>
     new BrowseRequestModel
     {
         NodeIdsOnly = NodeIdsOnly,
         NodeId = NodeId,
         MaxReferencesToReturn = MaxReferencesToReturn,
         Direction = Direction,
         View = View?.ToServiceModel(),
         NodeClassFilter = NodeClassFilter?.ToList(),
         ReferenceTypeId = ReferenceTypeId,
         TargetNodesOnly = TargetNodesOnly,
         ReadVariableValues = ReadVariableValues,
         NoSubtypes = NoSubtypes,
         Header = Header?.ToServiceModel()
     };
Ejemplo n.º 5
0
        /// <summary>
        /// Builds the node filters held in this class's filter fields. No parsing is done here.
        /// </summary>
        private static void MakeFilters()
        {
            // Definition lists that sit under the element with id="searchList".
            HasAttributeFilter searchListById  = new HasAttributeFilter("id", "searchList");
            HasParentFilter    underSearchList = new HasParentFilter(searchListById, false);
            NodeClassFilter    definitionLists = new NodeClassFilter(typeof(DefinitionList));
            poiListFilter = new AndFilter(underSearchList, definitionLists);

            poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));

            // Filters keyed on a single class attribute value.
            tasteFilter       = new HasAttributeFilter("class", "score1");
            environmentFilter = new HasAttributeFilter("class", "score2");
            serviceFilter     = new HasAttributeFilter("class", "score3");
            averageFilter     = new HasAttributeFilter("class", "average");

            // class="B" combined with module="list-readreview".
            HasAttributeFilter classB       = new HasAttributeFilter("class", "B");
            HasAttributeFilter reviewModule = new HasAttributeFilter("module", "list-readreview");
            commentFilter = new AndFilter(classB, reviewModule);

            // class="BL" elements whose parent carries class="shopname".
            HasAttributeFilter shopNameParent = new HasAttributeFilter("class", "shopname");
            HasParentFilter    underShopName  = new HasParentFilter(shopNameParent, false);
            nameFilter = new AndFilter(underShopName, new HasAttributeFilter("class", "BL"));

            addressFilter = new HasAttributeFilter("class", "address");
            tagsFilter    = new HasAttributeFilter("class", "tags");
        }
        /* private void WebBox_SourceUpdated(object sender, DataTransferEventArgs e)
         * {
         *
         * }*/

        /// <summary>
        /// Click handler: takes the HTML of the page shown in the embedded browser (or the canned
        /// <c>s_htmlFake</c> markup when <c>IsOffline</c> is "1"), extracts its table rows with
        /// HtmlParser, hands each row to <c>parserTR</c> and shows <c>parseResult</c>.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void BtnDownload_Click(object sender, RoutedEventArgs e)
        {
            this.parseResult = "";

            #region Obtain the page HTML

            // The markup is read from the WebBrowser control rather than downloaded again,
            // so authentication is whatever the user already performed by hand in the browser.
            bool   offline = string.Compare(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase) == 0;
            string html;
            if (offline)
            {
                // Offline mode must not depend on the browser having a document loaded.
                html = s_htmlFake;
            }
            else
            {
                mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)webBox.Document;
                mshtml.IHTMLElement   body = doc2 != null ? doc2.body : null;

                // A page that has not loaded yet has no document/body; treat it as an empty
                // page so the "no matching nodes" message is shown instead of crashing.
                html = (body != null ? body.innerHTML : null) ?? string.Empty;
            }
            Debug.WriteLine(html);

            #endregion

            #region Extract the table rows with HtmlParser

            Lexer      lexer    = new Lexer(html);
            Parser     parser   = new Parser(lexer);
            NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList   nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    parserTR(nodeList[i]);
                }
                MessageBox.Show(parseResult);
            }

            #endregion
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Parses the page with a filter that accepts <c>LinkTag</c> nodes.
        /// </summary>
        /// <returns>The node list produced by <c>Parse</c> for that filter.</returns>
        public NodeList GetAllOutLinks()
        {
            INodeFilter linkTagsOnly = new NodeClassFilter(typeof(LinkTag));
            NodeList    outLinks     = Parse(linkTagsOnly);

            return outLinks;
        }
        /// <summary>
        /// Extracts the table rows from the page HTML, converts the valid ones through
        /// <c>parserValidTR</c> and passes the result to <c>CallWS</c>.
        /// </summary>
        /// <remarks>
        /// Only offline mode (<c>IsOffline</c> equal to "1", case-insensitive) can work here:
        /// this method has no browser document to read, so the canned <c>s_htmlFake</c> markup
        /// is the only available input.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// Offline mode is not enabled. Previously this path failed with a bare
        /// NullReferenceException because the document reference was always null.
        /// </exception>
        public void StartCrawl()
        {
            List<ImportInvoiceDTO>      list         = new List<ImportInvoiceDTO>();
            List<hParser.Tags.TableRow> validRowList = new List<hParser.Tags.TableRow>();

            this.parseResult = "";

            // Not used further in this method; kept because the Uri constructor rejects a
            // null or malformed TargetUri before any parsing starts.
            Uri uri = new Uri(this.TargetUri);

            #region Obtain the page HTML

            bool offline = string.Compare(this.IsOffline, "1", StringComparison.InvariantCultureIgnoreCase) == 0;
            if (!offline)
            {
                // There is no WebBrowser document wired into this method, so fail with a
                // message that explains the situation instead of dereferencing null.
                throw new InvalidOperationException(
                    "StartCrawl has no browser document to read from; only offline mode (IsOffline == \"1\") is supported.");
            }

            string html = s_htmlFake;
            Debug.WriteLine(html);

            #endregion

            #region Extract the table rows with HtmlParser

            Lexer              lexer    = new Lexer(html);
            hParser.Parser     parser   = new hParser.Parser(lexer);
            hParser.NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList           nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    // Process one row; parserTR returns null for rows that should be skipped.
                    var tagTR = parserTR(nodeList[i]);
                    if (tagTR != null)
                    {
                        validRowList.Add(tagTR);
                    }
                }

                parserValidTR(validRowList, ref list);
            }

            #endregion

            #region Synchronise

            // 'list' is passed by ref above, so it may have been replaced (possibly with null).
            if (list == null || list.Count == 0)
            {
                MessageBox.Show("该页面上没有检测到预期数据");
                return;
            }

            ImportInvoiceListDTO soap = new ImportInvoiceListDTO
            {
                List   = list,
                Result = new ImportInvoiceResultDTO
                {
                    Message = "CALLBACK",
                    Status  = 9
                }
            };

            CallWS(soap);
            MessageBox.Show("本页已同步完成,请点击下一页继续同步");

            #endregion
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Walks the area menu of the yododo "ask" list page: for every top-level area link except
        /// "全部" and "中国" it downloads that area's page, and for every sub-area link except
        /// "全部" whose text matches the <c>ClassName</c> of a level-3 route class it calls
        /// <c>paserData</c>. Prints "success" to the response when finished.
        /// </summary>
        /// <param name="context">HTTP context handed to <c>Print</c>.</param>
        private void GrapBaiduMsg(HttpContext context)
        {
            const string siteRoot = "http://www.yododo.com";

            string listPageHtml = GetHtmlStr(siteRoot + "/ask/list/");

            // Route classes that the link texts are matched against.
            ClassLibrary.BLL.RouteClass         routeClassBll = new ClassLibrary.BLL.RouteClass();
            List<ClassLibrary.Model.RouteClass> routeClasses  = routeClassBll.GetModelList("classLevel = 3");

            // First <ul class="miniarea-list clearfix"> of the list page.
            Parser     parser       = Parser.CreateParser(listPageHtml, "utf-8");
            NodeFilter areaUlFilter = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "miniarea-list clearfix"));
            NodeList   areaMenus    = parser.Parse(areaUlFilter);
            string     areaMenuHtml = areaMenus[0].ToHtml();

            // First <li> of that menu.
            parser = Parser.CreateParser(areaMenuHtml, "utf-8");
            NodeFilter liFilter     = new TagNameFilter("li");
            NodeList   menuItems    = parser.Parse(liFilter);
            string     firstLiHtml  = menuItems[0].ToHtml();

            NodeFilter anchorFilter = new NodeClassFilter(typeof(ATag));

            // Anchors inside the first <li>: one per top-level area.
            parser = Parser.CreateParser(firstLiHtml, "utf-8");
            NodeList areaLinks = parser.Parse(anchorFilter);

            for (int areaIndex = 0; areaIndex < areaLinks.Count; areaIndex++)
            {
                ITag   areaTag  = getTag(areaLinks[areaIndex]);
                string areaUrl  = siteRoot + areaTag.GetAttribute("href");
                string areaName = areaTag.ToPlainTextString();

                bool skipArea = areaName == "全部" || areaName == "中国";
                if (!skipArea)
                {
                    // The area page carries a menu with the same markup as the list page.
                    parser = Parser.CreateParser(GetHtmlStr(areaUrl), "utf-8");
                    NodeList subMenus    = parser.Parse(areaUlFilter);
                    string   subMenuHtml = subMenus[0].ToHtml();

                    parser = Parser.CreateParser(subMenuHtml, "utf-8");
                    NodeList subLinks = parser.Parse(anchorFilter);

                    for (int subIndex = 0; subIndex < subLinks.Count; subIndex++)
                    {
                        ITag   subTag  = getTag(subLinks[subIndex]);
                        string subUrl  = siteRoot + subTag.GetAttribute("href") + "s1";
                        string subName = subTag.ToPlainTextString();

                        if (subName != "全部")
                        {
                            ClassLibrary.Model.RouteClass match = routeClasses.Find(
                                delegate(ClassLibrary.Model.RouteClass candidate) { return candidate.ClassName == subName; });

                            // Sub-areas without a matching route class are ignored.
                            if (match != null)
                            {
                                paserData(subLinks[subIndex], subUrl, match.ID);
                            }
                        }
                    }
                }
            }

            Print(context, "success");
        }
        /// <summary>
        /// Extracts the table rows of the current page, converts the valid ones through
        /// <c>parserValidTR</c>, then optionally logs them (<c>IfLog</c> == "1") and passes them
        /// to <c>CallWS</c> (<c>IfCallWS</c> == "1").
        /// </summary>
        /// <param name="wb">Browser control whose document is read when not offline.</param>
        /// <param name="isOffline">
        /// When true the canned <c>s_htmlFake</c> markup is parsed as a whole and
        /// <paramref name="wb"/> is not touched.
        /// </param>
        /// <param name="IsUnConfirmChecked">
        /// Selects the table element "example1" (true) or "example" (false) in the live document;
        /// it is also forwarded to <c>parserValidTR</c> and selects the log line format.
        /// </param>
        /// <param name="hasValidData">
        /// Set to false whenever no usable rows are found. This method never sets it to true.
        /// </param>
        private void CrawlCurrentPage(WebBrowser wb, bool isOffline, bool IsUnConfirmChecked, ref bool hasValidData)
        {
            mshtml.IHTMLDocument2 doc2 = isOffline ? null : (mshtml.IHTMLDocument2)wb.Document;
            string html = isOffline ? s_htmlFake : doc2.body.innerHTML;

            Debug.WriteLine(html);

            List<ImportInvoiceDTO>      list         = new List<ImportInvoiceDTO>();
            List<hParser.Tags.TableRow> validRowList = new List<hParser.Tags.TableRow>();

            #region Locate the markup to parse

            string tbodyHtml;
            if (isOffline)
            {
                // There is no live DOM offline (doc2 is null), so the table lookup below cannot
                // run. Parse the canned markup as a whole instead of dereferencing null.
                // NOTE(review): this feeds every <tr> of s_htmlFake to parserTR, not just the
                // rows of one table - confirm that is acceptable for the offline fixture.
                tbodyHtml = html;
            }
            else
            {
                mshtml.HTMLTableClass table = IsUnConfirmChecked
                    ? (mshtml.HTMLTableClass)doc2.all.item("example1", 0)
                    : (mshtml.HTMLTableClass)doc2.all.item("example", 0);
                if (table == null)
                {
                    hasValidData = false;
                    return;
                }

                // The last child of the table is taken to be its body section.
                mshtml.HTMLTableSectionClass tbody = (mshtml.HTMLTableSectionClass)table.lastChild;
                if (tbody == null)
                {
                    hasValidData = false;
                    return;
                }

                tbodyHtml = tbody.innerHTML;

                // The body holds only this placeholder text ("no records found") when empty.
                if (0 == string.Compare(tbody.innerText, "没找到记录", StringComparison.InvariantCultureIgnoreCase))
                {
                    hasValidData = false;
                    return;
                }

                #region Notes on driving DOM elements from the WPF WebBrowser

#if RESEARCH
                // Read / write elements
                mshtml.IHTMLElement login_pass = (mshtml.IHTMLElement)doc2.all.item("login_pass", 0);
                mshtml.IHTMLElement password   = (mshtml.IHTMLElement)doc2.all.item("password", 0);
                password.setAttribute("value", "12345678");
                login_pass.setAttribute("style", "");

                mshtml.IHTMLElement login_pass1 = (mshtml.IHTMLElement)doc2.all.item("login_pass1", 0);
                mshtml.IHTMLElement password1   = (mshtml.IHTMLElement)doc2.all.item("password1", 0);
                login_pass1.setAttribute("style", "display:none;");

                // Trigger an event: click the OK button
                loginBT.click();

                // Script injection would go through wb.ObjectForScripting, which exposes an
                // object to the script code of the page shown in the WebBrowser control.

                // Call into the page's JavaScript
                mshtml.IHTMLWindow2 win = (mshtml.IHTMLWindow2)doc2.parentWindow;
                win.execScript("Login('12345678', '', 1)", "javascript");
                return;
#endif

                #endregion
            }

            #endregion

            #region Extract the rows with HtmlParser

            Lexer              lexer    = new Lexer(tbodyHtml);
            hParser.Parser     parser   = new hParser.Parser(lexer);
            hParser.NodeFilter filter   = new NodeClassFilter(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            NodeList           nodeList = parser.Parse(filter);
            if (nodeList.Count == 0)
            {
                hasValidData = false;
                MessageBox.Show("没有符合要求的节点");
            }
            else
            {
                for (int i = 0; i < nodeList.Count; i++)
                {
                    // Process one row; parserTR returns null for rows that should be skipped.
                    var tagTR = parserTR(nodeList[i]);
                    if (tagTR != null)
                    {
                        validRowList.Add(tagTR);
                    }
                }

                parserValidTR(validRowList, IsUnConfirmChecked, ref list);
            }

            #endregion

            #region Log, export and persist

            // 'list' is passed by ref above, so it may have been replaced (possibly with null).
            if (list == null || list.Count == 0)
            {
                MessageBox.Show("该页面上没有检测到预期数据");
                hasValidData = false;
                // NOTE(review): execution continues, so CallWS below still receives the empty
                // list. Confirm whether an early return was intended here.
            }

            ImportInvoiceListDTO soap = new ImportInvoiceListDTO
            {
                List   = list,
                Result = new ImportInvoiceResultDTO
                {
                    Message = "CALLBACK",
                    Status  = 9
                }
            };

            Debug.Write(soap);

            // Skip logging when the list is null: ForEach on it would throw.
            if (this.IfLog == "1" && soap.List != null)
            {
                soap.List.ForEach(impinfo =>
                {
                    if (IsUnConfirmChecked)
                    {
                        LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 勾选标志{8} 操作时间{9}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag, impinfo.ChosenTime));
                    }
                    else
                    {
                        LogHelper.WriteLog(typeof(WebBoxView), string.Format("发票代码{0} 发票号码{1} 开票日期{2} 销方税号{3} 金额{4} 税额{5} 来源{6} 发票状态{7} 确认月份{8}", impinfo.InvoiceCode, impinfo.InvoiceNumber, impinfo.CreateDate, impinfo.SalesTaxNumber, impinfo.Amount, impinfo.Tax, impinfo.From, impinfo.Status, impinfo.SelectTag));
                    }
                });
            }

            if (this.IfCallWS == "1")
            {
                CallWS(soap);
            }
            Debug.Write("本页已同步完成,请点击下一页继续同步");

            #endregion
        }