Beispiel #1
0
        /// <summary>
        /// Handles one queue message: deserializes the body into a <see cref="JobQueue"/> and,
        /// when the job URL matches the moj.gov.vn "View_Detail" pattern, downloads the page,
        /// parses it into a <see cref="Documet"/> and stores it through <c>_docManAdapter</c>.
        /// The message is acknowledged on every path that does not throw.
        /// </summary>
        /// <param name="message">
        /// Delivery whose <c>Body</c> is decoded as UTF-8 and parsed as the JSON of a <see cref="JobQueue"/>.
        /// </param>
        public override void ProcessMessage(BasicDeliverEventArgs message)
        {
            JobQueue job =
                Newtonsoft.Json.JsonConvert.DeserializeObject<JobQueue>(Encoding.UTF8.GetString(message.Body));

            // A body such as "null" deserializes to a null job; treat it like any other
            // unusable message instead of throwing before the ack below is reached.
            string url = job == null ? null : job.Url;

            if (string.IsNullOrEmpty(url))
            {
                Logger.Info("Skip message: body contains no job url");
            }
            else if (!Regex.IsMatch(url, @"http://moj.gov.vn/vbpq/Lists/Vn%20bn%20php%20lut/View_Detail.aspx\?ItemID=\d.*"))
            {
                Logger.Info(string.Format("Fail regex {0}", url));
            }
            else
            {
                long docId = Common.CrcProductID(url);

                // Skip the download when the document is already stored, unless bAutoDel is set.
                if (bAutoDel || !_docManAdapter.CheckExistDoc(docId))
                {
                    bool stored = false;
                    var  html   = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 42, 2);

                    if (!string.IsNullOrEmpty(html))
                    {
                        html = System.Web.HttpUtility.HtmlDecode(html);

                        HtmlDocument htmlDocument = new HtmlDocument();
                        htmlDocument.LoadHtml(html);

                        Documet    document = new Documet();
                        ParserData parser   = new ParserData();

                        if (parser.Parse(ref document, htmlDocument, url) && document.IsValidData())
                        {
                            _docManAdapter.InsertData(document);
                            stored = true;
                        }
                    }

                    // Report success only when a document was actually inserted.
                    if (stored)
                    {
                        _log.Info(string.Format("{0} Success", url));
                    }
                    else
                    {
                        _log.Info(string.Format("{0} Skipped: empty page or invalid document data", url));
                    }

                    // One-second pause after each download attempt (presumably to throttle
                    // requests to the remote site - confirm).
                    Thread.Sleep(1000);
                }
            }

            // NOTE(review): the second argument is passed as true; if it is RabbitMQ's "multiple"
            // flag, this also acknowledges every earlier unacknowledged delivery on the channel.
            // Confirm that this is intended.
            this.GetChannel().BasicAck(message.DeliveryTag, true);
        }
Beispiel #2
0
        /// <summary>
        /// Console entry point: prints the menu, reads the chosen option number from standard
        /// input and runs the matching action. Options 1 and 3 push to a queue through
        /// <see cref="Test"/>; options 2 and 4 start a consuming worker.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        private static void Main(string[] args)
        {
            string str = "1. PushLinkParse. 2. RunWorkerProcess. 3.PushDocInfo. 4.WorkerDocInfo";

            Console.WriteLine(str);

            // TryParse instead of Convert.ToInt32 so that a typo does not end the program
            // with an unhandled FormatException.
            int choice;
            if (!int.TryParse(Console.ReadLine(), out choice))
            {
                Console.WriteLine("Invalid input: expected an option number.");
                return;
            }

            switch (choice)
            {
            case 1:
                new Test().PushQueueAs();
                break;

            case 2:
                new WorkerCrawler().StartConsume();
                break;

            case 3:
                new Test().PushQueueAsDocInfo();
                break;

            case 4:
                new WorkerDocInfo().StartConsume();
                break;

            default:
                Console.WriteLine(string.Format("Unknown option {0}.", choice));
                break;
            }
        }