Exemplo n.º 1
0
        /// <summary>
        /// Populates <paramref name="document"/> from the <c>detailcontent</c> table of
        /// <paramref name="doc"/> and groups the paragraphs of its justified detail block into sections.
        /// </summary>
        /// <param name="document">
        /// Document to fill. Its text, HTML, id and URL are assigned as soon as the content table is
        /// found, so they may already be set when the method returns <c>false</c>.
        /// </param>
        /// <param name="doc">Parsed HTML page.</param>
        /// <param name="url">Source URL; stored on the document and passed to <c>Common.CrcProductID</c> for the id.</param>
        /// <returns>
        /// <c>true</c> when the content table, the detail block and its paragraphs were all found;
        /// <c>false</c> otherwise.
        /// </returns>
        public bool Parse(ref Documet document, HtmlDocument doc, string url)
        {
            var nodeData = doc.DocumentNode.SelectSingleNode("//table[@class='detailcontent']");
            if (nodeData == null)
            {
                return false;
            }

            document.TextDoc = nodeData.InnerText;
            document.HtmlDoc = nodeData.InnerHtml;
            document.Id      = Common.CrcProductID(url);
            document.Url     = url;

            var nodeDetail =
                doc.DocumentNode.SelectSingleNode(@"//table[@class='detailcontent']//tr/td/div[@align='justify']");

            // The detail block can be absent even when the table exists; treat that as a parse
            // failure instead of dereferencing null.
            var nodeParagrap = (nodeDetail == null) ? null : nodeDetail.SelectNodes(".//p");
            if (nodeParagrap == null)
            {
                return false;
            }

            // Item1 = section heading anchor, Item2 = paragraphs belonging to the section,
            // Item3 = list of strings left empty here.
            List<Tuple<HtmlNode, List<HtmlNode>, List<String>>> structurtData =
                new List<Tuple<HtmlNode, List<HtmlNode>, List<String>>>();

            foreach (HtmlNode paragraph in nodeParagrap)
            {
                HtmlNode anchor     = paragraph.SelectSingleNode(".//a[@name]");
                string   anchorName = (anchor == null) ? "" : anchor.GetAttributeValue("name", "");

                // Ordinal, case-insensitive prefix tests: culture-sensitive lowering would break
                // the "dieu_" match under cultures with special casing rules for 'I'.
                if (anchor != null && anchorName.StartsWith("chuong_", StringComparison.OrdinalIgnoreCase))
                {
                    // A chapter anchor opens a new section.
                    structurtData.Add(new Tuple<HtmlNode, List<HtmlNode>, List<String>>(
                                          anchor, new List<HtmlNode>(), new List<String>()));
                }
                else if (structurtData.Count == 0 && anchor != null &&
                         anchorName.StartsWith("dieu_", StringComparison.OrdinalIgnoreCase))
                {
                    // An article anchor seen before any chapter opens a section headed by an
                    // empty placeholder node.
                    // NOTE(review): the paragraph holding this first anchor is not added to the
                    // section's paragraph list — confirm that is intended.
                    structurtData.Add(new Tuple<HtmlNode, List<HtmlNode>, List<String>>(
                                          new HtmlNode(HtmlNodeType.Element, doc, -1),
                                          new List<HtmlNode>(), new List<String>()));
                }
                else if (structurtData.Count > 0)
                {
                    // Every other paragraph belongs to the most recently opened section;
                    // paragraphs before the first section are dropped.
                    structurtData[structurtData.Count - 1].Item2.Add(paragraph);
                }
            }

            document.LstStructure = structurtData;
            document.ParseDieu();
            return true;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Handles one crawl job: downloads the law-detail page named by the job, parses it and
        /// stores the result, then acknowledges the message.
        /// </summary>
        /// <param name="message">Delivery whose body is the UTF-8 JSON form of a <c>JobQueue</c>.</param>
        public override void ProcessMessage(BasicDeliverEventArgs message)
        {
            JobQueue job =
                Newtonsoft.Json.JsonConvert.DeserializeObject<JobQueue>(UTF8Encoding.UTF8.GetString(message.Body));

            // A missing job or URL is handled like a URL that fails the pattern, so the message
            // is still logged and acknowledged instead of throwing before the ack.
            string url = (job == null) ? null : job.Url;

            if (url != null &&
                Regex.IsMatch(url, @"http://moj.gov.vn/vbpq/Lists/Vn%20bn%20php%20lut/View_Detail.aspx\?ItemID=\d.*"))
            {
                long docId = Common.CrcProductID(url);
                if (bAutoDel || !_docManAdapter.CheckExistDoc(docId))
                {
                    bool stored = false;
                    var  html   = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 42, 2);

                    if (!string.IsNullOrEmpty(html))
                    {
                        html = System.Web.HttpUtility.HtmlDecode(html);
                        HtmlDocument htmlDocument = new HtmlDocument();
                        htmlDocument.LoadHtml(html);

                        Documet    document = new Documet();
                        ParserData p        = new ParserData();

                        if (p.Parse(ref document, htmlDocument, url) && document.IsValidData())
                        {
                            _docManAdapter.InsertData(document);
                            stored = true;
                        }
                    }

                    // Report success only when a document was actually stored; an empty download
                    // or a failed parse/validation is logged as a failure.
                    _log.Info(string.Format(stored ? "{0} Success" : "{0} Fail", url));

                    // Pause one second after each processed URL (presumably to throttle requests
                    // to the remote site).
                    Thread.Sleep(1000);
                }
            }
            else
            {
                Logger.Info(string.Format("Fail regex {0}", url));
            }

            // NOTE(review): assuming GetChannel() returns a RabbitMQ channel, the second argument
            // is the `multiple` flag, which also acknowledges earlier unacknowledged deliveries.
            this.GetChannel().BasicAck(message.DeliveryTag, true);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Handles one doc-info job: derives the properties-page URL from the job's detail URL,
        /// downloads and parses it, updates the stored document info, then acknowledges the message.
        /// </summary>
        /// <param name="message">Delivery whose body is the UTF-8 JSON form of a <c>JobResetDocInfo</c>.</param>
        public override void ProcessMessage(BasicDeliverEventArgs message)
        {
            JobResetDocInfo job =
                Newtonsoft.Json.JsonConvert.DeserializeObject<JobResetDocInfo>(UTF8Encoding.UTF8.GetString(message.Body));

            // A missing job or URL is handled like a URL that fails the pattern, so the message
            // is still logged and acknowledged instead of throwing before the ack.
            string jobUrl = (job == null) ? null : job.Url;

            if (jobUrl != null &&
                Regex.IsMatch(jobUrl, @"http://moj.gov.vn/vbpq/Lists/Vn%20bn%20php%20lut/View_Detail.aspx\?ItemID=\d.*"))
            {
                // The pattern above guarantees at least one digit after "ItemID=", so group 1 is
                // always populated here.
                string id  = Regex.Match(jobUrl, @"ItemID=(\d+)").Groups[1].Value;
                string url = string.Format(@"http://moj.gov.vn/vbpq/Pages/View_Propertes.aspx?ItemID={0}", id);

                bool updated = false;
                var  html    = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 42, 2);

                if (!string.IsNullOrEmpty(html))
                {
                    html = System.Web.HttpUtility.HtmlDecode(html);
                    HtmlDocument htmlDocument = new HtmlDocument();
                    htmlDocument.LoadHtml(html);

                    ParserData p       = new ParserData();
                    var        docInfo = p.ParseInfoDoc(htmlDocument, url);

                    if (docInfo != null)
                    {
                        docInfo.Id = job.Id;
                        _docManAdapter.UpdateDocData(docInfo);
                        updated = true;
                    }
                }

                // Report success only when the document info was actually updated; an empty
                // download or a failed parse is logged as a failure.
                _log.Info(string.Format(updated ? "{0} Success" : "{0} Fail", url));

                // Pause one second after each processed URL (presumably to throttle requests
                // to the remote site).
                Thread.Sleep(1000);
            }
            else
            {
                Logger.Info(string.Format("Fail regex {0}", jobUrl));
            }

            // NOTE(review): assuming GetChannel() returns a RabbitMQ channel, the second argument
            // is the `multiple` flag, which also acknowledges earlier unacknowledged deliveries.
            this.GetChannel().BasicAck(message.DeliveryTag, true);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Replaces the stored copy of <paramref name="document"/>: deletes any existing rows for
        /// its id from Documents, Menu and DocChapter, inserts the document, then inserts one Menu
        /// row and one DocChapter row per section plus one child Menu row per section reference.
        /// </summary>
        /// <param name="document">Document to persist; <c>LstStructure</c> may be null or empty.</param>
        /// <remarks>
        /// NOTE(review): the statements run as separate commands with no transaction visible here,
        /// so a failure part-way may leave a partially written document — confirm against SqlDb.
        /// </remarks>
        public void InsertData(Documet document)
        {
            const string strData = @"

Delete From Documents where Id = @Id
Delete From Menu Where DocumentId = @Id
Delete From DocChapter where DocumentId = @Id

Insert 
Into Documents (Id, TextDoc, HtmlDoc, Url) Values (@Id, @TextDoc, @HtmlDoc, @Url);

Select Top 1 Id From Documents Order By Id DESC
";
            // The trailing SELECT's result is not needed: the document id is supplied by the caller.
            this.sqlDb.GetTblData(strData, CommandType.Text, new SqlParameter[]
            {
                SqlDb.CreateParamteterSQL("@TextDoc", document.TextDoc, SqlDbType.NText),
                SqlDb.CreateParamteterSQL("@HtmlDoc", document.HtmlDoc, SqlDbType.NText),
                SqlDb.CreateParamteterSQL("@Id", document.Id, SqlDbType.BigInt),
                SqlDb.CreateParamteterSQL("@Url", document.Url, SqlDbType.NVarChar),
            });

            var sections = document.LstStructure;
            if (sections == null || sections.Count == 0)
            {
                return;
            }

            // NOTE(review): reading the new Menu id back with "Top 1 ... Order By Id Desc" can pick
            // up another writer's row under concurrent inserts; if Menu.Id is an identity column,
            // SCOPE_IDENTITY() would be safer — confirm the schema before changing.
            const string strMenu = "Insert Into Menu (ParentId, Text, Ref, DocumentId) Values (@ParentId, @Text, @Ref, @DocumentId) " +
                                   "Select Top 1 Id From Menu Order By Id Desc";
            const string strChapter   = "Insert Into DocChapter (IdMenu,  Text, Html, DocumentId) Values (@IdMenu, @Text, @Html, @DocumentId)";
            const string strChildMenu = "Insert Into Menu (ParentId, Ref, DocumentId) Values (@ParentId, @Ref, @DocumentId)";

            foreach (var section in sections)
            {
                // Item1 is the section heading node. It is treated as optional for both Text and
                // Ref; a node with an empty Name gets an empty Ref.
                var    heading  = section.Item1;
                string menuText = (heading == null) ? "" : heading.InnerText.Trim();
                string menuRef  = (heading == null || heading.Name == "")
                    ? ""
                    : heading.GetAttributeValue("name", "");

                DataTable tblMenu = this.sqlDb.GetTblData(strMenu, CommandType.Text, new SqlParameter[]
                {
                    SqlDb.CreateParamteterSQL("Text", menuText, SqlDbType.NVarChar),
                    SqlDb.CreateParamteterSQL("ParentId", 0, SqlDbType.Int),
                    SqlDb.CreateParamteterSQL("DocumentId", document.Id, SqlDbType.BigInt),
                    SqlDb.CreateParamteterSQL("Ref", menuRef, SqlDbType.NVarChar),
                });

                long chuongId = Convert.ToInt64(tblMenu.Rows[0]["Id"]);

                // The section body is the concatenation of its paragraph nodes (Item2).
                this.sqlDb.RunQuery(strChapter, CommandType.Text, new SqlParameter[]
                {
                    SqlDb.CreateParamteterSQL("@IdMenu", chuongId, SqlDbType.BigInt),
                    SqlDb.CreateParamteterSQL("@DocumentId", document.Id, SqlDbType.BigInt),
                    SqlDb.CreateParamteterSQL("@Text",
                                              string.Join("", section.Item2.Select(a => a.InnerText)), SqlDbType.NText),
                    SqlDb.CreateParamteterSQL("@Html",
                                              string.Join("", section.Item2.Select(a => a.OuterHtml)), SqlDbType.NText)
                });

                // Each string in Item3 becomes a child Menu row under this section's Menu row.
                foreach (var childRef in section.Item3)
                {
                    this.sqlDb.RunQuery(strChildMenu, CommandType.Text, new[]
                    {
                        SqlDb.CreateParamteterSQL("ParentId", chuongId, SqlDbType.BigInt),
                        SqlDb.CreateParamteterSQL("Ref", childRef, SqlDbType.NVarChar),
                        SqlDb.CreateParamteterSQL("DocumentId", document.Id, SqlDbType.BigInt),
                    });
                }
            }
        }
Exemplo n.º 5
0
 /// <summary>
 /// Appends the received document to <c>listParserDocument</c>.
 /// </summary>
 /// <param name="arg1">Not used by this method (presumably the event sender — confirm at the subscription site).</param>
 /// <param name="arg2">Document to append.</param>
 private void Parser_OnNewData(object arg1, Documet arg2)
 {
     listParserDocument.Add(arg2);
 }
Exemplo n.º 6
0
        /// <summary>
        /// Console entry point: prints the list of modes, reads a mode number from standard input
        /// and runs the matching queue producer or worker.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            string str = "1. PushLinkParse. 2. RunWorkerProcess. 3.PushDocInfo. 4.WorkerDocInfo";

            Console.WriteLine(str);

            // TryParse instead of Convert.ToInt32 so that non-numeric input falls through to the
            // default case rather than throwing FormatException.
            int a;
            if (!int.TryParse(Console.ReadLine(), out a))
            {
                a = 0;
            }

            switch (a)
            {
            case 1:
                new Test().PushQueueAs();
                break;

            case 2:
                new WorkerCrawler().StartConsume();
                break;

            case 3:
                new Test().PushQueueAsDocInfo();
                break;

            case 4:
                new WorkerDocInfo().StartConsume();
                break;

            default:
                Console.WriteLine("Invalid option");
                break;
            }

            // The single-URL debugging code that used to follow an unconditional `return` here
            // was unreachable and has been removed.
        }