/// <summary>
/// Handles one queued crawl job. The message body is decoded as UTF-8 JSON into a
/// <c>JobQueue</c>; when the job URL matches the moj.gov.vn <c>View_Detail.aspx</c>
/// pattern the page is downloaded, parsed into a <c>Documet</c> and, if the parse
/// succeeds and the document is valid, inserted through <c>_docManAdapter</c>.
/// The message is acknowledged at the end of every path that does not throw.
/// </summary>
/// <param name="message">
/// Delivery whose <c>Body</c> carries the JSON-serialized job and whose
/// <c>DeliveryTag</c> is acknowledged once processing finishes.
/// </param>
public override void ProcessMessage(BasicDeliverEventArgs message)
{
    JobQueue job = Newtonsoft.Json.JsonConvert.DeserializeObject<JobQueue>(
        UTF8Encoding.UTF8.GetString(message.Body));

    // A body that deserializes to null (or a job without a Url) is handled like a
    // URL that fails the pattern: it is logged and acknowledged instead of throwing.
    string url = job == null ? null : job.Url;

    bool isDetailPage = url != null
        && Regex.IsMatch(url, @"http://moj.gov.vn/vbpq/Lists/Vn%20bn%20php%20lut/View_Detail.aspx\?ItemID=\d.*");

    if (isDetailPage)
    {
        long docId = Common.CrcProductID(url);

        // Fetch when bAutoDel is set, or when CheckExistDoc reports false for this id.
        if (bAutoDel || !_docManAdapter.CheckExistDoc(docId))
        {
            bool inserted = false;

            // NOTE(review): the meaning of the 42 and 2 arguments is not visible here - confirm against HTMLTransmitter.
            var html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 42, 2);
            if (!string.IsNullOrEmpty(html))
            {
                html = System.Web.HttpUtility.HtmlDecode(html);

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(html);

                Documet document = new Documet();
                ParserData parser = new ParserData();
                if (parser.Parse(ref document, htmlDocument, url) && document.IsValidData())
                {
                    _docManAdapter.InsertData(document);
                    inserted = true;
                }
            }

            // Report success only when a document was actually stored.
            _log.Info(string.Format(inserted ? "{0} Success" : "{0} Fail: no valid document parsed", url));

            // One-second pause after each fetch attempt (presumably to throttle requests to the site).
            Thread.Sleep(1000);
        }
    }
    else
    {
        Logger.Info(string.Format("Fail regex {0}", url));
    }

    // NOTE(review): multiple = true also acknowledges every earlier unacknowledged
    // delivery on this channel - confirm that is intended.
    this.GetChannel().BasicAck(message.DeliveryTag, true);
}
/// <summary>
/// Handles one queued "reset document info" job. The message body is decoded as
/// UTF-8 JSON into a <c>JobResetDocInfo</c>; when the job URL matches the moj.gov.vn
/// <c>View_Detail.aspx</c> pattern, its <c>ItemID</c> is used to build the matching
/// <c>View_Propertes.aspx</c> URL, that page is downloaded and parsed with
/// <c>ParserData.ParseInfoDoc</c>, and a non-null result is stored through
/// <c>_docManAdapter.UpdateDocData</c> under the job's <c>Id</c>.
/// The message is acknowledged at the end of every path that does not throw.
/// </summary>
/// <param name="message">
/// Delivery whose <c>Body</c> carries the JSON-serialized job and whose
/// <c>DeliveryTag</c> is acknowledged once processing finishes.
/// </param>
public override void ProcessMessage(BasicDeliverEventArgs message)
{
    JobResetDocInfo job = Newtonsoft.Json.JsonConvert.DeserializeObject<JobResetDocInfo>(
        UTF8Encoding.UTF8.GetString(message.Body));

    // A body that deserializes to null (or a job without a Url) is handled like a
    // URL that fails the pattern: it is logged and acknowledged instead of throwing.
    string jobUrl = job == null ? null : job.Url;

    bool isDetailPage = jobUrl != null
        && Regex.IsMatch(jobUrl, @"http://moj.gov.vn/vbpq/Lists/Vn%20bn%20php%20lut/View_Detail.aspx\?ItemID=\d.*");

    if (isDetailPage)
    {
        // The pattern above guarantees "ItemID=" followed by at least one digit,
        // so this match always succeeds and group 1 holds the numeric id.
        string id = Regex.Match(jobUrl, @"ItemID=(\d+)").Groups[1].Value;
        string url = string.Format(@"http://moj.gov.vn/vbpq/Pages/View_Propertes.aspx?ItemID={0}", id);

        bool updated = false;

        // NOTE(review): the meaning of the 42 and 2 arguments is not visible here - confirm against HTMLTransmitter.
        var html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 42, 2);
        if (!string.IsNullOrEmpty(html))
        {
            html = System.Web.HttpUtility.HtmlDecode(html);

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(html);

            ParserData parser = new ParserData();
            var docInfo = parser.ParseInfoDoc(htmlDocument, url);
            if (docInfo != null)
            {
                docInfo.Id = job.Id;
                _docManAdapter.UpdateDocData(docInfo);
                updated = true;
            }
        }

        // Report success only when the document info was actually updated.
        _log.Info(string.Format(updated ? "{0} Success" : "{0} Fail: no document info parsed", url));

        // One-second pause after each fetch attempt (presumably to throttle requests to the site).
        Thread.Sleep(1000);
    }
    else
    {
        Logger.Info(string.Format("Fail regex {0}", jobUrl));
    }

    // NOTE(review): multiple = true also acknowledges every earlier unacknowledged
    // delivery on this channel - confirm that is intended.
    this.GetChannel().BasicAck(message.DeliveryTag, true);
}
/// <summary>
/// Console entry point. Prints the list of available modes, reads the selected
/// number from standard input and runs the matching action: 1 and 3 push jobs
/// through <c>Test</c>, 2 and 4 start the <c>WorkerCrawler</c> / <c>WorkerDocInfo</c>
/// consumers. Input that is not one of the listed numbers is reported and the
/// program exits without running anything.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
private static void Main(string[] args)
{
    Console.WriteLine("1. PushLinkParse. 2. RunWorkerProcess. 3.PushDocInfo. 4.WorkerDocInfo");

    // TryParse instead of Convert.ToInt32 so a non-numeric line does not throw FormatException.
    int choice;
    if (!int.TryParse(Console.ReadLine(), out choice))
    {
        Console.WriteLine("Invalid selection.");
        return;
    }

    switch (choice)
    {
        case 1:
            new Test().PushQueueAs();
            break;
        case 2:
            new WorkerCrawler().StartConsume();
            break;
        case 3:
            new Test().PushQueueAsDocInfo();
            break;
        case 4:
            new WorkerDocInfo().StartConsume();
            break;
        default:
            Console.WriteLine("Invalid selection.");
            break;
    }
}