/// <summary>
/// Fills <paramref name="document"/> from the detail page held in <paramref name="doc"/> and
/// groups the page's paragraphs into sections keyed by their heading anchor.
/// </summary>
/// <param name="document">
/// Receives TextDoc, HtmlDoc, Id and Url as soon as the <c>detailcontent</c> table is found, and
/// LstStructure once the paragraphs have been grouped; <c>ParseDieu()</c> is then called on it.
/// </param>
/// <param name="doc">Parsed HTML of the detail page.</param>
/// <param name="url">Page address; stored as Url and passed to <c>Common.CrcProductID</c> to obtain Id.</param>
/// <returns>
/// <c>false</c> when the <c>detailcontent</c> table, the justified content div inside it, or the
/// paragraph list cannot be selected; otherwise <c>true</c>. Note that TextDoc/HtmlDoc/Id/Url are
/// already set when <c>false</c> is returned for a missing div or paragraph list.
/// </returns>
public bool Parse(ref Documet document, HtmlDocument doc, string url)
{
    var nodeData = doc.DocumentNode.SelectSingleNode("//table[@class='detailcontent']");
    if (nodeData == null)
    {
        return false;
    }

    document.TextDoc = nodeData.InnerText;
    document.HtmlDoc = nodeData.InnerHtml;
    document.Id = Common.CrcProductID(url);
    document.Url = url;

    var nodeDetail = doc.DocumentNode.SelectSingleNode(@"//table[@class='detailcontent']//tr/td/div[@align='justify']");

    // Fix: the div lookup can come back null just like the table lookup above; previously that
    // raised a NullReferenceException instead of reporting a failed parse.
    var nodeParagrap = nodeDetail == null ? null : nodeDetail.SelectNodes(".//p");
    if (nodeParagrap == null)
    {
        return false;
    }

    // Item1 = heading anchor, Item2 = paragraphs belonging to the section, Item3 = string list
    // left empty here (presumably filled by ParseDieu - confirm against Documet).
    var structurtData = new List<Tuple<HtmlNode, List<HtmlNode>, List<String>>>();

    foreach (HtmlNode nodeCurrent in nodeParagrap)
    {
        var nodeTreeNav = nodeCurrent.SelectSingleNode(".//a[@name]");
        string anchorName = nodeTreeNav == null ? null : nodeTreeNav.GetAttributeValue("name", "");

        // Ordinal, case-insensitive prefix tests: the anchor names are identifiers, and a
        // culture-sensitive ToLower() mis-handles 'I' under Turkish casing rules.
        bool isChapter = anchorName != null && anchorName.StartsWith("chuong_", StringComparison.OrdinalIgnoreCase);
        bool isArticle = anchorName != null && anchorName.StartsWith("dieu_", StringComparison.OrdinalIgnoreCase);

        if (isChapter)
        {
            // A chapter anchor opens a new section; the heading paragraph itself is not stored in Item2.
            structurtData.Add(new Tuple<HtmlNode, List<HtmlNode>, List<String>>(
                nodeTreeNav, new List<HtmlNode>(), new List<string>()));
        }
        else if (structurtData.Count == 0)
        {
            if (isArticle)
            {
                // An article anchor seen before any chapter: open a section with an empty
                // placeholder heading node so later paragraphs have somewhere to go.
                // NOTE(review): this first article paragraph is not added to Item2, whereas later
                // article paragraphs are - confirm that dropping it is intended.
                structurtData.Add(new Tuple<HtmlNode, List<HtmlNode>, List<String>>(
                    new HtmlNode(HtmlNodeType.Element, doc, -1), new List<HtmlNode>(), new List<string>()));
            }

            // Any other paragraph that precedes the first section is skipped.
        }
        else
        {
            // Every other paragraph belongs to the most recently opened section.
            structurtData[structurtData.Count - 1].Item2.Add(nodeCurrent);
        }
    }

    document.LstStructure = structurtData;
    document.ParseDieu();
    return true;
}
/// <summary>
/// Handles one crawl job: when the job URL matches the detail-page pattern and the document is
/// either missing from the store or <c>bAutoDel</c> is set, downloads the page, parses it and
/// inserts the result. The message is acknowledged at the end in every non-throwing path.
/// </summary>
/// <param name="message">Delivery whose body is the UTF-8 JSON of a <c>JobQueue</c>.</param>
public override void ProcessMessage(BasicDeliverEventArgs message)
{
    JobQueue job = Newtonsoft.Json.JsonConvert.DeserializeObject <JobQueue>(UTF8Encoding.UTF8.GetString(message.Body));

    // Fix: an empty/"null" body deserializes to null and a job may carry no Url. Either case used
    // to throw before the acknowledgement below; such messages now take the "Fail regex" path.
    string url = job == null ? null : job.Url;

    if (url != null && Regex.IsMatch(url, @"http://moj.gov.vn/vbpq/Lists/Vn%20bn%20php%20lut/View_Detail.aspx\?ItemID=\d.*"))
    {
        long docId = Common.CrcProductID(url);
        if (bAutoDel || !_docManAdapter.CheckExistDoc(docId))
        {
            var html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 42, 2);
            if (!string.IsNullOrEmpty(html))
            {
                html = System.Web.HttpUtility.HtmlDecode(html);
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(html);

                Documet document = new Documet();
                ParserData p = new ParserData();
                bool bOK = p.Parse(ref document, htmlDocument, url);
                if (bOK && document.IsValidData())
                {
                    _docManAdapter.InsertData(document);
                }
            }

            // Logged for every attempted URL, whether or not the download or parse produced data.
            _log.Info(string.Format("{0} Success", url));

            // One-second pause after each attempted fetch (presumably to throttle requests).
            Thread.Sleep(1000);
        }
    }
    else
    {
        Logger.Info(string.Format("Fail regex {0}", url));
    }

    // NOTE(review): if GetChannel() returns a RabbitMQ channel, the second argument is the
    // "multiple" flag, which also acknowledges all earlier unacked deliveries - confirm intended.
    this.GetChannel().BasicAck(message.DeliveryTag, true);
}
/// <summary>
/// Handles one "reset document info" job: when the job URL matches the detail-page pattern, the
/// ItemID is extracted, the matching properties page is downloaded and parsed, and the resulting
/// info is written back under the job's Id. The message is acknowledged at the end in every
/// non-throwing path.
/// </summary>
/// <param name="message">Delivery whose body is the UTF-8 JSON of a <c>JobResetDocInfo</c>.</param>
public override void ProcessMessage(BasicDeliverEventArgs message)
{
    JobResetDocInfo job = Newtonsoft.Json.JsonConvert.DeserializeObject <JobResetDocInfo>(UTF8Encoding.UTF8.GetString(message.Body));

    // Fix: an empty/"null" body deserializes to null and a job may carry no Url. Either case used
    // to throw before the acknowledgement below; such messages now take the "Fail regex" path.
    string url = job == null ? null : job.Url;

    if (url != null && Regex.IsMatch(url, @"http://moj.gov.vn/vbpq/Lists/Vn%20bn%20php%20lut/View_Detail.aspx\?ItemID=\d.*"))
    {
        // The pattern above guarantees at least one digit after "ItemID=", so this match succeeds.
        string id = Regex.Match(url, @"ItemID=\d+").Value.Replace("ItemID=", "");

        // From here on, url refers to the properties page rather than the job's detail page.
        url = string.Format(@"http://moj.gov.vn/vbpq/Pages/View_Propertes.aspx?ItemID={0}", id);

        var html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 42, 2);
        if (!string.IsNullOrEmpty(html))
        {
            html = System.Web.HttpUtility.HtmlDecode(html);
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(html);

            ParserData p = new ParserData();
            var docInfo = p.ParseInfoDoc(htmlDocument, url);
            if (docInfo != null)
            {
                docInfo.Id = job.Id;
                _docManAdapter.UpdateDocData(docInfo);
            }
        }

        // Logged for every attempted URL, whether or not the download or parse produced data.
        _log.Info(string.Format("{0} Success", url));

        // One-second pause after each attempted fetch (presumably to throttle requests).
        Thread.Sleep(1000);
    }
    else
    {
        Logger.Info(string.Format("Fail regex {0}", url));
    }

    // NOTE(review): if GetChannel() returns a RabbitMQ channel, the second argument is the
    // "multiple" flag, which also acknowledges all earlier unacked deliveries - confirm intended.
    this.GetChannel().BasicAck(message.DeliveryTag, true);
}
/// <summary>
/// Replaces the stored copy of <paramref name="document"/>: deletes its existing Documents, Menu
/// and DocChapter rows, inserts the document row, then for every entry of
/// <c>document.LstStructure</c> inserts a top-level Menu row, one DocChapter row with the
/// section's concatenated text/HTML, and one child Menu row per string in the entry's Item3.
/// </summary>
/// <param name="document">Document to persist; Id, TextDoc, HtmlDoc, Url and LstStructure are read.</param>
/// <remarks>
/// NOTE(review): the statements run as separate commands with no visible transaction, and the new
/// Menu id is read back with "Select Top 1 ... Order By Id Desc", which can return another
/// writer's row under concurrency. Consider SCOPE_IDENTITY() once the schema is confirmed.
/// </remarks>
public void InsertData(Documet document)
{
    const string strData = @" Delete From Documents where Id = @Id Delete From Menu Where DocumentId = @Id Delete From DocChapter where DocumentId = @Id Insert Into Documents (Id, TextDoc, HtmlDoc, Url) Values (@Id, @TextDoc, @HtmlDoc, @Url); Select Top 1 Id From Documents Order By Id DESC ";
    const string str = "Insert Into Menu (ParentId, Text, Ref, DocumentId) Values (@ParentId, @Text, @Ref, @DocumentId) " + "Select Top 1 Id From Menu Order By Id Desc";
    const string strChapter = "Insert Into DocChapter (IdMenu, Text, Html, DocumentId) Values (@IdMenu, @Text, @Html, @DocumentId)";
    const string strQ = "Insert Into Menu (ParentId, Ref, DocumentId) Values (@ParentId, @Ref, @DocumentId)";

    // The table returned by this batch was never read, so the result is simply discarded.
    this.sqlDb.GetTblData(strData, CommandType.Text, new SqlParameter[]
    {
        SqlDb.CreateParamteterSQL("@TextDoc", document.TextDoc, SqlDbType.NText),
        SqlDb.CreateParamteterSQL("@HtmlDoc", document.HtmlDoc, SqlDbType.NText),
        SqlDb.CreateParamteterSQL("@Id", document.Id, SqlDbType.BigInt),
        SqlDb.CreateParamteterSQL("@Url", document.Url, SqlDbType.NVarChar),
    });

    var lstMenu = document.LstStructure;
    if (lstMenu == null || lstMenu.Count == 0)
    {
        return;
    }

    foreach (var variable in lstMenu)
    {
        // Fix: Item1 was null-checked for Text but dereferenced unguarded for Ref, so a null
        // heading threw. Both values now share one null-safe view of the heading node.
        var heading = variable.Item1;
        string menuText = heading == null ? "" : heading.InnerText.Trim();

        // A heading with an empty Name is treated as having no anchor to reference.
        string menuRef = (heading == null || heading.Name == "") ? "" : heading.GetAttributeValue("name", "");

        DataTable tbl1 = this.sqlDb.GetTblData(str, CommandType.Text, new SqlParameter[]
        {
            SqlDb.CreateParamteterSQL("Text", menuText, SqlDbType.NVarChar),
            SqlDb.CreateParamteterSQL("ParentId", 0, SqlDbType.Int),
            SqlDb.CreateParamteterSQL("DocumentId", document.Id, SqlDbType.BigInt),
            SqlDb.CreateParamteterSQL("Ref", menuRef, SqlDbType.NVarChar),
        });

        // Id of the Menu row just inserted, as reported by the trailing Select of the batch.
        long chuongId = Convert.ToInt64(tbl1.Rows[0]["Id"]);

        this.sqlDb.RunQuery(strChapter, CommandType.Text, new SqlParameter[]
        {
            SqlDb.CreateParamteterSQL("@IdMenu", chuongId, SqlDbType.BigInt),
            SqlDb.CreateParamteterSQL("@DocumentId", document.Id, SqlDbType.BigInt),
            SqlDb.CreateParamteterSQL("@Text", string.Join("", variable.Item2.Select(a => a.InnerText)), SqlDbType.NText),
            SqlDb.CreateParamteterSQL("@Html", string.Join("", variable.Item2.Select(a => a.OuterHtml)), SqlDbType.NText)
        });

        // One child menu entry per reference string, parented to the row inserted above.
        foreach (var VARIABLE in variable.Item3)
        {
            this.sqlDb.RunQuery(strQ, CommandType.Text, new[]
            {
                SqlDb.CreateParamteterSQL("ParentId", chuongId, SqlDbType.BigInt),
                SqlDb.CreateParamteterSQL("Ref", VARIABLE, SqlDbType.NVarChar),
                SqlDb.CreateParamteterSQL("DocumentId", document.Id, SqlDbType.BigInt),
            });
        }
    }
}
/// <summary>
/// Appends the received document to <c>listParserDocument</c>.
/// Presumably subscribed to a parser's "new data" notification - confirm at the subscription site.
/// </summary>
/// <param name="arg1">First callback argument; not used here.</param>
/// <param name="arg2">Document to append to the list.</param>
private void Parser_OnNewData(object arg1, Documet arg2)
{
    Documet parsedDocument = arg2;
    listParserDocument.Add(parsedDocument);
}
/// <summary>
/// Console entry point: prints the menu, reads a numeric choice from standard input and runs the
/// matching action (1 and 3 push jobs via <c>Test</c>; 2 and 4 start a consuming worker).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
private static void Main(string[] args)
{
    string str = "1. PushLinkParse. 2. RunWorkerProcess. 3.PushDocInfo. 4.WorkerDocInfo";
    Console.WriteLine(str);

    // Fix: Convert.ToInt32 threw FormatException on non-numeric input; TryParse also copes with
    // a null line (end of input) without throwing.
    int a;
    if (!int.TryParse(Console.ReadLine(), out a))
    {
        Console.WriteLine("Invalid choice.");
        return;
    }

    switch (a)
    {
        case 1:
            Test t = new Test();
            t.PushQueueAs();
            break;
        case 2:
            var v = new WorkerCrawler();
            v.StartConsume();
            break;
        case 3:
            Test t1 = new Test();
            t1.PushQueueAsDocInfo();
            break;
        case 4:
            var v1 = new WorkerDocInfo();
            v1.StartConsume();
            break;
        default:
            // Previously an unknown number exited silently.
            Console.WriteLine("Invalid choice.");
            break;
    }

    // The ad-hoc single-URL parsing code that used to follow an unconditional return here was
    // unreachable and has been removed.
}