/// <summary>
/// Attempts to extract the titles, bullets, body text and journalist credit from the page.
/// The template is accepted (type 009) only when all four parts were found; otherwise the
/// template type is set to unassigned and the examined document is reset.
/// </summary>
private void GetTemplateTitleBulletCreditContent()
{
    bool hasTitle = false;
    bool hasBullet = false;
    bool hasContent = false;
    bool hasCredit = false;

    // Titles: every matching cell becomes one title entry.
    var titleCells = page.DocumentNode.SelectNodes("//td[@name='tbTituloDoc']");
    if (titleCells != null)
    {
        foreach (var cell in titleCells)
        {
            var title = new Title
            {
                Name = MetadataName.OldDocTitleName,
                Content = cell.InnerHtml
            };
            examinedDoc.Title.Add(title);
        }

        hasTitle = true;
    }

    // Bullets: empty cells are skipped, but the flag is raised as soon as the selector matched.
    var bulletCells = page.DocumentNode.SelectNodes("//td[@class='Subtitulo_Bala']");
    if (bulletCells != null)
    {
        foreach (var cell in bulletCells)
        {
            if (cell.InnerHtml == "")
            {
                continue;
            }

            var entry = new Bullet
            {
                Name = MetadataName.OldDocBulletName,
                Content = cell.InnerHtml.Replace("\r\n", "").Trim()
            };
            examinedDoc.Bullet.Add(entry);
        }

        hasBullet = true;
    }

    // Body text: concatenation of the outer HTML of every direct <p> child of the body cell.
    var bodyCell = page.DocumentNode.SelectSingleNode("//td[@class='Cuerpo_texto_nota_interior']");
    var bodyChildren = bodyCell == null ? null : bodyCell.ChildNodes;
    if (bodyChildren != null)
    {
        var html = "";
        foreach (var child in bodyChildren)
        {
            if (child.Name.ToUpper() == "P")
            {
                html += child.OuterHtml;
            }
        }

        if (!string.IsNullOrEmpty(html))
        {
            examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
            examinedDoc.DocumentText.Content = html;
            hasContent = true;
        }
    }

    // Credit: the credit table must have exactly one child, and that child exactly six
    // children; positions 1, 3 and 5 are read as author, city and display date.
    var creditTable = page.DocumentNode.SelectSingleNode("//table[@class='Credito_Periodista']");
    if (creditTable != null)
    {
        var outerChildren = creditTable.ChildNodes;
        if (outerChildren != null && outerChildren.Count == 1)
        {
            var innerChildren = outerChildren[0].ChildNodes;
            if (innerChildren != null && innerChildren.Count == 6)
            {
                var author = innerChildren[1].InnerText;
                var city = innerChildren[3].InnerText;
                var displayDate = innerChildren[5].InnerText;

                bool allPresent = !string.IsNullOrEmpty(author)
                    && !string.IsNullOrEmpty(city)
                    && !string.IsNullOrEmpty(displayDate);

                if (allPresent)
                {
                    examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
                    examinedDoc.CreditText.AuthorText = author;
                    examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
                    examinedDoc.CreditText.CityText = city;
                    examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
                    examinedDoc.CreditText.DisplayDateText = displayDate;
                    hasCredit = true;
                }
            }
        }
    }

    // Any missing part rejects the template and discards whatever was gathered so far.
    bool usable = hasTitle && hasContent && hasBullet && hasCredit;
    if (!usable)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
        examinedDoc.Reset();
    }
    else
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_009;
    }
}
/// <summary>
/// Attempts to extract the titles, bullets, a side image and the body text from the page.
/// The template is accepted (type 007) only when all four parts were found; otherwise the
/// template type is set to unassigned and the examined document is reset.
/// </summary>
private void GetTemplateTitlePhotoContent()
{
    bool hasTitle = false;
    bool hasBullet = false;
    bool hasPhoto = false;
    bool hasContent = false;

    // Titles: every matching span becomes one title entry.
    var titleSpans = page.DocumentNode.SelectNodes("//span[@class='Titulo']");
    if (titleSpans != null)
    {
        foreach (var span in titleSpans)
        {
            var title = new Title
            {
                Name = MetadataName.OldDocTitleName,
                Content = span.InnerHtml
            };
            examinedDoc.Title.Add(title);
        }

        hasTitle = true;
    }

    // Bullets: empty spans are skipped, but the flag is raised as soon as the selector matched.
    var bulletSpans = page.DocumentNode.SelectNodes("//span[@class='Balas']");
    if (bulletSpans != null)
    {
        foreach (var span in bulletSpans)
        {
            if (span.InnerHtml == "")
            {
                continue;
            }

            var entry = new Bullet
            {
                Name = MetadataName.OldDocBulletName,
                Content = span.InnerHtml
            };
            examinedDoc.Bullet.Add(entry);
        }

        hasBullet = true;
    }

    // Side image: collect every <td>/<img> whose src points into the media bank and keep
    // the first one.
    var imgNodes = page.DocumentNode.SelectNodes("//td/img");
    if (imgNodes != null)
    {
        var mediaBankSources = new List<string>();
        foreach (var imgNode in imgNodes)
        {
            var srcAttribute = imgNode.Attributes["src"];
            if (srcAttribute == null)
            {
                continue;
            }

            var src = srcAttribute.Value;
            if (src.Contains("BancoMedios/Imagenes"))
            {
                mediaBankSources.Add(src);
            }
        }

        if (mediaBankSources.Count != 0)
        {
            examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
            examinedDoc.ImageOnlySet.Content = mediaBankSources[0];
            hasPhoto = true;
        }
    }

    // Body text: located by walking the siblings that follow the title's parent node.
    var cursor = page.DocumentNode.SelectSingleNode("//span[@class='Titulo']");
    if (cursor != null)
    {
        cursor = cursor.ParentNode.NextSibling;

        // Step 1: move forward (the starting sibling itself is never tested) until the
        // first <br> is reached or the siblings run out.
        if (cursor != null)
        {
            do
            {
                cursor = cursor.NextSibling;
            }
            while (cursor != null && cursor.Name.ToLower() != "br");
        }

        // Step 2: step past that <br>.
        if (cursor != null)
        {
            cursor = cursor.NextSibling;
        }

        // Step 3: scan forward, current node included, to the next <table> or <br>.
        while (cursor != null)
        {
            var tag = cursor.Name.ToLower();
            if (tag == "table" || tag == "br")
            {
                break;
            }

            cursor = cursor.NextSibling;
        }

        // Step 4: the node immediately after that marker is taken as the body.
        if (cursor != null)
        {
            cursor = cursor.NextSibling;
        }

        var html = "";
        if (cursor != null)
        {
            html = cursor.InnerHtml;
        }

        if (html != "")
        {
            examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
            examinedDoc.DocumentText.Content = html;
            hasContent = true;
        }
    }

    // Any missing part rejects the template and discards whatever was gathered so far.
    bool usable = hasTitle && hasContent && hasBullet && hasPhoto;
    if (!usable)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
        examinedDoc.Reset();
    }
    else
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_007;
    }
}
/// <summary>
/// Attempts to extract the titles, bullets, body text and at least one image/footer pair
/// from the page. The template is accepted (type 008) only when all four parts were found;
/// otherwise the template type is set to unassigned and the examined document is reset.
/// </summary>
private void GetTemplateTitleBulletPhotoFooterContent()
{
    bool isTitleSet = false;
    bool isContentSet = false;
    bool isBulletSet = false;
    bool isLeftPhotoFooterSet = false;

    // Titles: every matching div becomes one title entry.
    var titles = page.DocumentNode.SelectNodes("//div[@class='Titulo']");
    if (titles != null)
    {
        foreach (var t in titles)
        {
            var tit = new Title
            {
                Name = MetadataName.OldDocTitleName,
                Content = t.InnerHtml
            };
            examinedDoc.Title.Add(tit);
        }

        isTitleSet = true;
    }

    // Bullets: each bullet is introduced by an <img> inside <p class='Balas'>; the bullet
    // text is taken from the node that immediately follows that image.
    HtmlNode lastBulletImage = null;
    var selectBullets = page.DocumentNode.SelectNodes("//p[@class='Balas']/img");
    if (selectBullets != null)
    {
        foreach (var bulletImage in selectBullets)
        {
            var textNode = bulletImage.NextSibling;

            // An image that is the last child of its paragraph has no following node;
            // skip it instead of dereferencing null.
            if (textNode != null && textNode.InnerHtml != "")
            {
                var bl = new Bullet
                {
                    Name = MetadataName.OldDocBulletName,
                    Content = textNode.InnerHtml.Replace("\r\n", "").Trim()
                };
                examinedDoc.Bullet.Add(bl);
            }
        }

        isBulletSet = true;

        if (selectBullets.Count != 0)
        {
            lastBulletImage = selectBullets[selectBullets.Count - 1];
        }
    }

    // Body text: starts at the sibling right after the last bullet paragraph. That first
    // sibling is appended whatever its tag; after it, only <p> siblings are appended.
    if (lastBulletImage != null)
    {
        HtmlNode documentTextNode = lastBulletImage.ParentNode.NextSibling;
        string txt = "";

        while (documentTextNode != null)
        {
            txt += documentTextNode.OuterHtml;
            documentTextNode = documentTextNode.NextSibling;

            // Skip forward to the next <p>. The comparison is ordinal and case-insensitive
            // so it does not depend on the current culture.
            while (documentTextNode != null
                && !string.Equals(documentTextNode.Name, "p", StringComparison.OrdinalIgnoreCase))
            {
                documentTextNode = documentTextNode.NextSibling;
            }
        }

        if (!string.IsNullOrEmpty(txt))
        {
            examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
            examinedDoc.DocumentText.Content = txt;
            isContentSet = true;
        }
    }

    // Photos with footers: media-bank images are paired with footer cells by position.
    var images = page.DocumentNode.SelectNodes("//td/img");
    var footers = page.DocumentNode.SelectNodes("//td[@class='PieFoto']");
    if (images != null && footers != null)
    {
        var imageNameList = new List<string>();
        foreach (var img in images)
        {
            var imgSrcRef = img.Attributes["src"];
            if (imgSrcRef != null)
            {
                var s = imgSrcRef.Value;
                if (s.Contains("BancoMedios/Imagenes"))
                {
                    imageNameList.Add(s);
                }
            }
        }

        // Stop at the shorter of the two lists: indexing footers by the image count alone
        // goes out of range when the page has fewer footers than matching images.
        int pairCount = 0;
        for (int i = 0; i < imageNameList.Count && i < footers.Count; i++)
        {
            var pr = new PhotoRelated
            {
                ImageName = MetadataName.OldDocImageName,
                ImageSrc = imageNameList[i],
                FooterName = MetadataName.OldDocImageFooterName,
                Footer = footers[i].InnerHtml
            };
            examinedDoc.PhotoRelated.Add(pr);
            pairCount++;
        }

        if (pairCount != 0)
        {
            isLeftPhotoFooterSet = true;
        }
    }

    // Any missing part rejects the template and discards whatever was gathered so far.
    if (isTitleSet && isContentSet && isBulletSet && isLeftPhotoFooterSet)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_008;
    }
    else
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
        examinedDoc.Reset();
    }
}
/// <summary>
/// Collects the document bullets. Bullets are looked up first as span elements of class
/// 'Balas' (template type 001) and, only if that selector yields nothing, as td elements
/// of class 'Subtitulo_Bala' (template type 004). When neither selector matches, nothing
/// is changed.
/// </summary>
private void RetrieveBullets()
{
    var bulletNodes = page.DocumentNode.SelectNodes("//span[@class='Balas']");
    if (bulletNodes != null)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_001;
    }
    else
    {
        // Fall back to the alternative bullet markup.
        bulletNodes = page.DocumentNode.SelectNodes("//td[@class='Subtitulo_Bala']");
        if (bulletNodes == null)
        {
            return;
        }

        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_004;
    }

    // Both layouts store bullets the same way; nodes with empty inner HTML are skipped.
    foreach (var bulletNode in bulletNodes)
    {
        if (bulletNode.InnerHtml == "")
        {
            continue;
        }

        var entry = new Bullet
        {
            Name = MetadataName.OldDocBulletName,
            Content = bulletNode.InnerHtml
        };
        examinedDoc.Bullet.Add(entry);
    }
}