/// <summary>
/// Collects the page titles into <c>examinedDoc.Title</c>.
/// Span elements with class 'Titulo' or 'titulo' are tried first (template type 001);
/// when that query yields nothing, td elements with class 'Titulo_Principal' are used
/// instead (template type 004). Nothing is modified when neither query matches.
/// </summary>
private void RetrieveTitles()
{
    var matchedType = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_001;
    var found = page.DocumentNode.SelectNodes("//span[@class='Titulo' or @class='titulo']");

    if (found == null)
    {
        // Fall back to the alternative title markup.
        matchedType = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_004;
        found = page.DocumentNode.SelectNodes("//td[@class='Titulo_Principal']");
    }

    if (found == null)
    {
        return;
    }

    useTemplateTypeInstead = matchedType;

    foreach (var titleNode in found)
    {
        var entry = new Title
        {
            Name = MetadataName.OldDocTitleName,
            Content = titleNode.InnerHtml
        };
        examinedDoc.Title.Add(entry);
    }
}
/// <summary>
/// Repairs migrated documents of the given template type that carry more than one title.
/// For every document whose status is "success", whose template type equals
/// <paramref name="templateType"/> and whose JSON holds two or more titles, the title list is
/// replaced by a single "Titulo_ITERWEB" title whose content is taken from the second title;
/// the document is then re-serialized and handed to <c>UpdateOldDoc</c>.
/// Progress and the affected sitemap ids are written to both the log and the console.
/// </summary>
/// <param name="templateType">Old Doc template to analyze</param>
public void OldDocValidateTemplateDuplicateTitleFixOnly(int templateType)
{
    if (log.IsDebugEnabled)
    {
        log.Debug("OldDocValidateTemplateDuplicateTitleFixOnly start");
    }

    string liner = "Executed OldDocValidateTemplateDuplicateTitleFixOnly() ";
    if (log.IsWarnEnabled)
    {
        log.Warn(liner);
    }
    Console.WriteLine(liner);

    List<IterwebMapInfo> processedIds = new List<IterwebMapInfo>();

    _se4DocList.Clear();
    Console.WriteLine("Loading documents from database");
    if (log.IsInfoEnabled)
    {
        log.Info("Loading documents from database");
    }
    LoadOldDocuments();

    liner = "Documents loaded=[" + _se4DocList.Count + "]";
    Console.WriteLine(liner);
    if (log.IsWarnEnabled)
    {
        log.Warn(liner);
    }

    foreach (var it in _se4DocList)
    {
        if (it.OldDocStatus != MigrateStatusCode.OLD_DOC_STATUS_CODE_SUCCESS)
        {
            continue;
        }
        if (it.OldDocTemplateType != templateType)
        {
            continue;
        }

        // Deserialization yields null for empty/"null" payloads; skip such rows instead of
        // aborting the whole run with a NullReferenceException.
        Doc storedDoc = string.IsNullOrEmpty(it.JsonContent)
            ? null
            : JsonConvert.DeserializeObject<Doc>(it.JsonContent);
        if (storedDoc == null || storedDoc.Title == null)
        {
            liner = "Skipped document without usable JSON content, id=[" + it.IdSitemap + "]";
            Console.WriteLine(liner);
            if (log.IsWarnEnabled)
            {
                log.Warn(liner);
            }
            continue;
        }

        if (storedDoc.Title.Count <= 1)
        {
            continue;
        }

        // Keep only the content of the second title, under the fixed name.
        string data = storedDoc.Title[1].Content;
        storedDoc.Title.Clear();

        Title tit = new Title();
        tit.Name = "Titulo_ITERWEB";
        tit.Content = data;
        storedDoc.Title.Add(tit);

        it.JsonContent = JsonConvert.SerializeObject(storedDoc);
        UpdateOldDoc(it);
        processedIds.Add(it);
    }

    liner = "Duplicates set to " + processedIds.Count;
    Console.WriteLine(liner);
    if (log.IsWarnEnabled)
    {
        log.Warn(liner);
    }

    liner = "Duplicated Title ids[" + processedIds.Select(id => id.IdSitemap).ToList().ToStringDelimited(",") + "]";
    Console.WriteLine(liner);
    if (log.IsWarnEnabled)
    {
        log.Warn(liner);
    }

    if (log.IsDebugEnabled)
    {
        log.Debug("OldDocValidateTemplateDuplicateTitleFixOnly end");
    }
}
/// <summary>
/// Tries to match Title, Credit/signature, Image Only and body Content only.
/// All four parts must be found for the match to succeed; on success the template type is
/// set to 013, otherwise everything gathered so far is discarded via <c>examinedDoc.Reset()</c>.
/// NOTE: This method is called only if 'examinedDoc' is empty.
/// </summary>
/// <returns>True if a match is found</returns>
private bool GetTemplateTitleImageOnlyBodyContentCreditOnlyContent()
{
    var isTitleSet = false;
    var isImageOnlySet = false;
    var isBodyContentSet = false;
    var isCreditSet = false;

    // Title(s): every h1 with id 'titulo'.
    var titles = page.DocumentNode.SelectNodes("//h1[@id='titulo']");
    if (titles != null)
    {
        foreach (var t in titles)
        {
            Title tit = new Title();
            tit.Name = MetadataName.OldDocTitleName;
            tit.Content = t.InnerHtml;
            examinedDoc.Title.Add(tit);
            isTitleSet = true;
        }
    }

    // Image: 'src' attribute of the first child of the media container.
    var mediaImage = page.DocumentNode.SelectSingleNode("//div[@class='medioIzquierdaNotaInterior']");
    if (mediaImage != null)
    {
        var imgRef = mediaImage.FirstChild;
        var imgRefSrc = imgRef != null ? imgRef.Attributes["src"] : null;
        if (imgRefSrc != null)
        {
            examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
            examinedDoc.ImageOnlySet.Content = imgRefSrc.Value;
            isImageOnlySet = true;
        }
    }

    // Credit: the first three child nodes of the signature div are read as
    // author, city and display date. A div with fewer than three children is
    // treated as "no credit" instead of dereferencing a null sibling.
    var signatureDiv = page.DocumentNode.SelectSingleNode("//div[@id='firma']");
    if (signatureDiv != null)
    {
        var authorNode = signatureDiv.FirstChild;
        var cityNode = authorNode != null ? authorNode.NextSibling : null;
        var dateNode = cityNode != null ? cityNode.NextSibling : null;

        if (authorNode != null && cityNode != null && dateNode != null)
        {
            var authorName = authorNode.InnerHtml.Replace("|", "").Trim();
            var cityName = cityNode.InnerHtml.Replace("|", "").Trim();
            var displayDateName = dateNode.InnerHtml.Replace("\r\n", "").Trim();

            if (displayDateName == "")
            {
                // Third slot empty: shift values one position (no author present).
                displayDateName = cityName;
                cityName = authorName;
                authorName = "";
            }

            examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
            examinedDoc.CreditText.AuthorText = authorName;
            examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
            examinedDoc.CreditText.CityText = cityName;
            examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
            examinedDoc.CreditText.DisplayDateText = displayDateName;
            isCreditSet = true;
        }
    }

    // Body content is stored in a context DIV.
    var contentTitle = page.DocumentNode.SelectSingleNode("//div[@class='tituloSubseccionPrincipal']");
    var contentText = page.DocumentNode.SelectSingleNode("//div[@class='contenidoContexto']");
    if (contentTitle != null && contentText != null)
    {
        var extractContentText = contentTitle.InnerHtml + contentText.InnerHtml;
        examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
        examinedDoc.DocumentText.Content = extractContentText.Trim();
        isBodyContentSet = true;
    }

    bool rslt = isTitleSet && isCreditSet && isImageOnlySet && isBodyContentSet;
    if (rslt)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_013;
    }
    else
    {
        // Clear any contents found so far as not all items were found.
        examinedDoc.Reset();
    }
    return rslt;
}
/// <summary>
/// Parses document to locate only Title, Credit, Body Content elements only.
/// All three parts must be found; on success the template type is set to 011.
/// </summary>
/// <remarks>
/// NOTE(review): on a failed match this method leaves whatever it already stored in
/// 'examinedDoc' untouched (no Reset call) - confirm callers clean up as needed.
/// </remarks>
/// <returns>True if a match is found</returns>
private bool GetTemplateTitletBodyContentCreditOnlyContent()
{
    var isTitleSet = false;
    var isCreditSet = false;
    var isBodyContentSet = false;

    // Title(s): every h1 with id 'titulo'.
    var titles = page.DocumentNode.SelectNodes("//h1[@id='titulo']");
    if (titles != null)
    {
        foreach (var t in titles)
        {
            Title tit = new Title();
            tit.Name = MetadataName.OldDocTitleName;
            tit.Content = t.InnerHtml;
            examinedDoc.Title.Add(tit);
            isTitleSet = true;
        }
    }

    // Credit: author is the first child of the signature div's first child; city and
    // display date are the next two siblings. Any missing node means "no credit"
    // rather than a NullReferenceException.
    var signatureDiv = page.DocumentNode.SelectSingleNode("//div[@id='firma']");
    if (signatureDiv != null)
    {
        var authorContainer = signatureDiv.FirstChild;
        var authorNode = authorContainer != null ? authorContainer.FirstChild : null;
        var cityNode = authorContainer != null ? authorContainer.NextSibling : null;
        var dateNode = cityNode != null ? cityNode.NextSibling : null;

        if (authorNode != null && cityNode != null && dateNode != null)
        {
            var authorName = authorNode.InnerHtml;
            var cityName = cityNode.InnerHtml.Replace("|", "").Trim();
            var displayDateName = dateNode.InnerHtml;

            examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
            examinedDoc.CreditText.AuthorText = authorName;
            examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
            examinedDoc.CreditText.CityText = cityName;
            examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
            examinedDoc.CreditText.DisplayDateText = displayDateName;
            isCreditSet = true;
        }
    }

    // Body: inner HTML of the div with id 'segundoParrafo'.
    var bodyContent = page.DocumentNode.SelectSingleNode("//div[@id='segundoParrafo']");
    if (bodyContent != null)
    {
        examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
        examinedDoc.DocumentText.Content = bodyContent.InnerHtml;
        isBodyContentSet = true;
    }

    bool rslt = isTitleSet && isCreditSet && isBodyContentSet;
    if (rslt)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_011;
    }
    return rslt;
}
/// <summary>
/// Attempts to read a title, bullets, body paragraphs and a journalist credit from the page.
/// The template (type 009) is accepted only when all four parts are present; otherwise the
/// template type is set to "unassigned" and 'examinedDoc' is reset.
/// </summary>
private void GetTemplateTitleBulletCreditContent()
{
    // --- Title ---
    var titleCells = page.DocumentNode.SelectNodes("//td[@name='tbTituloDoc']");
    var hasTitle = titleCells != null;
    if (hasTitle)
    {
        foreach (var cell in titleCells)
        {
            examinedDoc.Title.Add(new Title
            {
                Name = MetadataName.OldDocTitleName,
                Content = cell.InnerHtml
            });
        }
    }

    // --- Bullets (empty cells are ignored, but the part still counts as present) ---
    var bulletCells = page.DocumentNode.SelectNodes("//td[@class='Subtitulo_Bala']");
    var hasBullet = bulletCells != null;
    if (hasBullet)
    {
        foreach (var cell in bulletCells)
        {
            if (cell.InnerHtml == "")
            {
                continue;
            }

            examinedDoc.Bullet.Add(new Bullet
            {
                Name = MetadataName.OldDocBulletName,
                Content = cell.InnerHtml.Replace("\r\n", "").Trim()
            });
        }
    }

    // --- Body: outer HTML of every direct <p> child of the body cell ---
    var hasContent = false;
    var bodyCell = page.DocumentNode.SelectSingleNode("//td[@class='Cuerpo_texto_nota_interior']");
    var bodyChildren = bodyCell != null ? bodyCell.ChildNodes : null;
    if (bodyChildren != null)
    {
        var paragraphs = "";
        foreach (var child in bodyChildren)
        {
            if (child.Name.ToUpper() != "P")
            {
                continue;
            }
            paragraphs += child.OuterHtml;
        }

        if (!string.IsNullOrEmpty(paragraphs))
        {
            examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
            examinedDoc.DocumentText.Content = paragraphs;
            hasContent = true;
        }
    }

    // --- Credit: table with exactly one child holding exactly six nodes;
    //     positions 1, 3 and 5 carry author, city and display date. ---
    var hasCredit = false;
    var creditTable = page.DocumentNode.SelectSingleNode("//table[@class='Credito_Periodista']");
    var creditRows = creditTable != null ? creditTable.ChildNodes : null;
    if (creditRows != null && creditRows.Count == 1)
    {
        var creditParts = creditRows[0].ChildNodes;
        if (creditParts != null && creditParts.Count == 6)
        {
            var author = creditParts[1].InnerText;
            var city = creditParts[3].InnerText;
            var displayDate = creditParts[5].InnerText;

            var allPresent = !string.IsNullOrEmpty(author)
                             && !string.IsNullOrEmpty(city)
                             && !string.IsNullOrEmpty(displayDate);
            if (allPresent)
            {
                examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
                examinedDoc.CreditText.AuthorText = author;
                examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
                examinedDoc.CreditText.CityText = city;
                examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
                examinedDoc.CreditText.DisplayDateText = displayDate;
                hasCredit = true;
            }
        }
    }

    // --- Verdict ---
    if (!(hasTitle && hasContent && hasBullet && hasCredit))
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
        examinedDoc.Reset();
        return;
    }

    useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_009;
}
/// <summary>
/// Try to gather a title, a content, a bullet, and a side image with a footer. For template to be
/// considered usable (type 008), all these items must be present; otherwise the template type is
/// set to "unassigned" and 'examinedDoc' is reset.
/// </summary>
private void GetTemplateTitleBulletPhotoFooterContent()
{
    bool isTitleSet = false;
    bool isContentSet = false;
    bool isBulletSet = false;
    bool isLeftPhotoFooterSet = false;

    // Title(s).
    var titles = page.DocumentNode.SelectNodes("//div[@class='Titulo']");
    if (titles != null)
    {
        foreach (var t in titles)
        {
            Title tit = new Title();
            tit.Name = MetadataName.OldDocTitleName;
            tit.Content = t.InnerHtml;
            examinedDoc.Title.Add(tit);
        }
        isTitleSet = true;
    }

    // Bullets: each bullet text is the node right after a bullet image.
    HtmlNode lastBulletImage = null;
    var selectBullets = page.DocumentNode.SelectNodes("//p[@class='Balas']/img");
    if (selectBullets != null)
    {
        foreach (var bullet in selectBullets)
        {
            var node = bullet.NextSibling;
            // An image with nothing after it simply contributes no bullet.
            if (node != null && node.InnerHtml != "")
            {
                Bullet bl = new Bullet();
                bl.Name = MetadataName.OldDocBulletName;
                bl.Content = node.InnerHtml.Replace("\r\n", "").Trim();
                examinedDoc.Bullet.Add(bl);
            }
        }
        isBulletSet = true;

        if (selectBullets.Count != 0)
        {
            lastBulletImage = selectBullets[selectBullets.Count - 1];
        }
    }

    // Body text: starts at the sibling following the last bullet paragraph. That first
    // node is always taken; after it only <p> siblings are appended.
    if (lastBulletImage != null)
    {
        string txt = "";
        bool isFirstNode = true;
        for (var cursor = lastBulletImage.ParentNode.NextSibling; cursor != null; cursor = cursor.NextSibling)
        {
            if (isFirstNode || string.Equals(cursor.Name, "P", StringComparison.OrdinalIgnoreCase))
            {
                txt += cursor.OuterHtml;
            }
            isFirstNode = false;
        }

        if (txt != "")
        {
            examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
            examinedDoc.DocumentText.Content = txt;
            isContentSet = true;
        }
    }

    // Photos with footers: images from the media bank paired by position with footer cells.
    var images = page.DocumentNode.SelectNodes("//td/img");
    var footers = page.DocumentNode.SelectNodes("//td[@class='PieFoto']");
    if (images != null && footers != null)
    {
        List<String> imageNameList = new List<String>();
        foreach (var img in images)
        {
            var imgSrcRef = img.Attributes["src"];
            if (imgSrcRef != null && imgSrcRef.Value.Contains("BancoMedios/Imagenes"))
            {
                imageNameList.Add(imgSrcRef.Value);
            }
        }

        if (imageNameList.Count != 0)
        {
            // Only pair as many images as there are footers; indexing footers by the image
            // count alone would run past the end when the page has fewer footers than images.
            int pairCount = Math.Min(imageNameList.Count, footers.Count);
            for (int i = 0; i < pairCount; i++)
            {
                PhotoRelated pr = new PhotoRelated();
                pr.ImageName = MetadataName.OldDocImageName;
                pr.ImageSrc = imageNameList[i];
                pr.FooterName = MetadataName.OldDocImageFooterName;
                pr.Footer = footers[i].InnerHtml;
                examinedDoc.PhotoRelated.Add(pr);
            }
            isLeftPhotoFooterSet = true;
        }
    }

    if (isTitleSet && isContentSet && isBulletSet && isLeftPhotoFooterSet)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_008;
    }
    else
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
        examinedDoc.Reset();
    }
}
/// <summary>
/// Attempts to read a title, bullets, a side image and the body text from the page.
/// The template (type 007) is accepted only when all four parts are present; otherwise the
/// template type is set to "unassigned" and 'examinedDoc' is reset.
/// </summary>
private void GetTemplateTitlePhotoContent()
{
    // --- Title ---
    var titleSpans = page.DocumentNode.SelectNodes("//span[@class='Titulo']");
    var hasTitle = titleSpans != null;
    if (hasTitle)
    {
        foreach (var span in titleSpans)
        {
            examinedDoc.Title.Add(new Title
            {
                Name = MetadataName.OldDocTitleName,
                Content = span.InnerHtml
            });
        }
    }

    // --- Bullets (empty spans are ignored, but the part still counts as present) ---
    var bulletSpans = page.DocumentNode.SelectNodes("//span[@class='Balas']");
    var hasBullets = bulletSpans != null;
    if (hasBullets)
    {
        foreach (var span in bulletSpans)
        {
            if (span.InnerHtml == "")
            {
                continue;
            }

            examinedDoc.Bullet.Add(new Bullet
            {
                Name = MetadataName.OldDocBulletName,
                Content = span.InnerHtml
            });
        }
    }

    // --- Side image: first <td>/<img> whose src points into the media bank ---
    var hasSidePhoto = false;
    var cellImages = page.DocumentNode.SelectNodes("//td/img");
    if (cellImages != null)
    {
        var bankImages = new List<string>();
        foreach (var image in cellImages)
        {
            var srcAttribute = image.Attributes["src"];
            if (srcAttribute == null)
            {
                continue;
            }

            var src = srcAttribute.Value;
            if (src.Contains("BancoMedios/Imagenes"))
            {
                bankImages.Add(src);
            }
        }

        if (bankImages.Count != 0)
        {
            examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
            examinedDoc.ImageOnlySet.Content = bankImages[0];
            hasSidePhoto = true;
        }
    }

    // --- Body text: located by walking siblings, starting from the title's parent ---
    var hasContent = false;
    var cursor = page.DocumentNode.SelectSingleNode("//span[@class='Titulo']");
    if (cursor != null)
    {
        cursor = cursor.ParentNode.NextSibling;

        // 1) Advance (at least one step) until a <br> is reached or siblings run out.
        if (cursor != null)
        {
            do
            {
                cursor = cursor.NextSibling;
            }
            while (cursor != null && cursor.Name.ToLower() != "br");
        }

        // 2) Step past that <br>.
        if (cursor != null)
        {
            cursor = cursor.NextSibling;
        }

        // 3) Advance to the next <table> or <br>, whichever comes first.
        while (cursor != null && cursor.Name.ToLower() != "table" && cursor.Name.ToLower() != "br")
        {
            cursor = cursor.NextSibling;
        }

        // 4) The node right after it is taken as the body.
        if (cursor != null)
        {
            cursor = cursor.NextSibling;
        }

        var bodyHtml = cursor != null ? cursor.InnerHtml : "";
        if (bodyHtml != "")
        {
            examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
            examinedDoc.DocumentText.Content = bodyHtml;
            hasContent = true;
        }
    }

    // --- Verdict ---
    if (!(hasTitle && hasContent && hasBullets && hasSidePhoto))
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
        examinedDoc.Reset();
        return;
    }

    useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_007;
}
/// <summary>
/// Reads the photo gallery layout (template type 005) of the document being examined:
/// titles, an entry text, and the hidden-input triplets (photo, credit, footer) describing
/// each gallery picture. When the three hidden-input lists differ in length, the template
/// type is set to "unassigned".
/// </summary>
private void GetTemplatePhotoGallery()
{
    var titleSpans = page.DocumentNode.SelectNodes("//span[@class='AyudaLector_Titulo']");
    if (titleSpans != null)
    {
        foreach (var span in titleSpans)
        {
            examinedDoc.Title.Add(new Title
            {
                Name = MetadataName.OldDocTitleName,
                Content = span.InnerHtml
            });
        }
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_005;
    }

    // The rest of the template is only evaluated when a title is available.
    if (examinedDoc.Title.Count == 0)
    {
        return;
    }

    // Entry text: photo-footer cell first, generic top-left cell as fallback.
    var entryCell = page.DocumentNode.SelectSingleNode("//td[@class='PieFoto']");
    if (entryCell == null)
    {
        entryCell = page.DocumentNode.SelectSingleNode("//td[@align='left' and @valign='top']");
    }
    if (entryCell != null)
    {
        examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
        examinedDoc.DocumentText.Content = entryCell.InnerHtml;
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_005;
    }

    // Gallery entries are described by three parallel lists of hidden inputs.
    var photoInputs = page.DocumentNode.SelectNodes("//input[@type='hidden' and @name='FotoGrande']");
    var creditInputs = page.DocumentNode.SelectNodes("//input[@type='hidden' and @name='Credito']");
    var footerInputs = page.DocumentNode.SelectNodes("//input[@type='hidden' and @name='PieFoto']");
    if (photoInputs == null || creditInputs == null || footerInputs == null)
    {
        return;
    }

    var entryCount = photoInputs.Count;
    if (entryCount != creditInputs.Count || entryCount != footerInputs.Count)
    {
        useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
        return;
    }

    String[] validImgExtensions = { ".gif", ".jpg" };
    for (int index = 0; index < entryCount; index++)
    {
        var photoValue = photoInputs[index].Attributes["value"];
        var creditValue = creditInputs[index].Attributes["value"];
        var footerValue = footerInputs[index].Attributes["value"];

        // The big-photo name is derived from the same value by inserting "_g".
        var galleryEntry = new PhotoGallery();
        galleryEntry.PhotoBigName = MetadataName.OldDocPhotoGalleryPhotoBigName;
        galleryEntry.PhotoBigContent = photoValue.Value.IncludeInsideToImageName("_g", validImgExtensions).Trim();
        galleryEntry.PhotoSmallName = MetadataName.OldDocPhotoGalleryPhotoSmallName;
        galleryEntry.PhotoSmallContent = photoValue.Value.Trim();
        galleryEntry.PhotoCreditName = MetadataName.OldDocPhotoGalleryPhotoCreditName;
        galleryEntry.PhotoCreditContent = creditValue.Value;
        galleryEntry.PhotoFooterName = MetadataName.OldDocPhotoGalleryPhotoFooterName;
        galleryEntry.PhotoFooterContent = footerValue.Value;

        examinedDoc.PhotoGallery.Add(galleryEntry);
    }

    useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_005;
}
/// <summary>
/// Given the templateTypeCode parameter, it tries to catch other fields necessary for that template type
/// before giving up to not use the doc in that template type.
/// Only template types 010 and 012 are handled; any other code (or an unknown
/// <paramref name="kind"/>) yields false without touching 'examinedDoc'.
/// On success 'examinedDoc' receives the template type and the "success" status; on a failed
/// attempt the fields this method filled in are cleared again.
/// </summary>
/// <param name="html">The raw HTML if needed to extract from it</param>
/// <param name="kind">A number which indicates what portion to check to map against 'templateTypeCode'</param>
/// <param name="templateTypeCode">Match fields for this template type code.</param>
/// <returns>true if mapped document match field in template type code</returns>
private bool MapIntoExistingTemplate(string html, int kind, int templateTypeCode)
{
    bool rslt = false;

    switch (templateTypeCode)
    {
        case TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_010:
            rslt = MapIntoTemplateType010(html, kind);
            break;

        case TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_012:
            rslt = MapIntoTemplateType012(kind);
            break;
    }

    if (rslt)
    {
        examinedDoc.TemplateType = templateTypeCode;
        examinedDoc.Status = MigrateStatusCode.OLD_DOC_STATUS_CODE_SUCCESS;
    }
    return rslt;
}

/// <summary>Tries the layout variants of template type 010 (title + image + body text).</summary>
private bool MapIntoTemplateType010(string html, int kind)
{
    bool isTitleSet;
    bool isImageOnlySet;
    bool isBodyContentSet;
    bool rslt = false;

    switch (kind)
    {
        case 1:
            // Title (already set so far), so it is neither added nor cleared here.
            isTitleSet = true;
            isImageOnlySet = MapTrySetImageFromScript(html);
            isBodyContentSet = MapTrySetTextAfterSequenceImage();

            rslt = isTitleSet && isImageOnlySet && isBodyContentSet;
            if (!rslt)
            {
                MapClearDocumentText();
                MapClearImageOnly();
            }
            break;

        case 2:
            isTitleSet = MapTryAddTitle(page.DocumentNode.SelectSingleNode("//td[@class='TituloFicha']"));
            isImageOnlySet = MapTrySetFirstBankImage();
            isBodyContentSet = MapTrySetFichaBody();

            rslt = isTitleSet && isImageOnlySet && isBodyContentSet;
            if (!rslt)
            {
                MapClearDocumentText();
                MapClearImageOnly();
                examinedDoc.Title.Clear();
            }
            break;

        case 3:
            var secondaryTitles = page.DocumentNode.SelectNodes("//span[@class='TituloSecundario']");
            isTitleSet = MapTryAddSecondTitle(secondaryTitles);
            isImageOnlySet = MapTrySetFirstBankImage();
            isBodyContentSet = MapTrySetTextFollowingFirst(secondaryTitles);

            rslt = isTitleSet && isImageOnlySet && isBodyContentSet;
            if (!rslt)
            {
                MapClearDocumentText();
                MapClearImageOnly();
                examinedDoc.Title.Clear();
            }
            break;
    }

    return rslt;
}

/// <summary>Tries the layout variants of template type 012 (title + body text).</summary>
private bool MapIntoTemplateType012(int kind)
{
    bool isTitleSet;
    bool isBodyContentSet;

    switch (kind)
    {
        case 1:
            isTitleSet = MapTryAddTitle(page.DocumentNode.SelectSingleNode("//td[@class='TituloFicha']"));
            isBodyContentSet = MapTrySetFichaBody();
            break;

        case 2:
            var mainTitle = page.DocumentNode.SelectSingleNode("//span[@class='TituloPrincipal']");
            isTitleSet = MapTryAddTitle(mainTitle);

            // Body: plain text nodes that follow the title span.
            var plainText = "";
            if (mainTitle != null)
            {
                for (var n = mainTitle.NextSibling; n != null; n = n.NextSibling)
                {
                    if (n.Name == "#text")
                    {
                        plainText += n.InnerText;
                    }
                }
            }
            isBodyContentSet = plainText != "";
            if (isBodyContentSet)
            {
                MapSetDocumentText(plainText);
            }
            break;

        case 3:
            isTitleSet = MapTryAddTitle(page.DocumentNode.SelectSingleNode("//td[@class='Vitrinas-Vineta']"));

            // Body: inner HTML of every child of the text cell.
            var cellText = "";
            var textCell = page.DocumentNode.SelectSingleNode("//td[@class='VitrinaTexto']");
            if (textCell != null && textCell.HasChildNodes)
            {
                for (var n = textCell.FirstChild; n != null; n = n.NextSibling)
                {
                    cellText += n.InnerHtml;
                }
            }
            isBodyContentSet = cellText != "";
            if (isBodyContentSet)
            {
                MapSetDocumentText(cellText);
            }
            break;

        case 4:
            var secondaryTitles = page.DocumentNode.SelectNodes("//span[@class='TituloSecundario']");
            isTitleSet = MapTryAddSecondTitle(secondaryTitles);
            isBodyContentSet = MapTrySetTextFollowingFirst(secondaryTitles);
            break;

        default:
            // Unknown kind: nothing attempted, nothing to clean up.
            return false;
    }

    bool rslt = isTitleSet && isBodyContentSet;
    if (!rslt)
    {
        MapClearDocumentText();
        examinedDoc.Title.Clear();
    }
    return rslt;
}

/// <summary>Adds a title built from the node's inner text; returns false when the node is null.</summary>
private bool MapTryAddTitle(HtmlNode titleNode)
{
    if (titleNode == null)
    {
        return false;
    }

    Title tit = new Title();
    tit.Name = MetadataName.OldDocTitleName;
    tit.Content = titleNode.InnerText;
    examinedDoc.Title.Add(tit);
    return true;
}

/// <summary>Adds the second node of the collection as title; needs at least two nodes.</summary>
private bool MapTryAddSecondTitle(HtmlNodeCollection titleNodes)
{
    if (titleNodes == null || titleNodes.Count < 2)
    {
        return false;
    }
    return MapTryAddTitle(titleNodes[1]);
}

/// <summary>Stores as document text the inner HTML of all siblings following the first node.</summary>
private bool MapTrySetTextFollowingFirst(HtmlNodeCollection nodes)
{
    if (nodes == null || nodes.Count == 0)
    {
        return false;
    }

    var text = MapConcatFollowingInnerHtml(nodes[0]);
    if (text == "")
    {
        return false;
    }

    MapSetDocumentText(text);
    return true;
}

/// <summary>
/// Extracts the image reference embedded in page script between the "imagenes[1].src" and
/// "imagenes[2].src" markers (double-quoted first, single-quoted as fallback).
/// </summary>
private bool MapTrySetImageFromScript(string html)
{
    // NOTE: Image is contained inside a javascript function inside HTML,
    // that said, it is needed to be extracted by hand.
    if (string.IsNullOrEmpty(html))
    {
        return false;
    }

    var pos1 = html.IndexOf("imagenes[1].src", StringComparison.Ordinal);
    var pos2 = html.IndexOf("imagenes[2].src", StringComparison.Ordinal);

    // Both markers must exist and appear in order; otherwise Substring would get a negative length.
    if (pos1 == -1 || pos2 <= pos1)
    {
        return false;
    }

    var ss = html.Substring(pos1, pos2 - pos1 + 1);
    var extracted = ss.ExtractCharactersUsingDelimiters('"', '"');
    if (string.IsNullOrEmpty(extracted))
    {
        extracted = ss.ExtractCharactersUsingDelimiters('\'', '\'');
    }
    if (string.IsNullOrEmpty(extracted))
    {
        return false;
    }

    examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
    examinedDoc.ImageOnlySet.Content = extracted;
    return true;
}

/// <summary>Stores as document text the inner HTML of the node right after the 'secuencia' image.</summary>
private bool MapTrySetTextAfterSequenceImage()
{
    var sequenceImage = page.DocumentNode.SelectSingleNode("//img[@name='secuencia']");
    var textNode = sequenceImage != null ? sequenceImage.NextSibling : null;
    if (textNode == null)
    {
        // Image missing, or nothing follows it: no body text for this layout.
        return false;
    }

    MapSetDocumentText(textNode.InnerHtml);
    return true;
}

/// <summary>Stores the first td/img source that points into the media bank as the image-only set.</summary>
private bool MapTrySetFirstBankImage()
{
    var images = page.DocumentNode.SelectNodes("//td/img");
    if (images == null)
    {
        return false;
    }

    foreach (var img in images)
    {
        var imgSrcRef = img.Attributes["src"];
        if (imgSrcRef != null && imgSrcRef.Value.Contains("BancoMedios/Imagenes"))
        {
            examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
            examinedDoc.ImageOnlySet.Content = imgSrcRef.Value;
            return true;
        }
    }
    return false;
}

/// <summary>
/// Builds the "ficha" body: a re-emitted plain table followed by the text after the
/// 'Destacado' span wrapped in a paragraph. Succeeds when at least one of the two exists.
/// </summary>
private bool MapTrySetFichaBody()
{
    var tableHtml = MapBuildFichaTableHtml();

    var txt = "";
    var highlight = page.DocumentNode.SelectSingleNode("//span[@class='Destacado']");
    if (highlight != null)
    {
        txt = MapConcatFollowingInnerHtml(highlight);
    }

    // NOTE(review): previously the table string was pre-seeded with its opening tag, so the
    // body was always reported as found and an unclosed <table> could be stored. Now an
    // absent table contributes nothing, and a page with neither table nor text does not match.
    if (tableHtml == "" && txt == "")
    {
        return false;
    }

    var txtAll = tableHtml;
    if (txt != "")
    {
        txtAll += "<p>" + txt + "</p>";
    }

    MapSetDocumentText(txtAll);
    return true;
}

/// <summary>
/// Re-emits the rows of the '#202020' table as an unformatted table. In the first row that has
/// child nodes only the first two cells are emitted (23% / 77% wide). Returns "" when no row exists.
/// </summary>
private string MapBuildFichaTableHtml()
{
    var rows = page.DocumentNode.SelectNodes("//table[@bgcolor='#202020']/tr");
    if (rows == null)
    {
        return "";
    }

    var tableHtml = "<table width='100%' border='0' align='center' cellpadding='1' cellspacing='0'>";
    var firstTR = true;

    foreach (var row in rows)
    {
        tableHtml += "<tr>";
        if (row.HasChildNodes)
        {
            int numTD = 1;
            foreach (var cell in row.ChildNodes)
            {
                if (!string.Equals(cell.Name, "TD", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (!firstTR)
                {
                    tableHtml += "<td>" + cell.InnerText + "</td>";
                    continue;
                }

                // First row fixes the column widths; cells beyond the second are dropped.
                switch (numTD)
                {
                    case 1:
                        tableHtml += "<td width='23%'>" + cell.InnerText + "</td>";
                        break;
                    case 2:
                        tableHtml += "<td width='77%'>" + cell.InnerText + "</td>";
                        break;
                }
                numTD++;
            }
            firstTR = false;
        }
        tableHtml += "</tr>";
    }

    tableHtml += "</table>";
    return tableHtml;
}

/// <summary>Concatenates the inner HTML of every sibling that follows <paramref name="start"/>.</summary>
private static string MapConcatFollowingInnerHtml(HtmlNode start)
{
    var text = "";
    for (var n = start.NextSibling; n != null; n = n.NextSibling)
    {
        text += n.InnerHtml;
    }
    return text;
}

/// <summary>Stores the given content as the document text of 'examinedDoc'.</summary>
private void MapSetDocumentText(string content)
{
    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
    examinedDoc.DocumentText.Content = content;
}

/// <summary>Blanks the document text of 'examinedDoc' after a failed match.</summary>
private void MapClearDocumentText()
{
    examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
}

/// <summary>Blanks the image-only set of 'examinedDoc' after a failed match.</summary>
private void MapClearImageOnly()
{
    examinedDoc.ImageOnlySet.Name = examinedDoc.ImageOnlySet.Content = "";
}
/// <summary>
/// Attempts to match a page made of a title, a two-part body text and a photo gallery that
/// shares a single footer (template type 015). When any of the three parts is missing,
/// everything gathered is discarded via <c>examinedDoc.Reset()</c>.
/// NOTE: This method is called only if 'examinedDoc' is empty.
/// </summary>
/// <returns>True if a match is found</returns>
private bool GetTemplateTitleBodyContentPhotoFooterGalleryContent()
{
    var mainTitles = page.DocumentNode.SelectNodes("//span[@class='TituloPrincipal']");
    var hasTitle = mainTitles != null;
    var hasSecondTitle = hasTitle && mainTitles.Count >= 2;

    // --- Title: the first 'TituloPrincipal' span ---
    if (hasTitle)
    {
        examinedDoc.Title.Add(new Title
        {
            Name = MetadataName.OldDocTitleName,
            Content = mainTitles[0].InnerHtml
        });
    }

    // --- Gallery photos: every image whose src points into the media bank ---
    var allImages = page.DocumentNode.SelectNodes("//img");
    if (allImages != null)
    {
        var bankImages = new List<string>();
        foreach (var image in allImages)
        {
            var srcAttribute = image.Attributes["src"];
            if (srcAttribute == null)
            {
                continue;
            }

            var src = srcAttribute.Value;
            if (src.Contains("BancoMedios/Imagenes"))
            {
                bankImages.Add(src);
            }
        }

        foreach (var src in bankImages)
        {
            examinedDoc.PhotoFooterGallery.PhotoList.Add(new PhotoOnly
            {
                ImageName = MetadataName.OldDocImageName,
                ImageSrc = src
            });
        }
    }

    // --- Gallery footer: second title is the footer title; the footer text is everything
    //     after the first <p> found at or after that title ---
    if (hasSecondTitle)
    {
        examinedDoc.PhotoFooterGallery.FooterTitleName = MetadataName.OldDocPhotoOnlyGalleryImageFooterTitleName;
        examinedDoc.PhotoFooterGallery.FoooterTitle = mainTitles[1].InnerHtml.Trim();

        HtmlNode footerMarker = mainTitles[1];
        while (footerMarker != null && footerMarker.Name.ToUpper() != "P")
        {
            footerMarker = footerMarker.NextSibling;
        }

        if (footerMarker != null)
        {
            var footer = "";
            for (var part = footerMarker.NextSibling; part != null; part = part.NextSibling)
            {
                footer += part.InnerHtml.Replace("\r\n", "").Trim();
            }

            if (footer != "")
            {
                examinedDoc.PhotoFooterGallery.FooterName = MetadataName.OldDocPhotoOnlyGalleryImageFooterName;
                examinedDoc.PhotoFooterGallery.Footer = footer;
            }
        }
    }

    var hasGallery = !examinedDoc.PhotoFooterGallery.IsEmpty();

    // --- Body text, part one: first <p> at or after the first title (needs two titles) ---
    var firstPart = "";
    if (hasSecondTitle)
    {
        HtmlNode paragraph = mainTitles[0];
        while (paragraph != null && paragraph.Name.ToUpper() != "P")
        {
            paragraph = paragraph.NextSibling;
        }

        if (paragraph != null)
        {
            firstPart = paragraph.InnerHtml;
        }
    }

    // --- Body text, part two: the left-aligned paragraph ---
    var secondPart = "";
    var leftParagraph = page.DocumentNode.SelectSingleNode("//p[@align='left']");
    if (leftParagraph != null)
    {
        secondPart = leftParagraph.InnerHtml;
    }

    var hasBody = false;
    if (firstPart != "" && secondPart != "")
    {
        var combined = firstPart + secondPart;
        examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
        examinedDoc.DocumentText.Content = combined.Trim();
        hasBody = true;
    }

    // --- Verdict ---
    var matched = hasTitle && hasGallery && hasBody;
    if (!matched)
    {
        // Discard partial findings: the template needs every part.
        examinedDoc.Reset();
        return false;
    }

    useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_015;
    return true;
}