Exemplo n.º 1
0
        /// <summary>
        /// Attempts to read the page as a "title + bullet + credit + content" template.
        /// Titles, bullets, body text and the credit (author, city, display date) are
        /// copied into <c>examinedDoc</c>. When all four parts are found the template
        /// type becomes <c>OLD_DOC_TEMPLATE_TYPE_009</c>; otherwise it becomes
        /// <c>OLD_DOC_TEMPLATE_TYPE_UNASSIGNED</c> and <c>examinedDoc</c> is reset.
        /// </summary>
        /// <remarks>
        /// The title and bullet parts count as found as soon as the selector returns a
        /// node collection; empty bullet cells are skipped but still satisfy the check.
        /// </remarks>
        private void GetTemplateTitleBulletCreditContent() {
            bool hasTitle = false;
            bool hasBullet = false;
            bool hasContent = false;
            bool hasCredit = false;

            // Titles: one Title entry per matching cell.
            var titleCells = page.DocumentNode.SelectNodes("//td[@name='tbTituloDoc']");
            if (titleCells != null) {
                foreach (var titleCell in titleCells) {
                    var title = new Title {
                        Name = MetadataName.OldDocTitleName,
                        Content = titleCell.InnerHtml
                    };
                    examinedDoc.Title.Add(title);
                }
                hasTitle = true;
            }

            // Bullets: non-empty cells only, with CR/LF pairs removed and the ends trimmed.
            var bulletCells = page.DocumentNode.SelectNodes("//td[@class='Subtitulo_Bala']");
            if (bulletCells != null) {
                foreach (var bulletCell in bulletCells) {
                    if (bulletCell.InnerHtml == "") {
                        continue;
                    }
                    var entry = new Bullet {
                        Name = MetadataName.OldDocBulletName,
                        Content = bulletCell.InnerHtml.Replace("\r\n", "").Trim()
                    };
                    examinedDoc.Bullet.Add(entry);
                }
                hasBullet = true;
            }

            // Body text: the concatenated outer HTML of the direct <p> children of the body cell.
            var bodyCell = page.DocumentNode.SelectSingleNode("//td[@class='Cuerpo_texto_nota_interior']");
            var bodyChildren = bodyCell != null ? bodyCell.ChildNodes : null;
            if (bodyChildren != null) {
                var bodyHtml = "";
                foreach (var child in bodyChildren) {
                    if (string.Equals(child.Name, "P", StringComparison.OrdinalIgnoreCase)) {
                        bodyHtml += child.OuterHtml;
                    }
                }
                if (bodyHtml.Length != 0) {
                    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                    examinedDoc.DocumentText.Content = bodyHtml;
                    hasContent = true;
                }
            }

            // Credit: accepted only when the table has exactly one child which in turn has
            // exactly six children; the values are read from positions 1, 3 and 5.
            // NOTE(review): presumably the even positions are whitespace/text nodes - confirm
            // against the real markup.
            var creditTable = page.DocumentNode.SelectSingleNode("//table[@class='Credito_Periodista']");
            var creditChildren = creditTable != null ? creditTable.ChildNodes : null;
            if (creditChildren != null && creditChildren.Count == 1) {
                var creditParts = creditChildren[0].ChildNodes;
                if (creditParts != null && creditParts.Count == 6) {
                    var author = creditParts[1].InnerText;
                    var city = creditParts[3].InnerText;
                    var displayDate = creditParts[5].InnerText;

                    // All three values must be non-empty for the credit to count.
                    if (!string.IsNullOrEmpty(author)
                        && !string.IsNullOrEmpty(city)
                        && !string.IsNullOrEmpty(displayDate)) {
                        examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
                        examinedDoc.CreditText.AuthorText = author;
                        examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
                        examinedDoc.CreditText.CityText = city;
                        examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
                        examinedDoc.CreditText.DisplayDateText = displayDate;
                        hasCredit = true;
                    }
                }
            }

            bool templateMatches = hasTitle && hasContent && hasBullet && hasCredit;
            if (!templateMatches) {
                // Discard whatever was partially collected above.
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
                examinedDoc.Reset();
            }
            else {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_009;
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Attempts to read the page as a "title + bullet + side image + content" template.
        /// When all four parts are found the template type becomes
        /// <c>OLD_DOC_TEMPLATE_TYPE_007</c>; otherwise it becomes
        /// <c>OLD_DOC_TEMPLATE_TYPE_UNASSIGNED</c> and <c>examinedDoc</c> is reset.
        /// </summary>
        /// <remarks>
        /// The title and bullet parts count as found as soon as the selector returns a
        /// node collection; empty bullet spans are skipped but still satisfy the check.
        /// </remarks>
        private void GetTemplateTitlePhotoContent() {
            bool hasTitle = false;
            bool hasBullet = false;
            bool hasPhoto = false;
            bool hasContent = false;

            // Titles: one Title entry per matching span.
            var titleSpans = page.DocumentNode.SelectNodes("//span[@class='Titulo']");
            if (titleSpans != null) {
                foreach (var titleSpan in titleSpans) {
                    var title = new Title {
                        Name = MetadataName.OldDocTitleName,
                        Content = titleSpan.InnerHtml
                    };
                    examinedDoc.Title.Add(title);
                }
                hasTitle = true;
            }

            // Bullets: non-empty spans only, content stored as-is.
            var bulletSpans = page.DocumentNode.SelectNodes("//span[@class='Balas']");
            if (bulletSpans != null) {
                foreach (var bulletSpan in bulletSpans) {
                    if (bulletSpan.InnerHtml == "") {
                        continue;
                    }
                    var entry = new Bullet {
                        Name = MetadataName.OldDocBulletName,
                        Content = bulletSpan.InnerHtml
                    };
                    examinedDoc.Bullet.Add(entry);
                }
                hasBullet = true;
            }

            // Side image: the first <td>/<img> whose src contains "BancoMedios/Imagenes".
            var imageNodes = page.DocumentNode.SelectNodes("//td/img");
            if (imageNodes != null) {
                var matchingSources = new List<string>();
                foreach (var imageNode in imageNodes) {
                    var srcAttribute = imageNode.Attributes["src"];
                    if (srcAttribute == null) {
                        continue;
                    }
                    var source = srcAttribute.Value;
                    if (source.Contains("BancoMedios/Imagenes")) {
                        matchingSources.Add(source);
                    }
                }
                if (matchingSources.Count != 0) {
                    examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
                    examinedDoc.ImageOnlySet.Content = matchingSources[0];
                    hasPhoto = true;
                }
            }

            // Content: located by walking siblings, starting from the node that follows
            // the parent of the first title span.
            var cursor = page.DocumentNode.SelectSingleNode("//span[@class='Titulo']");
            if (cursor != null) {
                cursor = cursor.ParentNode.NextSibling;

                // 1) Advance to the first <br> after the starting node (the starting
                //    node itself is never tested).
                if (cursor != null) {
                    do {
                        cursor = cursor.NextSibling;
                    } while (cursor != null
                             && !string.Equals(cursor.Name, "br", StringComparison.OrdinalIgnoreCase));
                }

                // 2) Step past that <br>, then advance to the next <table> or <br>.
                if (cursor != null) {
                    cursor = cursor.NextSibling;
                }
                while (cursor != null
                       && !string.Equals(cursor.Name, "table", StringComparison.OrdinalIgnoreCase)
                       && !string.Equals(cursor.Name, "br", StringComparison.OrdinalIgnoreCase)) {
                    cursor = cursor.NextSibling;
                }

                // 3) The sibling right after that marker supplies the content.
                if (cursor != null) {
                    cursor = cursor.NextSibling;
                }

                var contentHtml = cursor != null ? cursor.InnerHtml : "";
                if (contentHtml != "") {
                    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                    examinedDoc.DocumentText.Content = contentHtml;
                    hasContent = true;
                }
            }

            bool templateMatches = hasTitle && hasContent && hasBullet && hasPhoto;
            if (!templateMatches) {
                // Discard whatever was partially collected above.
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
                examinedDoc.Reset();
            }
            else {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_007;
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Attempts to read the page as a "title + bullet + content + side image with footer"
        /// template. When all four parts are found the template type becomes
        /// <c>OLD_DOC_TEMPLATE_TYPE_008</c>; otherwise it becomes
        /// <c>OLD_DOC_TEMPLATE_TYPE_UNASSIGNED</c> and <c>examinedDoc</c> is reset.
        /// </summary>
        /// <remarks>
        /// The title and bullet parts count as found as soon as the selector returns a
        /// node collection. The content is only searched for when at least one bullet
        /// marker image exists, because the search starts from the last marker.
        /// </remarks>
        private void GetTemplateTitleBulletPhotoFooterContent() {
            bool isTitleSet, isContentSet, isBulletSet, isLeftPhotoFooterSet;
            isTitleSet = isContentSet = isBulletSet = isLeftPhotoFooterSet = false;

            // Titles: one Title entry per matching div.
            var titles = page.DocumentNode.SelectNodes("//div[@class='Titulo']");
            if (titles != null) {
                foreach (var t in titles) {
                    Title tit = new Title();
                    tit.Name = MetadataName.OldDocTitleName;
                    tit.Content = t.InnerHtml;
                    examinedDoc.Title.Add(tit);
                }
                isTitleSet = true;
            }

            // Bullets: each marker image inside <p class="Balas"> is followed by the node
            // holding the bullet text.
            HtmlNode documentTextNode = null;
            var selectBullets = page.DocumentNode.SelectNodes("//p[@class='Balas']/img");
            if (selectBullets != null) {
                foreach (var bullet in selectBullets) {
                    var node = bullet.NextSibling;
                    // A marker image that is the last child has no following node;
                    // skip it instead of dereferencing null.
                    if (node != null && node.InnerHtml != "") {
                        Bullet bl = new Bullet();
                        bl.Name = MetadataName.OldDocBulletName;
                        bl.Content = node.InnerHtml.Replace("\r\n", "").Trim();
                        examinedDoc.Bullet.Add(bl);
                    }
                }
                isBulletSet = true;
                if (selectBullets.Count != 0) {
                    documentTextNode = selectBullets[selectBullets.Count - 1];
                }
            }

            // Content: starts at the sibling following the last bullet paragraph. That first
            // sibling is taken whatever its tag; after it only <p> siblings are appended.
            if (documentTextNode != null) {
                documentTextNode = documentTextNode.ParentNode.NextSibling;
                var txt = new System.Text.StringBuilder();
                while (documentTextNode != null) {
                    txt.Append(documentTextNode.OuterHtml);

                    // Move on to the next <p> sibling, if any.
                    documentTextNode = documentTextNode.NextSibling;
                    while (documentTextNode != null
                           && !string.Equals(documentTextNode.Name, "P", StringComparison.OrdinalIgnoreCase)) {
                        documentTextNode = documentTextNode.NextSibling;
                    }
                }
                if (txt.Length != 0) {
                    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                    examinedDoc.DocumentText.Content = txt.ToString();
                    isContentSet = true;
                }
            }

            // Photos: images whose src contains "BancoMedios/Imagenes" are paired, by
            // position, with the <td class="PieFoto"> footer cells.
            var images = page.DocumentNode.SelectNodes("//td/img");
            var footers = page.DocumentNode.SelectNodes("//td[@class='PieFoto']");
            if (images != null && footers != null) {
                var imageNameList = new List<string>();
                foreach (var img in images) {
                    var imgSrcRef = img.Attributes["src"];
                    if (imgSrcRef != null) {
                        var s = imgSrcRef.Value;
                        if (s.Contains("BancoMedios/Imagenes")) {
                            imageNameList.Add(s);
                        }
                    }
                }
                if (imageNameList.Count != 0) {
                    for (int i = 0; i < imageNameList.Count; i++) {
                        PhotoRelated pr = new PhotoRelated();
                        pr.ImageName = MetadataName.OldDocImageName;
                        pr.ImageSrc = imageNameList[i];
                        pr.FooterName = MetadataName.OldDocImageFooterName;
                        // There may be more matching images than footer cells; such
                        // images get an empty footer instead of an out-of-range access.
                        pr.Footer = i < footers.Count ? footers[i].InnerHtml : "";
                        examinedDoc.PhotoRelated.Add(pr);
                    }
                    isLeftPhotoFooterSet = true;
                }
            }

            if (isTitleSet && isContentSet && isBulletSet && isLeftPhotoFooterSet) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_008;
            }
            else {
                // Discard whatever was partially collected above.
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
                examinedDoc.Reset();
            }
        }
Exemplo n.º 4
0
 /// <summary>
 /// Collects the document bullets and records the template type they indicate.
 /// <c>span</c> nodes of class <c>Balas</c> are tried first
 /// (<c>OLD_DOC_TEMPLATE_TYPE_001</c>); only when that selector yields nothing are
 /// <c>td</c> cells of class <c>Subtitulo_Bala</c> used
 /// (<c>OLD_DOC_TEMPLATE_TYPE_004</c>). When neither selector matches, nothing is changed.
 /// </summary>
 private void RetrieveBullets() {
     var bulletNodes = page.DocumentNode.SelectNodes("//span[@class='Balas']");
     if (bulletNodes != null) {
         useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_001;
     }
     else {
         // Fall back to the table-cell flavour of bullets.
         bulletNodes = page.DocumentNode.SelectNodes("//td[@class='Subtitulo_Bala']");
         if (bulletNodes == null) {
             return;
         }
         useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_004;
     }

     // Both flavours are stored the same way: non-empty inner HTML, unmodified.
     foreach (var bulletNode in bulletNodes) {
         if (bulletNode.InnerHtml == "") {
             continue;
         }
         var entry = new Bullet {
             Name = MetadataName.OldDocBulletName,
             Content = bulletNode.InnerHtml
         };
         examinedDoc.Bullet.Add(entry);
     }
 }