コード例 #1
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
 /// <summary>
 /// Collects the page titles into <c>examinedDoc.Title</c>.
 /// Spans with class 'Titulo' or 'titulo' are looked up first and select template type 001;
 /// when that query yields nothing, cells with class 'Titulo_Principal' are used instead and
 /// select template type 004. When neither query yields nodes nothing is modified.
 /// </summary>
 private void RetrieveTitles() {
     var titleNodes = page.DocumentNode.SelectNodes("//span[@class='Titulo' or @class='titulo']");
     if (titleNodes != null) {
         useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_001;
     }
     else {
         // Fall back to the alternative title markup.
         titleNodes = page.DocumentNode.SelectNodes("//td[@class='Titulo_Principal']");
         if (titleNodes == null) {
             return;
         }
         useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_004;
     }

     // Every matched node becomes one title entry, in document order.
     foreach (var titleNode in titleNodes) {
         var entry = new Title {
             Name = MetadataName.OldDocTitleName,
             Content = titleNode.InnerHtml
         };
         examinedDoc.Title.Add(entry);
     }
 }
コード例 #2
0
ファイル: SE4DocMigrate.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Fixes old documents of one template type whose stored JSON holds more than one Title.
        /// Documents are reloaded through LoadOldDocuments; each one with status
        /// OLD_DOC_STATUS_CODE_SUCCESS and the requested template type is deserialized, and when it
        /// has more than one title its title list is replaced by a single "Titulo_ITERWEB" entry
        /// carrying the content of the second original title. The changed document is saved through
        /// UpdateOldDoc. Progress and the affected sitemap ids go to the log and the console.
        /// Documents whose JSON content is missing or does not deserialize to a Doc with a title
        /// list are skipped with a warning instead of aborting the run.
        /// </summary>
        /// <param name="templateType">Old Doc template to analyze</param>
        public void OldDocValidateTemplateDuplicateTitleFixOnly(int templateType) {
            if (log.IsDebugEnabled) log.Debug("OldDocValidateTemplateDuplicateTitleFixOnly start");

            string liner = "Executed OldDocValidateTemplateDuplicateTitleFixOnly() ";
            if (log.IsWarnEnabled) log.Warn(liner);
            Console.WriteLine(liner);

            List<IterwebMapInfo> processedIds = new List<IterwebMapInfo>();
            _se4DocList.Clear();
            Console.WriteLine("Loading documents from database");
            if (log.IsInfoEnabled) {
                log.Info("Loading documents from database");
            }
            LoadOldDocuments();

            liner = "Documents loaded=[" + _se4DocList.Count + "]";
            Console.WriteLine(liner);
            if (log.IsWarnEnabled) log.Warn(liner);
            foreach (var it in _se4DocList) {
                if (it.OldDocStatus != MigrateStatusCode.OLD_DOC_STATUS_CODE_SUCCESS) {
                    continue;
                }
                if (it.OldDocTemplateType != templateType) {
                    continue;
                }

                // Deserializing null throws and deserializing an empty string yields null, so
                // both cases are filtered out before touching the title list.
                Doc examinedDoc = string.IsNullOrEmpty(it.JsonContent)
                    ? null
                    : JsonConvert.DeserializeObject<Doc>(it.JsonContent);
                if (examinedDoc == null || examinedDoc.Title == null) {
                    if (log.IsWarnEnabled) log.Warn("Skipped document without usable JSON content, IdSitemap=[" + it.IdSitemap + "]");
                    continue;
                }

                if (examinedDoc.Title.Count > 1) {
                    // The second title is the one that is kept.
                    string data = examinedDoc.Title[1].Content;
                    examinedDoc.Title.Clear();

                    Title tit = new Title();
                    tit.Name = "Titulo_ITERWEB";
                    tit.Content = data;
                    examinedDoc.Title.Add(tit);

                    it.JsonContent = JsonConvert.SerializeObject(examinedDoc);
                    UpdateOldDoc(it);
                    processedIds.Add(it);
                }
            }
            liner = "Duplicates set to " + processedIds.Count;
            Console.WriteLine(liner);
            if (log.IsWarnEnabled) log.Warn(liner);
            liner = "Duplicated Title ids[" + processedIds.Select(id => id.IdSitemap).ToList().ToStringDelimited(",") + "]";
            Console.WriteLine(liner);
            if (log.IsWarnEnabled) log.Warn(liner);
            if (log.IsDebugEnabled) log.Debug("OldDocValidateTemplateDuplicateTitleFixOnly end");
        }
コード例 #3
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Tries to match Title, Credit/signature, Image Only and body Content only.
        /// All four parts must be found for the match to succeed; on success the template type is
        /// set to 013, otherwise everything gathered so far is discarded through
        /// <c>examinedDoc.Reset()</c>.
        /// NOTE: This method is called only if 'examinedDoc' is empty.
        /// </summary>
        /// <returns>True if a match is found</returns>
        private bool GetTemplateTitleImageOnlyBodyContentCreditOnlyContent() {
            var isTitleSet = false;
            var isImageOnlySet = false;
            var isBodyContentSet = false;
            var isCreditSet = false;

            var titles = page.DocumentNode.SelectNodes("//h1[@id='titulo']");
            if (titles != null) {
                foreach (var t in titles) {
                    Title tit = new Title();
                    tit.Name = MetadataName.OldDocTitleName;
                    tit.Content = t.InnerHtml;
                    examinedDoc.Title.Add(tit);
                    isTitleSet = true;
                }
            }

            // The image reference is read from the 'src' attribute of the first child of the media div.
            var mediaImage = page.DocumentNode.SelectSingleNode("//div[@class='medioIzquierdaNotaInterior']");
            if (mediaImage != null) {
                var imgRef = mediaImage.FirstChild;
                if (imgRef != null) {
                    var imgRefSrc = imgRef.Attributes["src"];
                    if (imgRefSrc != null) {
                        examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
                        examinedDoc.ImageOnlySet.Content = imgRefSrc.Value;
                        isImageOnlySet = true;
                    }
                }
            }

            var signatureDiv = page.DocumentNode.SelectSingleNode("//div[@id='firma']");
            if (signatureDiv != null) {
                // The signature is read from three consecutive nodes: author, city, display date.
                // A shorter signature is treated as "credit not found" instead of dereferencing
                // a missing sibling.
                var authorNode = signatureDiv.FirstChild;
                var cityNode = authorNode != null ? authorNode.NextSibling : null;
                var dateNode = cityNode != null ? cityNode.NextSibling : null;

                if (authorNode != null && cityNode != null && dateNode != null) {
                    var authorName = authorNode.InnerHtml.Replace("|", "").Trim();
                    var cityName = cityNode.InnerHtml.Replace("|", "").Trim();
                    var displayDateName = dateNode.InnerHtml.Replace("\r\n", "").Trim();

                    // When the third node is empty the values are shifted one position:
                    // the second node holds the date, the first the city, and there is no author.
                    if (displayDateName == "") {
                        displayDateName = cityName;
                        cityName = authorName;
                        authorName = "";
                    }

                    examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
                    examinedDoc.CreditText.AuthorText = authorName;
                    examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
                    examinedDoc.CreditText.CityText = cityName;
                    examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
                    examinedDoc.CreditText.DisplayDateText = displayDateName;
                    isCreditSet = true;
                }
            }

            // Body content is stored in a context DIV.
            var contentTitle = page.DocumentNode.SelectSingleNode("//div[@class='tituloSubseccionPrincipal']");
            var contentText = page.DocumentNode.SelectSingleNode("//div[@class='contenidoContexto']");
            if (contentTitle != null && contentText != null) {
                var extractContentText = contentTitle.InnerHtml + contentText.InnerHtml;
                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                examinedDoc.DocumentText.Content = extractContentText.Trim();
                isBodyContentSet = true;
            }

            bool rslt = isTitleSet && isCreditSet && isImageOnlySet && isBodyContentSet;
            if (rslt) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_013;
            }
            else {
                // Clear any contents found so far as none of all items were found.
                examinedDoc.Reset();
            }
            return rslt;
        }
コード例 #4
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Parses document to locate only Title, Credit, Body Content elements only.
        /// All three parts must be found for the match to succeed; on success the template type
        /// is set to 011. Unlike the sibling matchers, this method does not reset
        /// <c>examinedDoc</c> when the match fails.
        /// </summary>
        /// <returns>True if a match is found</returns>
        private bool GetTemplateTitletBodyContentCreditOnlyContent() {
            var isTitleSet = false;
            var isCreditSet = false;
            var isBodyContentSet = false;

            var titles = page.DocumentNode.SelectNodes("//h1[@id='titulo']");
            if (titles != null) {
                foreach (var t in titles) {
                    Title tit = new Title();
                    tit.Name = MetadataName.OldDocTitleName;
                    tit.Content = t.InnerHtml;
                    examinedDoc.Title.Add(tit);
                    isTitleSet = true;
                }
            }

            var signatureDiv = page.DocumentNode.SelectSingleNode("//div[@id='firma']");
            if (signatureDiv != null) {
                // The signature is read from three consecutive nodes; the author text sits in the
                // first child of the first node. Any missing node means "credit not found"
                // rather than a null dereference.
                var authorNode = signatureDiv.FirstChild;
                var authorInner = authorNode != null ? authorNode.FirstChild : null;
                var cityNode = authorNode != null ? authorNode.NextSibling : null;
                var dateNode = cityNode != null ? cityNode.NextSibling : null;

                if (authorInner != null && cityNode != null && dateNode != null) {
                    var authorName = authorInner.InnerHtml;
                    var cityName = cityNode.InnerHtml.Replace("|", "").Trim();
                    var displayDateName = dateNode.InnerHtml;

                    examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
                    examinedDoc.CreditText.AuthorText = authorName;
                    examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
                    examinedDoc.CreditText.CityText = cityName;
                    examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
                    examinedDoc.CreditText.DisplayDateText = displayDateName;
                    isCreditSet = true;
                }
            }

            var bodyContent = page.DocumentNode.SelectSingleNode("//div[@id='segundoParrafo']");
            if (bodyContent != null) {
                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                examinedDoc.DocumentText.Content = bodyContent.InnerHtml;
                isBodyContentSet = true;
            }

            bool rslt = isTitleSet && isCreditSet && isBodyContentSet;
            if (rslt) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_011;
            }

            return rslt;
        }
コード例 #5
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Attempts to read a title, bullets, a body text and a journalist credit from the page.
        /// The template (type 009) is accepted only when all four parts were found; otherwise the
        /// template type is set to unassigned and <c>examinedDoc</c> is reset.
        /// </summary>
        private void GetTemplateTitleBulletCreditContent() {
            var isTitleSet = false;
            var isBulletSet = false;
            var isContentSet = false;
            var isCreditSet = false;

            // Titles: every matching cell becomes one title entry.
            var titleCells = page.DocumentNode.SelectNodes("//td[@name='tbTituloDoc']");
            if (titleCells != null) {
                foreach (var cell in titleCells) {
                    var entry = new Title();
                    entry.Name = MetadataName.OldDocTitleName;
                    entry.Content = cell.InnerHtml;
                    examinedDoc.Title.Add(entry);
                }
                isTitleSet = true;
            }

            // Bullets: empty cells are ignored, but finding the cells at all counts as a match.
            var bulletCells = page.DocumentNode.SelectNodes("//td[@class='Subtitulo_Bala']");
            if (bulletCells != null) {
                foreach (var cell in bulletCells) {
                    if (cell.InnerHtml == "") {
                        continue;
                    }
                    var entry = new Bullet();
                    entry.Name = MetadataName.OldDocBulletName;
                    entry.Content = cell.InnerHtml.Replace("\r\n", "").Trim();
                    examinedDoc.Bullet.Add(entry);
                }
                isBulletSet = true;
            }

            // Body text: concatenation of the direct <p> children of the body cell.
            var bodyCell = page.DocumentNode.SelectSingleNode("//td[@class='Cuerpo_texto_nota_interior']");
            if (bodyCell != null) {
                var bodyChildren = bodyCell.ChildNodes;
                if (bodyChildren != null) {
                    var paragraphs = "";
                    foreach (var child in bodyChildren) {
                        if (child.Name.ToUpper() == "P") {
                            paragraphs += child.OuterHtml;
                        }
                    }
                    if (!string.IsNullOrEmpty(paragraphs)) {
                        examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                        examinedDoc.DocumentText.Content = paragraphs;
                        isContentSet = true;
                    }
                }
            }

            // Credit: the table must have exactly one child holding exactly six children;
            // author, city and date are read from positions 1, 3 and 5.
            var creditTable = page.DocumentNode.SelectSingleNode("//table[@class='Credito_Periodista']");
            if (creditTable != null) {
                var creditRows = creditTable.ChildNodes;
                if (creditRows != null && creditRows.Count == 1) {
                    var creditCells = creditRows[0].ChildNodes;
                    if (creditCells != null && creditCells.Count == 6) {
                        var authorText = creditCells[1].InnerText;
                        var cityText = creditCells[3].InnerText;
                        var dateText = creditCells[5].InnerText;

                        var allPresent = !string.IsNullOrEmpty(authorText)
                            && !string.IsNullOrEmpty(cityText)
                            && !string.IsNullOrEmpty(dateText);
                        if (allPresent) {
                            examinedDoc.CreditText.AuthorName = MetadataName.OldDocCreditAuthorName;
                            examinedDoc.CreditText.AuthorText = authorText;
                            examinedDoc.CreditText.CityName = MetadataName.OldDocCreditCityName;
                            examinedDoc.CreditText.CityText = cityText;
                            examinedDoc.CreditText.DisplayDateName = MetadataName.OldDocCreditDisplayDateName;
                            examinedDoc.CreditText.DisplayDateText = dateText;
                            isCreditSet = true;
                        }
                    }
                }
            }

            var matched = isTitleSet && isContentSet && isBulletSet && isCreditSet;
            if (!matched) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
                examinedDoc.Reset();
                return;
            }
            useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_009;
        }
コード例 #6
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Try to gather a title, a content, a bullet, and a side image with a footer. For template to be considered usable, all these
        /// items must be present; the template type is then set to 008. Otherwise the template type
        /// is set to unassigned and <c>examinedDoc</c> is reset.
        /// A bullet image without a following sibling is skipped, and images are only accepted
        /// when every one of them has a footer at the same position.
        /// </summary>
        private void GetTemplateTitleBulletPhotoFooterContent() {
            bool isTitleSet, isContentSet, isBulletSet, isLeftPhotoFooterSet;
            isTitleSet = isContentSet = isBulletSet = isLeftPhotoFooterSet = false;

            var titles = page.DocumentNode.SelectNodes("//div[@class='Titulo']");
            if (titles != null) {
                foreach (var t in titles) {
                    Title tit = new Title();
                    tit.Name = MetadataName.OldDocTitleName;
                    tit.Content = t.InnerHtml;
                    examinedDoc.Title.Add(tit);
                }
                isTitleSet = true;
            }

            // Bullet text is taken from the node that follows each bullet image.
            HtmlNode documentTextNode = null;
            var selectBullets = page.DocumentNode.SelectNodes("//p[@class='Balas']/img");
            if (selectBullets != null) {
                foreach (var bullet in selectBullets) {
                    var node = bullet.NextSibling;
                    // An image that is the last child of its paragraph has nothing to read.
                    if (node != null && node.InnerHtml != "") {
                        Bullet bl = new Bullet();
                        bl.Name = MetadataName.OldDocBulletName;
                        bl.Content = node.InnerHtml.Replace("\r\n", "").Trim();
                        examinedDoc.Bullet.Add(bl);
                    }
                }
                isBulletSet = true;
                if (selectBullets.Count != 0) {
                    documentTextNode = selectBullets[selectBullets.Count - 1];
                }
            }

            // The body text starts right after the paragraph holding the last bullet image.
            if (documentTextNode != null) {
                documentTextNode = documentTextNode.ParentNode.NextSibling;
                string txt = "";
                while (documentTextNode != null) {
                    // The first node is appended whatever its tag; afterwards only <p> siblings are.
                    txt += documentTextNode.OuterHtml;
                    documentTextNode = documentTextNode.NextSibling;
                    while (documentTextNode != null && documentTextNode.Name.ToUpper() != "P") {
                        documentTextNode = documentTextNode.NextSibling;
                    }
                }
                if (txt != "") {
                    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                    examinedDoc.DocumentText.Content = txt;
                    isContentSet = true;
                }
            }

            var images = page.DocumentNode.SelectNodes("//td/img");
            var footers = page.DocumentNode.SelectNodes("//td[@class='PieFoto']");
            if (images != null && footers != null) {
                // Only images served from the media bank are considered.
                List<String> imageNameList = new List<String>();
                foreach (var img in images) {
                    var imgSrcRef = img.Attributes["src"];
                    if (imgSrcRef != null) {
                        var s = imgSrcRef.Value;
                        if (s.Contains("BancoMedios/Imagenes")) {
                            imageNameList.Add(s);
                        }
                    }
                }
                // Images and footers are paired by position, so more images than footers cannot
                // be paired; that case is treated as "not matched" instead of indexing past the
                // end of the footer list.
                if (imageNameList.Count != 0 && imageNameList.Count <= footers.Count) {
                    for (int i = 0; i < imageNameList.Count; i++) {
                        PhotoRelated pr = new PhotoRelated();
                        pr.ImageName = MetadataName.OldDocImageName;
                        pr.ImageSrc = imageNameList[i];
                        pr.FooterName = MetadataName.OldDocImageFooterName;
                        pr.Footer = footers[i].InnerHtml;
                        examinedDoc.PhotoRelated.Add(pr);
                    }
                    isLeftPhotoFooterSet = true;
                }
            }
            if (isTitleSet && isContentSet && isBulletSet && isLeftPhotoFooterSet) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_008;
            }
            else {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
                examinedDoc.Reset();
            }
        }
コード例 #7
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Attempts to read a title, bullets, a side image and a body text from the page.
        /// The template (type 007) is accepted only when all four parts were found; otherwise the
        /// template type is set to unassigned and <c>examinedDoc</c> is reset.
        /// </summary>
        private void GetTemplateTitlePhotoContent() {
            var isTitleSet = false;
            var isBulletSet = false;
            var isLeftPhotoSet = false;
            var isContentSet = false;

            // Titles: every matching span becomes one title entry.
            var titleSpans = page.DocumentNode.SelectNodes("//span[@class='Titulo']");
            if (titleSpans != null) {
                foreach (var span in titleSpans) {
                    var entry = new Title();
                    entry.Name = MetadataName.OldDocTitleName;
                    entry.Content = span.InnerHtml;
                    examinedDoc.Title.Add(entry);
                }
                isTitleSet = true;
            }

            // Bullets: empty spans are ignored, but finding the spans at all counts as a match.
            var bulletSpans = page.DocumentNode.SelectNodes("//span[@class='Balas']");
            if (bulletSpans != null) {
                foreach (var span in bulletSpans) {
                    if (span.InnerHtml == "") {
                        continue;
                    }
                    var entry = new Bullet();
                    entry.Name = MetadataName.OldDocBulletName;
                    entry.Content = span.InnerHtml;
                    examinedDoc.Bullet.Add(entry);
                }
                isBulletSet = true;
            }

            // Side image: the first media-bank image found inside a table cell.
            var cellImages = page.DocumentNode.SelectNodes("//td/img");
            if (cellImages != null) {
                var mediaBankSources = new List<String>();
                foreach (var image in cellImages) {
                    var srcAttribute = image.Attributes["src"];
                    if (srcAttribute == null) {
                        continue;
                    }
                    var source = srcAttribute.Value;
                    if (source.Contains("BancoMedios/Imagenes")) {
                        mediaBankSources.Add(source);
                    }
                }
                if (mediaBankSources.Count != 0) {
                    examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
                    examinedDoc.ImageOnlySet.Content = mediaBankSources[0];
                    isLeftPhotoSet = true;
                }
            }

            // Body text: located by walking the siblings that follow the title's parent node.
            var cursor = page.DocumentNode.SelectSingleNode("//span[@class='Titulo']");
            if (cursor != null) {
                cursor = cursor.ParentNode.NextSibling;

                // 1) Advance to the first <br> found after the starting sibling.
                if (cursor != null) {
                    do {
                        cursor = cursor.NextSibling;
                    } while (cursor != null && cursor.Name.ToLower() != "br");
                }

                // 2) Step past that <br>.
                if (cursor != null) {
                    cursor = cursor.NextSibling;
                }

                // 3) Advance to the next <table> or <br>, the current node included.
                while (cursor != null
                       && cursor.Name.ToLower() != "table"
                       && cursor.Name.ToLower() != "br") {
                    cursor = cursor.NextSibling;
                }

                // 4) The node right after it is the one holding the text.
                if (cursor != null) {
                    cursor = cursor.NextSibling;
                }

                var bodyHtml = cursor != null ? cursor.InnerHtml : "";
                if (bodyHtml != "") {
                    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                    examinedDoc.DocumentText.Content = bodyHtml;
                    isContentSet = true;
                }
            }

            var matched = isTitleSet && isContentSet && isBulletSet && isLeftPhotoSet;
            if (!matched) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
                examinedDoc.Reset();
                return;
            }
            useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_007;
        }
コード例 #8
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Get the Photo Gallery section about the document being examined.
        /// Titles come from 'AyudaLector_Titulo' spans. Only when at least one title exists are
        /// the entry text and the gallery items read. Gallery items are built from three
        /// parallel lists of hidden inputs (FotoGrande, Credito, PieFoto); when the lists differ
        /// in length the template type is set to unassigned, otherwise to 005.
        /// </summary>
        private void GetTemplatePhotoGallery() {
            var titles = page.DocumentNode.SelectNodes("//span[@class='AyudaLector_Titulo']");
            if (titles != null) {
                foreach (var t in titles) {
                    Title tit = new Title();
                    tit.Name = MetadataName.OldDocTitleName;
                    tit.Content = t.InnerHtml;
                    examinedDoc.Title.Add(tit);
                }
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_005;
            }

            // Let's continue evaluating this template compilation only if there is a title set.
            if (examinedDoc.Title.Count == 0) {
                return;
            }

            // The entry text is the photo footer cell or, failing that, a top-left aligned cell.
            var docEntryText = page.DocumentNode.SelectSingleNode("//td[@class='PieFoto']");
            if (docEntryText == null) {
                docEntryText = page.DocumentNode.SelectSingleNode("//td[@align='left' and @valign='top']");
            }
            if (docEntryText != null) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_005;
                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                examinedDoc.DocumentText.Content = docEntryText.InnerHtml;
            }

            var bigPhotoList = page.DocumentNode.SelectNodes("//input[@type='hidden' and @name='FotoGrande']");
            var creditList = page.DocumentNode.SelectNodes("//input[@type='hidden' and @name='Credito']");
            var footerList = page.DocumentNode.SelectNodes("//input[@type='hidden' and @name='PieFoto']");
            if (bigPhotoList == null || creditList == null || footerList == null) {
                return;
            }

            // The three lists are paired by position, so they must have the same length.
            var cntBigPhotoList = bigPhotoList.Count;
            if (cntBigPhotoList != creditList.Count || cntBigPhotoList != footerList.Count) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_UNASSIGNED;
                return;
            }

            String[] validImgExtensions = { ".gif", ".jpg" };
            for (int i = 0; i < cntBigPhotoList; i++) {
                var photoVal = bigPhotoList[i].Attributes["value"];
                var creditVal = creditList[i].Attributes["value"];
                var footerVal = footerList[i].Attributes["value"];

                // An input without a 'value' attribute yields null here. An entry without a
                // photo reference is skipped; a missing credit or footer becomes an empty text.
                if (photoVal == null) {
                    continue;
                }

                PhotoGallery pg = new PhotoGallery() {
                    PhotoBigName = MetadataName.OldDocPhotoGalleryPhotoBigName,
                    // The big photo name is derived from the stored value by inserting "_g".
                    PhotoBigContent = photoVal.Value.IncludeInsideToImageName("_g", validImgExtensions).Trim(),
                    PhotoSmallName = MetadataName.OldDocPhotoGalleryPhotoSmallName,
                    PhotoSmallContent = photoVal.Value.Trim(),
                    PhotoCreditName = MetadataName.OldDocPhotoGalleryPhotoCreditName,
                    PhotoCreditContent = creditVal != null ? creditVal.Value : "",
                    PhotoFooterName = MetadataName.OldDocPhotoGalleryPhotoFooterName,
                    PhotoFooterContent = footerVal != null ? footerVal.Value : ""
                };
                examinedDoc.PhotoGallery.Add(pg);
            }
            useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_005;
        }
コード例 #9
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Given the templateTypeCode parameter, it tries to catch other fields necessary for that template type
        /// before giving up to not use the doc in that template type.
        /// </summary>
        /// <param name="html">The raw HTML if needed to extract from it</param>
        /// <param name="kind">A number which indicates what portion to check to map against 'templateTypeCode'</param>
        /// <param name="templateTypeCode">Match fields for this template type code.</param>
        /// <returns>true if mapped document match field in template type code</returns>
        private bool MapIntoExistingTemplate(string html, int kind, int templateTypeCode) {
            bool rslt = false;
            HtmlNode node = null;
            HtmlNode title = null;
            HtmlNodeCollection nodes = null;
            HtmlNodeCollection images = null;
            bool isTitleSet = false;
            bool isImageOnlySet = false;
            bool isBodyContentSet = false;
            bool firstTR = true;
            List<String> imageNameList = null;
            string tableHtml = null;
            string txt = "";
            string txtAll = "";

            switch (templateTypeCode) {
                case TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_010:
                    switch (kind) {
                        case 1:
                            // Title (already set so far)
                            isTitleSet = true;

                            // ImageOnly
                            // NOTE: Image is contained inside a javascript function inside HTML, 
                            // that said, it is needed to be extracted by hand.
                            string ss = "";
                            var pos1 = html.IndexOf("imagenes[1].src");
                            var pos2 = html.IndexOf("imagenes[2].src");

                            if (pos1 != -1 && pos2 != -1) {
                                ss = html.Substring(pos1, pos2 - pos1 + 1);
                                if (ss != "") {
                                    var extracted = ss.ExtractCharactersUsingDelimiters('"', '"');
                                    if (extracted == "") {
                                        extracted = ss.ExtractCharactersUsingDelimiters('\'', '\'');
                                    }
                                    if (extracted != "") {
                                        examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
                                        examinedDoc.ImageOnlySet.Content = extracted;
                                        isImageOnlySet = true;
                                    }
                                }
                            }

                            // DocumentText
                            node = page.DocumentNode.SelectSingleNode("//img[@name='secuencia']");
                            if (node != null) {
                                node = node.NextSibling;
                                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                                examinedDoc.DocumentText.Content = node.InnerHtml;
                                isBodyContentSet = true;
                            }
                            rslt = isTitleSet && isImageOnlySet && isBodyContentSet;

                            // If not matched then clean up.
                            if (!rslt) {
                                examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
                                examinedDoc.ImageOnlySet.Name = examinedDoc.ImageOnlySet.Content = "";
                            }
                            break;
                        case 2:
                            // Let's try Title
                            title = page.DocumentNode.SelectSingleNode("//td[@class='TituloFicha']");
                            if (title != null) {
                                Title tit = new Title();
                                tit.Name = MetadataName.OldDocTitleName;
                                tit.Content = title.InnerText;
                                examinedDoc.Title.Add(tit);
                                isTitleSet = true;
                            }

                            // Let's try ImageOnly
                            images = page.DocumentNode.SelectNodes("//td/img");
                            imageNameList = new List<String>();
                            if (images != null) {
                                imageNameList = new List<string>();
                                foreach (var img in images) {
                                    var imgSrcRef = img.Attributes["src"];
                                    if (imgSrcRef != null) {
                                        var s = imgSrcRef.Value;
                                        if (s.Contains("BancoMedios/Imagenes")) {
                                            imageNameList.Add(s);
                                        }
                                    }
                                }
                                if (imageNameList.Count != 0) {
                                    examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
                                    examinedDoc.ImageOnlySet.Content = imageNameList[0];
                                    isImageOnlySet = true;
                                }
                            }

                            // Let's try DocumentText (here, DocumentText is two parts, one for a table and second a text.
                            // table is to be emitted unformatted.
                            tableHtml = "<table width='100%' border='0' align='center' cellpadding='1' cellspacing='0'>";
                            txt = "";
                            firstTR = true;
                            nodes = page.DocumentNode.SelectNodes("//table[@bgcolor='#202020']/tr");
                            if (nodes != null) {
                                foreach (var n in nodes) {
                                    tableHtml += "<tr>";
                                    if (n.HasChildNodes) {
                                        var children = n.ChildNodes;
                                        int numTD = 1;
                                        foreach (var ntd in children) {
                                            if (ntd.Name.ToUpper() == "TD") {
                                                if (firstTR) {
                                                    switch (numTD) {
                                                        case 1:
                                                            tableHtml += "<td width='23%'>" + ntd.InnerText + "</td>";
                                                            break;
                                                        case 2:
                                                            tableHtml += "<td width='77%'>" + ntd.InnerText + "</td>";
                                                            break;
                                                    }
                                                    numTD++;
                                                }
                                                else {
                                                    tableHtml += "<td>" + ntd.InnerText + "</td>";
                                                }
                                            }
                                        }
                                        firstTR = false;
                                    }
                                    tableHtml += "</tr>";
                                }
                                tableHtml += "</table>";
                            }

                            node = page.DocumentNode.SelectSingleNode("//span[@class='Destacado']");
                            if (node != null) {
                                txt = "";
                                node = node.NextSibling;
                                while (node != null) {
                                    txt += node.InnerHtml;
                                    node = node.NextSibling;
                                }
                            }
                            txtAll = "";
                            if (tableHtml != "" && txt != "") {
                                txtAll = tableHtml + "<p>" + txt + "</p>";
                                isBodyContentSet = true;
                            }
                            else {
                                if (tableHtml != "") {
                                    txtAll = tableHtml;
                                    isBodyContentSet = true;
                                }
                                else {
                                    txtAll = "<p>" + txt + "</p>";
                                }
                            }
                            if (isBodyContentSet) {
                                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                                examinedDoc.DocumentText.Content = txtAll;
                                isBodyContentSet = true;
                            }
                            rslt = isTitleSet && isImageOnlySet && isBodyContentSet;
                            if (!rslt) {
                                examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
                                examinedDoc.ImageOnlySet.Name = examinedDoc.ImageOnlySet.Content = "";
                                examinedDoc.Title.Clear();
                            }
                            break;
                        case 3:
                            // Title
                            var titles = page.DocumentNode.SelectNodes("//span[@class='TituloSecundario']");
                            nodes = titles;
                            if (titles != null && titles.Count >= 2) {
                                Title tit = new Title();
                                tit.Name = MetadataName.OldDocTitleName;
                                tit.Content = titles[1].InnerText;
                                examinedDoc.Title.Add(tit);
                                isTitleSet = true;
                            }

                            // Let's try ImageOnly
                            images = page.DocumentNode.SelectNodes("//td/img");
                            imageNameList = new List<String>();
                            if (images != null) {
                                imageNameList = new List<string>();
                                foreach (var img in images) {
                                    var imgSrcRef = img.Attributes["src"];
                                    if (imgSrcRef != null) {
                                        var s = imgSrcRef.Value;
                                        if (s.Contains("BancoMedios/Imagenes")) {
                                            imageNameList.Add(s);
                                        }
                                    }
                                }
                                if (imageNameList.Count != 0) {
                                    examinedDoc.ImageOnlySet.Name = MetadataName.OldDocImageOnlySetName;
                                    examinedDoc.ImageOnlySet.Content = imageNameList[0];
                                    isImageOnlySet = true;
                                }
                            }

                            // DocumentText
                            if (nodes != null) {
                                node = nodes[0].NextSibling;
                                txtAll = "";
                                while (node != null) {
                                    txtAll += node.InnerHtml;
                                    node = node.NextSibling;
                                }
                                if (txtAll != "") {
                                    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                                    examinedDoc.DocumentText.Content = txtAll;
                                    isBodyContentSet = true;
                                }
                            }

                            rslt = isTitleSet && isImageOnlySet && isBodyContentSet;
                            if (!rslt) {
                                examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
                                examinedDoc.ImageOnlySet.Name = examinedDoc.ImageOnlySet.Content = "";
                                examinedDoc.Title.Clear();
                            }
                            break;
                    }
                    break;
                case TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_012:
                    switch (kind) {
                        case 1:
                            // Let's try Title
                            title = page.DocumentNode.SelectSingleNode("//td[@class='TituloFicha']");
                            if (title != null) {
                                Title tit = new Title();
                                tit.Name = MetadataName.OldDocTitleName;
                                tit.Content = title.InnerText;
                                examinedDoc.Title.Add(tit);
                                isTitleSet = true;
                            }

                            // Let's try DocumentText (here, DocumentText is two parts, one for a table and second a text.
                            // table is to be emitted unformatted.
                            tableHtml = "<table width='100%' border='0' align='center' cellpadding='1' cellspacing='0'>";
                            txt = "";
                            firstTR = true;
                            nodes = page.DocumentNode.SelectNodes("//table[@bgcolor='#202020']/tr");
                            if (nodes != null) {
                                foreach (var n in nodes) {
                                    tableHtml += "<tr>";
                                    if (n.HasChildNodes) {
                                        var children = n.ChildNodes;
                                        int numTD = 1;
                                        foreach (var ntd in children) {
                                            if (ntd.Name.ToUpper() == "TD") {
                                                if (firstTR) {
                                                    switch (numTD) {
                                                        case 1:
                                                            tableHtml += "<td width='23%'>" + ntd.InnerText + "</td>";
                                                            break;
                                                        case 2:
                                                            tableHtml += "<td width='77%'>" + ntd.InnerText + "</td>";
                                                            break;
                                                    }
                                                    numTD++;
                                                }
                                                else {
                                                    tableHtml += "<td>" + ntd.InnerText + "</td>";
                                                }
                                            }
                                        }
                                        firstTR = false;
                                    }
                                    tableHtml += "</tr>";
                                }
                                tableHtml += "</table>";
                            }

                            node = page.DocumentNode.SelectSingleNode("//span[@class='Destacado']");
                            if (node != null) {
                                txt = "";
                                node = node.NextSibling;
                                while (node != null) {
                                    txt += node.InnerHtml;
                                    node = node.NextSibling;
                                }
                            }
                            txtAll = "";
                            if (tableHtml != "" && txt != "") {
                                txtAll = tableHtml + "<p>" + txt + "</p>";
                                isBodyContentSet = true;
                            }
                            else {
                                if (tableHtml != "") {
                                    txtAll = tableHtml;
                                    isBodyContentSet = true;
                                }
                                else {
                                    txtAll = "<p>" + txt + "</p>";
                                }
                            }
                            if (isBodyContentSet) {
                                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                                examinedDoc.DocumentText.Content = txtAll;
                                isBodyContentSet = true;
                            }
                            rslt = isTitleSet && isBodyContentSet;
                            if (!rslt) {
                                examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
                                examinedDoc.Title.Clear();
                            }
                            break;
                        case 2:
                            // Title
                            title = page.DocumentNode.SelectSingleNode("//span[@class='TituloPrincipal']");
                            if (title != null) {
                                Title tit = new Title();
                                tit.Name = MetadataName.OldDocTitleName;
                                tit.Content = title.InnerText;
                                examinedDoc.Title.Add(tit);
                                isTitleSet = true;
                            }

                            // DocumentText
                            if (title != null) {
                                node = title.NextSibling;
                            }
                            else {
                                node = null;
                            }
                            txtAll = "";
                            while (node != null) {
                                if (node.Name == "#text") {
                                    txtAll += node.InnerText;
                                }
                                node = node.NextSibling;
                            }
                            if (txtAll != "") {
                                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                                examinedDoc.DocumentText.Content = txtAll;
                                isBodyContentSet = true;
                            }
                            rslt = isTitleSet && isBodyContentSet;
                            if (!rslt) {
                                examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
                                examinedDoc.Title.Clear();
                            }
                            break;
                        case 3:
                            // Title
                            node = page.DocumentNode.SelectSingleNode("//td[@class='Vitrinas-Vineta']");
                            if (node != null) {
                                Title tit = new Title();
                                tit.Name = MetadataName.OldDocTitleName;
                                tit.Content = node.InnerText;
                                examinedDoc.Title.Add(tit);
                                isTitleSet = true;
                            }

                            // DocumentText
                            txtAll = "";
                            node = page.DocumentNode.SelectSingleNode("//td[@class='VitrinaTexto']");
                            if (node != null) {
                                if (node.HasChildNodes) {
                                    node = node.FirstChild;
                                    while (node != null) {
                                        txtAll += node.InnerHtml;
                                        node = node.NextSibling;
                                    }
                                    if (txtAll != "") {
                                        examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                                        examinedDoc.DocumentText.Content = txtAll;
                                        isBodyContentSet = true;
                                    }
                                }
                            }
                            rslt = isTitleSet && isBodyContentSet;
                            if (!rslt) {
                                examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
                                examinedDoc.Title.Clear();
                            }
                            break;
                        case 4:
                            // Title
                            var titles = page.DocumentNode.SelectNodes("//span[@class='TituloSecundario']");
                            nodes = titles;
                            if (titles != null && titles.Count >= 2) {
                                Title tit = new Title();
                                tit.Name = MetadataName.OldDocTitleName;
                                tit.Content = titles[1].InnerText;
                                examinedDoc.Title.Add(tit);
                                isTitleSet = true;
                            }

                            // DocumentText
                            if (nodes != null) {
                                node = nodes[0].NextSibling;
                                txtAll = "";
                                while (node != null) {
                                    txtAll += node.InnerHtml;
                                    node = node.NextSibling;
                                }
                                if (txtAll != "") {
                                    examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                                    examinedDoc.DocumentText.Content = txtAll;
                                    isBodyContentSet = true;
                                }
                            }

                            rslt = isTitleSet && isBodyContentSet;
                            if (!rslt) {
                                examinedDoc.DocumentText.Name = examinedDoc.DocumentText.Content = "";
                                examinedDoc.Title.Clear();
                            }
                            break;
                    }
                    break;
            }

            if (rslt) {
                examinedDoc.TemplateType = templateTypeCode;
                examinedDoc.Status = MigrateStatusCode.OLD_DOC_STATUS_CODE_SUCCESS;
            }
            return rslt;
        }
コード例 #10
0
ファイル: ExtractHTML.cs プロジェクト: alpermazlum/try_git
        /// <summary>
        /// Tries to match a page made of a Title, a two-part body Content and a photo gallery
        /// with one footer (footer title plus footer text) in it.
        /// NOTE: This method is called only if 'examinedDoc' is empty.
        /// </summary>
        /// <remarks>
        /// The match succeeds only when the title, the gallery and the body text were all found.
        /// On success 'useTemplateTypeInstead' is set to TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_015;
        /// on failure everything collected so far is discarded through examinedDoc.Reset().
        /// </remarks>
        /// <returns>True if a match is found</returns>
        private bool GetTemplateTitleBodyContentPhotoFooterGalleryContent() {
            var isTitleSet = false;
            var isBodyContentSet = false;
            var contentTextFirst = "";
            var contentTextSecond = "";

            // The first 'TituloPrincipal' span is the document title, the second one (when present)
            // is the gallery footer title.
            // NOTE(review): SelectNodes is expected to yield null when nothing matches - confirm against
            // the HTML library in use. Counting here also covers an empty collection.
            var titles = page.DocumentNode.SelectNodes("//span[@class='TituloPrincipal']");
            var titleCount = titles == null ? 0 : titles.Count;

            // Title
            if (titleCount >= 1) {
                Title tit = new Title();
                tit.Name = MetadataName.OldDocTitleName;
                tit.Content = titles[0].InnerHtml;
                examinedDoc.Title.Add(tit);
                isTitleSet = true;
            }

            // Photo gallery: every <img> whose 'src' points into the image bank, in document order.
            var images = page.DocumentNode.SelectNodes("//img");
            if (images != null) {
                foreach (var img in images) {
                    var imgSrcRef = img.Attributes["src"];
                    if (imgSrcRef == null) {
                        continue;
                    }
                    var src = imgSrcRef.Value;
                    if (!src.Contains("BancoMedios/Imagenes")) {
                        continue;
                    }
                    PhotoOnly pho = new PhotoOnly();
                    pho.ImageName = MetadataName.OldDocImageName;
                    pho.ImageSrc = src;
                    examinedDoc.PhotoFooterGallery.PhotoList.Add(pho);
                }
            }

            // Gallery footer: title comes from the second span, text from whatever follows the
            // first <p> found among that span's following siblings.
            if (titleCount >= 2) {
                examinedDoc.PhotoFooterGallery.FooterTitleName = MetadataName.OldDocPhotoOnlyGalleryImageFooterTitleName;
                examinedDoc.PhotoFooterGallery.FoooterTitle = titles[1].InnerHtml.Trim();

                // Ordinal, case-insensitive tag comparison: tag names are identifiers, not linguistic text.
                var footerAnchor = titles[1];
                while (footerAnchor != null && !string.Equals(footerAnchor.Name, "P", System.StringComparison.OrdinalIgnoreCase)) {
                    footerAnchor = footerAnchor.NextSibling;
                }
                if (footerAnchor != null) {
                    var footerText = "";
                    // The <p> itself is skipped; only the nodes after it contribute to the footer.
                    for (var sibling = footerAnchor.NextSibling; sibling != null; sibling = sibling.NextSibling) {
                        footerText += sibling.InnerHtml.Replace("\r\n", "").Trim();
                    }
                    if (footerText != "") {
                        examinedDoc.PhotoFooterGallery.FooterName = MetadataName.OldDocPhotoOnlyGalleryImageFooterName;
                        examinedDoc.PhotoFooterGallery.Footer = footerText;
                    }
                }
            }
            var isPhotoFooterGallerySet = !examinedDoc.PhotoFooterGallery.IsEmpty();

            // Let's retrieve Body content text.
            // For this template it is laid out in two different parts.
            // First part: the first <p> found among the siblings following the title span
            // (only looked for when the page also has the second span).
            if (titleCount >= 2) {
                var bodyAnchor = titles[0];
                while (bodyAnchor != null && !string.Equals(bodyAnchor.Name, "P", System.StringComparison.OrdinalIgnoreCase)) {
                    bodyAnchor = bodyAnchor.NextSibling;
                }
                if (bodyAnchor != null) {
                    contentTextFirst = bodyAnchor.InnerHtml;
                }
            }

            // Second part: the first left-aligned paragraph of the page.
            var nodeText = page.DocumentNode.SelectSingleNode("//p[@align='left']");
            if (nodeText != null) {
                contentTextSecond = nodeText.InnerHtml;
            }

            // Both parts are required; a single part alone does not count as body content.
            if (contentTextFirst != "" && contentTextSecond != "") {
                examinedDoc.DocumentText.Name = MetadataName.OldDocTextName;
                examinedDoc.DocumentText.Content = (contentTextFirst + contentTextSecond).Trim();
                isBodyContentSet = true;
            }

            bool rslt = isTitleSet && isPhotoFooterGallerySet && isBodyContentSet;
            if (rslt) {
                useTemplateTypeInstead = TemplateTypeCode.OLD_DOC_TEMPLATE_TYPE_015;
            }
            else {
                // Clear any contents found so far as not all of the items were found.
                examinedDoc.Reset();
            }
            return rslt;
        }