예제 #1
0
            /// <summary>
            /// Copies the dictionary of a page node, replacing its Parent entry with
            /// <paramref name="parentPagesObject"/>, and writes the copy through <c>context</c>.
            /// </summary>
            /// <param name="pageNode">Node to copy; asserted (debug builds only) to be a page.</param>
            /// <param name="parentPagesObject">Reference stored under the Parent key of the copy.</param>
            /// <param name="tokenScanner">Forwarded to <c>CopyToken</c> for every copied entry.</param>
            /// <param name="referencesFromDocument">Forwarded to <c>CopyToken</c> for every copied entry.</param>
            /// <returns>The value returned by <c>context.WriteToken</c> for the new dictionary.</returns>
            private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner,
                                                        IDictionary<IndirectReferenceToken, IndirectReferenceToken> referencesFromDocument)
            {
                Debug.Assert(pageNode.IsPage);

                var copiedEntries = new Dictionary<NameToken, IToken>();
                copiedEntries.Add(NameToken.Parent, parentPagesObject);

                foreach (var entry in pageNode.NodeDictionary.Data)
                {
                    if (entry.Key == NameToken.Parent)
                    {
                        // The Parent entry was already set above to the new parent, so the source one is dropped.
                        continue;
                    }

                    var copiedValue = CopyToken(entry.Value, tokenScanner, referencesFromDocument);
                    copiedEntries.Add(NameToken.Create(entry.Key), copiedValue);
                }

                var copiedPage = new DictionaryToken(copiedEntries);

                return context.WriteToken(copiedPage);
            }
예제 #2
0
        /// <summary>
        /// Builds the XML element trees for a scraped site and hands each one to <c>Save</c>
        /// together with its target file name.
        /// </summary>
        /// <param name="siteScrape">Scrape result supplying the homepage node and the downloaded files.</param>
        public void WriteToXmlFiles(SiteScrape siteScrape)
        {
            var homepage = siteScrape.Homepage;

            // Page structure: one file for the whole tree, rooted at an empty parent id.
            var structureRoot = new XElement("PageStructureElementsElements", GetPageStructureElements(homepage, Guid.Empty, 0));
            Save(structureRoot, "Composite.Data.Types.IPageStructure.xml");

            // Per culture: the same element tree is saved under both the regular and the "Unpublished" file name.
            foreach (var culture in homepage.PagesLocalized.Keys)
            {
                var pagesRoot = new XElement("PageElementsElements", GetPageElements(homepage, culture));
                Save(pagesRoot, $"Composite.Data.Types.IPage_{culture}.xml");
                Save(pagesRoot, $"Composite.Data.Types.IPage_Unpublished_{culture}.xml");

                var placeholdersRoot = new XElement("PagePlaceholderContentElementsElements", GetPagePlaceholderContentElements(homepage, culture));
                Save(placeholdersRoot, $"Composite.Data.Types.IPagePlaceholderContent_{culture}.xml");
                Save(placeholdersRoot, $"Composite.Data.Types.IPagePlaceholderContent_Unpublished_{culture}.xml");
            }

            // Media files.
            var mediaFilesRoot = new XElement("MediaFileDataElementsElements", GetMediaFileDataElements(siteScrape.Files));
            Save(mediaFilesRoot, "Composite.Data.Types.IMediaFileData.xml");

            // Media folders: derived from the distinct FolderPath attributes of the media file elements.
            var folderPaths = mediaFilesRoot.Elements()
                                            .Attributes("FolderPath")
                                            .Select(f => f.Value)
                                            .Distinct();
            var mediaFoldersRoot = new XElement("MediaFolderDataElementsElements", GetMediaFolderDataElements(folderPaths));
            Save(mediaFoldersRoot, "Composite.Data.Types.IMediaFolderData.xml");
        }
예제 #3
0
        /// <summary>
        /// Lazily yields one "PagePlaceholderContentElements" element per placeholder of the given page
        /// for <paramref name="culture"/>, followed by the elements of all child nodes.
        /// </summary>
        /// <param name="pageTreeNode">Node whose placeholders (and descendants) are exported.</param>
        /// <param name="culture">Culture to export; a node without content for it yields nothing, children included.</param>
        /// <returns>The elements for this node followed by those of its subtree.</returns>
        private IEnumerable<XElement> GetPagePlaceholderContentElements(PageTreeNode pageTreeNode, CultureInfo culture)
        {
            if (pageTreeNode.PagesLocalized.ContainsKey(culture))
            {
                var content = pageTreeNode.PagesLocalized[culture];

                foreach (var placeholder in content.PlaceholderContent)
                {
                    var element = new XElement("PagePlaceholderContentElements",
                        new XAttribute("PublicationStatus", "published"),
                        new XAttribute("ChangeDate", DateTime.Now),
                        new XAttribute("CreationDate", DateTime.Now),
                        new XAttribute("ChangedBy", "import"),
                        new XAttribute("CreatedBy", "import"),
                        new XAttribute("PageId", pageTreeNode.Id),
                        new XAttribute("PlaceHolderId", placeholder.Key),
                        new XAttribute("Content", GetXhtmlDocument(placeholder.Value).ToString()),
                        new XAttribute("SourceCultureName", culture),
                        new XAttribute("VersionId", pageTreeNode.Id));

                    yield return element;
                }

                // Depth-first: each child's elements are emitted in child order after this node's own.
                var descendantElements = pageTreeNode.ChildNodes.SelectMany(child => GetPagePlaceholderContentElements(child, culture));
                foreach (var descendantElement in descendantElements)
                {
                    yield return descendantElement;
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Scrapes a site starting from one homepage address per culture.
        /// </summary>
        /// <param name="localizedHomepages">Homepage address for each culture to scrape.</param>
        /// <returns>A <c>SiteScrape</c> holding the top page node and the download cache.</returns>
        /// <exception cref="InvalidOperationException">
        /// The homepage addresses do not all resolve to exactly one page id.
        /// </exception>
        public SiteScrape Scrape(Dictionary<CultureInfo, Uri> localizedHomepages)
        {
            var homepageHosts = localizedHomepages.Values.Select(f => f.Host).Distinct();
            _validHosts.AddRange(homepageHosts);

            // Sanity check: every localized homepage must map to the same page id.
            int distinctPageIds = localizedHomepages.Select(f => GetPageIdFromUri(f.Value)).Distinct().Count();
            if (distinctPageIds != 1)
            {
                throw new InvalidOperationException("Getting different Page ID values from the analyzer, when feeding homepage URLs");
            }

            _topPageTreeNode = new PageTreeNode
            {
                Id = GetPageIdFromUri(localizedHomepages.First().Value),
                Depth = 0
            };

            foreach (var homepage in localizedHomepages)
            {
                ResolveNodeStructured(homepage.Value, homepage.Key, _topPageTreeNode, "homepage");
                ResolveNodeRest(homepage.Value, homepage.Key, _topPageTreeNode);
            }

            return new SiteScrape
            {
                Homepage = _topPageTreeNode,
                Files = new Files
                {
                    CachedFiles = _downloadCache
                }
            };
        }
예제 #5
0
        /// <summary>
        /// Chooses the page template id for a node from its depth, whether it has children,
        /// and whether its content for <paramref name="culture"/> has an "aside" placeholder.
        /// </summary>
        /// <param name="pageTreeNode">Node to pick a template for.</param>
        /// <param name="culture">Culture whose placeholder content is inspected.</param>
        /// <returns>The template id; a node without content for the culture is treated as having no aside.</returns>
        public Guid GetPageTemplateId(PageTreeNode pageTreeNode, CultureInfo culture)
        {
            if (pageTreeNode.Depth == 0)
            {
                return new Guid("a270f819-0b5c-4f7e-9194-4b554043e4ab"); // Venus: Front page
            }

            // One guarded lookup for both branches below, so a node that has no content for
            // this culture never hits the dictionary indexer (which would throw).
            bool hasAside = pageTreeNode.PagesLocalized.ContainsKey(culture)
                            && pageTreeNode.PagesLocalized[culture].PlaceholderContent.ContainsKey("aside");

            if (pageTreeNode.Depth == 1 && !pageTreeNode.ChildNodes.Any())
            {
                return hasAside
                    ? new Guid("9f096519-d21c-435e-b334-62224fde2ab3")  // Venus: Page with right aside (no navigation)
                    : new Guid("0526ad34-c540-418e-8c23-0eec2a8da2ce"); // Venus: Page (no aside or left navigation)
            }

            return hasAside
                ? new Guid("53851f7a-3f4b-4eda-9708-0743b6020e68")  // Venus: Page with navigation and right aside
                : new Guid("e3851f7a-3f4b-4eda-9708-07c3b6020e08"); // Venus: Page with navigation
        }
예제 #6
0
        /// <summary>
        /// Follows every <c>href</c> and <c>src</c> attribute in the document for <paramref name="uri"/>
        /// and recursively resolves HTML targets on valid hosts that have not been visited yet.
        /// </summary>
        /// <param name="uri">Address of the document to scan; ignored when its host is not in <c>_validHosts</c>.</param>
        /// <param name="culture">Culture passed on to the structured and recursive resolution calls.</param>
        /// <param name="pageTreeNode">Node that receives newly discovered pages as children.</param>
        private void ResolveNodeRest(Uri uri, CultureInfo culture, PageTreeNode pageTreeNode)
        {
            if (!_validHosts.Contains(uri.Host))
            {
                return;
            }

            var linkAttributes = DocCache(uri).Descendants().Attributes().Where(f => f.Name == "href" || f.Name == "src");

            foreach (var linkAttribute in linkAttributes)
            {
                Uri link = new Uri(uri, linkAttribute.Value);

                if (!_validHosts.Contains(link.Host) || _visited.Contains(link))
                {
                    continue;
                }

                // Marked as visited before recursing so that link cycles terminate.
                _visited.Add(link);

                if (!IsHtml(link))
                {
                    continue;
                }

                var referencedPageNode = GetNodeByUri(link);
                if (referencedPageNode == null)
                {
                    // Unknown page: attach it below the current node, one level deeper.
                    referencedPageNode = new PageTreeNode
                    {
                        Id = GetPageIdFromUri(link),
                        Depth = pageTreeNode.Depth + 1
                    };
                    pageTreeNode.ChildNodes.Add(referencedPageNode);

                    // The text of the element carrying the attribute is used as the menu title.
                    ResolveNodeStructured(link, culture, referencedPageNode, linkAttribute.Parent.Value);
                }

                ResolveNodeRest(link, culture, referencedPageNode);
            }
        }
예제 #7
0
        /// <summary>
        /// Looks up the node whose id matches the page id derived from <paramref name="uri"/>,
        /// searching from <c>_topPageTreeNode</c>.
        /// </summary>
        /// <param name="uri">Address converted to a page id via <c>GetPageIdFromUri</c>.</param>
        /// <returns>Whatever <c>GetNodeById</c> returns for that id.</returns>
        private PageTreeNode GetNodeByUri(Uri uri)
        {
            return GetNodeById(GetPageIdFromUri(uri), _topPageTreeNode);
        }
예제 #8
0
        /// <summary>
        /// Returns the page type id for a node: one id for depth 0, another for every other depth.
        /// </summary>
        /// <param name="pageTreeNode">Node whose <c>Depth</c> selects the page type.</param>
        /// <returns>The "Home" page type id at depth 0, otherwise the "Page" page type id.</returns>
        public Guid GetPageTypeId(PageTreeNode pageTreeNode)
        {
            string pageTypeId = pageTreeNode.Depth == 0
                ? "de22fed1-0729-4ad3-aa1c-6047e54bf429"  // "Home" page type
                : "f7869eb2-7369-4eb2-af47-e3be261e92c7"; // "Page" page type

            return new Guid(pageTypeId);
        }
예제 #9
0
            /// <summary>
            /// Recursively copies a non-page node of the pages tree together with all of its children.
            /// </summary>
            /// <param name="treeNode">Node to copy; asserted (debug builds only) not to be a page.</param>
            /// <param name="treeParentReference">Reference stored under the Parent key of the copied node.</param>
            /// <param name="tokenScanner">Forwarded to the copy helpers.</param>
            /// <returns>
            /// The object returned by <c>context.WriteObject</c> for the copied node, and the number of
            /// pages counted beneath it.
            /// </returns>
            private (ObjectToken, int) CopyPagesTree(PageTreeNode treeNode, IndirectReferenceToken treeParentReference, IPdfTokenScanner tokenScanner)
            {
                Debug.Assert(!treeNode.IsPage);

                // The object number is reserved up front so children can point back at this node
                // before the node itself is written.
                var reservedNumber = context.ReserveNumber();
                var selfReference = new IndirectReferenceToken(new IndirectReference(reservedNumber, 0));

                var kidReferences = new List<IndirectReferenceToken>();
                var pageTotal = 0;

                foreach (var child in treeNode.Children)
                {
                    ObjectToken written;
                    if (child.IsPage)
                    {
                        written = CopyPageNode(child, selfReference, tokenScanner);
                        pageTotal += 1;
                    }
                    else
                    {
                        int subtreePages;
                        (written, subtreePages) = CopyPagesTree(child, selfReference, tokenScanner);
                        pageTotal += subtreePages;
                    }

                    kidReferences.Add(new IndirectReferenceToken(written.Number));
                }

                var pagesEntries = new Dictionary<NameToken, IToken>();
                pagesEntries.Add(NameToken.Type, NameToken.Pages);
                pagesEntries.Add(NameToken.Kids, new ArrayToken(kidReferences));
                pagesEntries.Add(NameToken.Count, new NumericToken(pageTotal));
                pagesEntries.Add(NameToken.Parent, treeParentReference);

                // Remaining source entries are copied with indexer assignment, so a source key that
                // is not ignored replaces the value set above.
                foreach (var sourceEntry in treeNode.NodeDictionary.Data)
                {
                    if (!IgnoreKeyForPagesNode(sourceEntry))
                    {
                        pagesEntries[NameToken.Create(sourceEntry.Key)] = CopyToken(sourceEntry.Value, tokenScanner);
                    }
                }

                var writtenNode = context.WriteObject(memory, new DictionaryToken(pagesEntries), reservedNumber);

                return (writtenNode, pageTotal);
            }
예제 #10
0
        /// <summary>
        /// Stores the page content for <paramref name="uri"/> and <paramref name="culture"/> on the node,
        /// then resolves the HTML pages linked from the document's structured navigation elements.
        /// </summary>
        /// <param name="uri">Address of the page; ignored when its host is not in <c>_validHosts</c>.</param>
        /// <param name="culture">Culture being resolved; a node that already has content for it is skipped.</param>
        /// <param name="pageTreeNode">Node that receives the content and any newly discovered child pages.</param>
        /// <param name="menuTitle">Menu title passed to <c>GetPageContent</c>.</param>
        private void ResolveNodeStructured(Uri uri, CultureInfo culture, PageTreeNode pageTreeNode, string menuTitle)
        {
            if (!_validHosts.Contains(uri.Host))
            {
                return;
            }

            if (pageTreeNode.PagesLocalized.ContainsKey(culture))
            {
                return;
            }

            var pageContent = GetPageContent(uri, culture, menuTitle);

            pageTreeNode.PagesLocalized.Add(culture, pageContent);

            var linksSection = _contentParser.GetStructuredNavigationElements(uri, DocCache(uri)).Where(f => f != null);

            // Materialized once: both loops below walk the same links, and a deferred query
            // would repeat the descendant scan and filtering for each of them.
            var linkElements = linksSection.Descendants()
                                           .Where(e => e.Name == xhtmlNs + "a" && e.Attribute("href") != null)
                                           .ToList();

            // First pass: create every sibling node before recursing, so that all pages linked
            // from this navigation section are attached to this node.
            foreach (var aElement in linkElements)
            {
                Uri link = new Uri(uri, aElement.Attribute("href").Value);
                if (IsHtml(link))
                {
                    var referencedPageNode = GetNodeByUri(link);
                    if (referencedPageNode == null)
                    {
                        referencedPageNode = new PageTreeNode
                        {
                            Id = GetPageIdFromUri(link),
                            Depth = pageTreeNode.Depth + 1
                        };
                        pageTreeNode.ChildNodes.Add(referencedPageNode);
                    }
                }
            }

            // Second pass: resolve each linked page, using the link text as its menu title.
            foreach (var aElement in linkElements)
            {
                Uri link = new Uri(uri, aElement.Attribute("href").Value);
                if (IsHtml(link))
                {
                    string lineMenuTitle = aElement.Value.Trim();
                    var referencedPageNode = GetNodeByUri(link);
                    ResolveNodeStructured(link, culture, referencedPageNode, lineMenuTitle);
                }
            }
        }
예제 #11
0
        /// <summary>
        /// Searches the subtree rooted at <paramref name="pageTreeNode"/> depth-first for a page with a
        /// localized version whose <c>SourceUri</c> equals <paramref name="sourceUri"/>.
        /// </summary>
        /// <param name="sourceUri">Address to look for.</param>
        /// <param name="pageTreeNode">Root of the subtree to search.</param>
        /// <returns>A path of the form <c>~/page(id)</c> for the first match, or <c>null</c> when there is none.</returns>
        private string GetInternalPathByImportUri(Uri sourceUri, PageTreeNode pageTreeNode)
        {
            bool nodeMatches = pageTreeNode.PagesLocalized.Any(f => f.Value.SourceUri == sourceUri);
            if (nodeMatches)
            {
                return $"~/page({pageTreeNode.Id})";
            }

            // Lazily evaluated: the search stops at the first child subtree that produces a path.
            return pageTreeNode.ChildNodes
                               .Select(child => GetInternalPathByImportUri(sourceUri, child))
                               .FirstOrDefault(path => path != null);
        }
예제 #12
0
        /// <summary>
        /// Lazily yields a "PageStructureElements" element for the node, followed by the elements
        /// of its descendants in depth-first order.
        /// </summary>
        /// <param name="pageTreeNode">Node to describe.</param>
        /// <param name="parentId">Written to the element's ParentId attribute.</param>
        /// <param name="position">Written to the element's LocalOrdering attribute.</param>
        /// <returns>The node's element, then those of its subtree.</returns>
        private IEnumerable<XElement> GetPageStructureElements(PageTreeNode pageTreeNode, Guid parentId, int position)
        {
            var ownElement = new XElement("PageStructureElements",
                new XAttribute("Id", pageTreeNode.Id),
                new XAttribute("ParentId", parentId),
                new XAttribute("LocalOrdering", position));

            yield return ownElement;

            // Each child is numbered by its zero-based position among its siblings.
            var descendantElements = pageTreeNode.ChildNodes.SelectMany(
                (child, childIndex) => GetPageStructureElements(child, pageTreeNode.Id, childIndex));

            foreach (var descendantElement in descendantElements)
            {
                yield return descendantElement;
            }
        }
예제 #13
0
        /// <summary>
        /// Searches the subtree rooted at <paramref name="nodeToCheck"/> depth-first for a node with the given id.
        /// </summary>
        /// <param name="pageId">Id to look for.</param>
        /// <param name="nodeToCheck">Root of the subtree to search; may be <c>null</c>.</param>
        /// <returns>The first matching node, or <c>null</c> when the root is <c>null</c> or nothing matches.</returns>
        private PageTreeNode GetNodeById(Guid pageId, PageTreeNode nodeToCheck)
        {
            if (nodeToCheck == null)
            {
                return null;
            }

            if (nodeToCheck.Id == pageId)
            {
                return nodeToCheck;
            }

            // Lazily evaluated: the search stops at the first child subtree containing a match.
            return nodeToCheck.ChildNodes
                              .Select(child => GetNodeById(pageId, child))
                              .FirstOrDefault(found => found != null);
        }
예제 #14
0
        /// <summary>
        /// Rewrites <c>href</c> and <c>src</c> attributes in the placeholder content of the node and all of
        /// its descendants, replacing each value for which an internal path can be found.
        /// </summary>
        /// <param name="pageTreeNode">
        /// Node to process; it also becomes <c>_rootPageTreeNode</c> when that field is still <c>null</c>.
        /// </param>
        private void MakePathsInternal(PageTreeNode pageTreeNode)
        {
            _rootPageTreeNode = _rootPageTreeNode ?? pageTreeNode;

            foreach (var localizedPage in pageTreeNode.PagesLocalized)
            {
                // Relative references are resolved against the address the page was imported from.
                Uri baseUri = localizedPage.Value.SourceUri;

                foreach (var placeholderNodes in localizedPage.Value.PlaceholderContent.Values)
                {
                    foreach (XNode contentNode in placeholderNodes)
                    {
                        var contentElement = contentNode as XElement;
                        if (contentElement == null)
                        {
                            // Only elements can carry attributes; other node kinds are left alone.
                            continue;
                        }

                        var hrefAttributes = contentElement.DescendantsAndSelf().Attributes("href");
                        var srcAttributes = contentElement.DescendantsAndSelf().Attributes("src");

                        foreach (var referenceAttribute in hrefAttributes.Concat(srcAttributes))
                        {
                            var absoluteUri = new Uri(baseUri, referenceAttribute.Value);
                            var internalPath = GetInternalPathByImportUri(absoluteUri);

                            // References with no internal counterpart keep their original value.
                            if (internalPath != null)
                            {
                                referenceAttribute.Value = internalPath;
                            }
                        }
                    }
                }
            }

            foreach (var childNode in pageTreeNode.ChildNodes)
            {
                MakePathsInternal(childNode);
            }
        }
예제 #15
0
        /// <summary>
        /// Lazily yields a "PageElements" element for the node's content in <paramref name="culture"/>,
        /// followed by the elements of all child nodes.
        /// </summary>
        /// <param name="pageTreeNode">Node to export.</param>
        /// <param name="culture">Culture to export; a node without content for it yields nothing, children included.</param>
        /// <returns>The node's element, then those of its subtree.</returns>
        private IEnumerable<XElement> GetPageElements(PageTreeNode pageTreeNode, CultureInfo culture)
        {
            if (pageTreeNode.PagesLocalized.ContainsKey(culture))
            {
                var content = pageTreeNode.PagesLocalized[culture];

                var pageElement = new XElement("PageElements",
                    new XAttribute("PublicationStatus", "published"),
                    new XAttribute("ChangeDate", DateTime.Now),
                    new XAttribute("CreationDate", DateTime.Now),
                    new XAttribute("ChangedBy", "import"),
                    new XAttribute("CreatedBy", "import"),
                    new XAttribute("Id", pageTreeNode.Id),
                    new XAttribute("TemplateId", _templateChooser.GetPageTemplateId(pageTreeNode, culture)),
                    new XAttribute("PageTypeId", _templateChooser.GetPageTypeId(pageTreeNode)),
                    new XAttribute("Title", content.Title),
                    new XAttribute("MenuTitle", content.MenuTitle),
                    new XAttribute("UrlTitle", content.UrlTitle),
                    new XAttribute("FriendlyUrl", ""),
                    new XAttribute("Description", content.Description),
                    new XAttribute("SourceCultureName", culture),
                    new XAttribute("VersionId", pageTreeNode.Id));

                yield return pageElement;

                // Depth-first: each child's elements follow in child order.
                var descendantElements = pageTreeNode.ChildNodes.SelectMany(child => GetPageElements(child, culture));
                foreach (var descendantElement in descendantElements)
                {
                    yield return descendantElement;
                }
            }
        }
예제 #16
0
        /// <summary>
        /// Builds the <see cref="PageTreeNode"/> tree for a node of the document pages tree, walking the
        /// Kids arrays with a queue rather than recursion.
        /// </summary>
        /// <param name="referenceInput">Indirect reference of the starting node.</param>
        /// <param name="nodeDictionaryInput">Dictionary of the starting node.</param>
        /// <param name="parentReferenceInput">Reference of the starting node's parent, passed to <c>CheckIfIsPage</c>.</param>
        /// <param name="isRoot">Passed to <c>CheckIfIsPage</c> for the starting node; kids are always checked with <c>false</c>.</param>
        /// <param name="pdfTokenScanner">Scanner used to look up dictionary entries and kid objects.</param>
        /// <param name="isLenientParsing">When true, a pages node without a Kids array is treated as having no kids instead of throwing.</param>
        /// <param name="pageNumber">Counter incremented once per page found; its <c>PageCount</c> is passed to each page node.</param>
        /// <returns>The node created for the starting dictionary.</returns>
        /// <exception cref="PdfDocumentFormatException">
        /// A pages node has no Kids array (non-lenient parsing only), a Kids entry is not an indirect
        /// reference, or the dictionary for a kid reference cannot be found.
        /// </exception>
        private static PageTreeNode ProcessPagesNode(IndirectReference referenceInput,
                                                     DictionaryToken nodeDictionaryInput,
                                                     IndirectReference parentReferenceInput,
                                                     bool isRoot,
                                                     IPdfTokenScanner pdfTokenScanner,
                                                     bool isLenientParsing,
                                                     PageCounter pageNumber)
        {
            bool isPage = CheckIfIsPage(nodeDictionaryInput, parentReferenceInput, isRoot, pdfTokenScanner, isLenientParsing);

            if (isPage)
            {
                // The starting node is itself a page: count it and return it with no children.
                pageNumber.Increment();

                return(new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray <PageTreeNode> .Instance));
            }

            // The starting node is not a page: process it and every non-page descendant through a
            // FIFO queue until the queue is empty.

            // Each queue entry carries a non-page node, its reference and dictionary, and the list its
            // children are collected into. The thisPage and parentReference fields are not read in this method.
            var toProcess =
                new Queue <(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference,
                            List <PageTreeNode> nodeChildren)>();
            var firstPage         = new PageTreeNode(nodeDictionaryInput, referenceInput, false, null);
            // Deferred WithChildren calls, run only after the whole tree has been walked.
            var setChildren       = new List <Action>();
            var firstPageChildren = new List <PageTreeNode>();

            setChildren.Add(() => firstPage.WithChildren(firstPageChildren));

            toProcess.Enqueue(
                (thisPage: firstPage, reference: referenceInput, nodeDictionary: nodeDictionaryInput, parentReference: parentReferenceInput,
                 nodeChildren: firstPageChildren));

            do
            {
                var current = toProcess.Dequeue();

                if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
                {
                    if (!isLenientParsing)
                    {
                        throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {current.nodeDictionary}.");
                    }

                    // Lenient parsing: a missing Kids array is treated as an empty one.
                    kids = new ArrayToken(EmptyArray <IToken> .Instance);
                }

                foreach (var kid in kids.Data)
                {
                    // These two checks throw regardless of isLenientParsing.
                    if (!(kid is IndirectReferenceToken kidRef))
                    {
                        throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}.");
                    }

                    if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken))
                    {
                        throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
                    }

                    bool isChildPage = CheckIfIsPage(kidDictionaryToken, current.reference, false, pdfTokenScanner, isLenientParsing);

                    if (isChildPage)
                    {
                        // Pages are counted in the order nodes are dequeued, i.e. level by level.
                        pageNumber.Increment();

                        var kidPageNode =
                            new PageTreeNode(kidDictionaryToken, kidRef.Data, true, pageNumber.PageCount).WithChildren(EmptyArray <PageTreeNode> .Instance);
                        current.nodeChildren.Add(kidPageNode);
                    }
                    else
                    {
                        // Non-page kid: queue it so its own Kids array is processed later, and
                        // register the deferred call that hands it its (still empty) child list.
                        var kidChildNode = new PageTreeNode(kidDictionaryToken, kidRef.Data, false, null);
                        var kidChildren  = new List <PageTreeNode>();
                        toProcess.Enqueue(
                            (thisPage: kidChildNode, reference: kidRef.Data, nodeDictionary: kidDictionaryToken, parentReference: current.reference,
                             nodeChildren: kidChildren));

                        setChildren.Add(() => kidChildNode.WithChildren(kidChildren));

                        current.nodeChildren.Add(kidChildNode);
                    }
                }
            } while (toProcess.Count > 0);

            // All child lists are fully populated now. The return values of WithChildren are discarded
            // here — presumably it stores the list on the node it is called on; confirm against PageTreeNode.
            foreach (var action in setChildren)
            {
                action();
            }

            return(firstPage);
        }
예제 #17
0
 /// <summary>
 /// Creates a rewriter over the homepage node and files of a scrape result.
 /// </summary>
 /// <param name="scrape">Scrape result whose <c>Homepage</c> and <c>Files</c> are stored on the instance.</param>
 public UriRewriter(SiteScrape scrape)
 {
     _files = scrape.Files;
     _rootPageTreeNode = scrape.Homepage;
 }
예제 #18
0
        /// <summary>
        /// Recursively builds the <see cref="PageTreeNode"/> tree for a node of the document pages tree.
        /// </summary>
        /// <param name="reference">Indirect reference of the node being processed.</param>
        /// <param name="nodeDictionary">Dictionary of the node being processed.</param>
        /// <param name="parentReference">Reference the node's Parent entry must match (non-lenient, non-root only).</param>
        /// <param name="isRoot">True for the root node, which skips the Parent check.</param>
        /// <param name="pdfTokenScanner">Scanner used to look up dictionary entries and kid objects.</param>
        /// <param name="isLenientParsing">When true, missing or invalid Type, Parent and Kids entries are tolerated.</param>
        /// <param name="pageNumber">Running page count; incremented for every page encountered.</param>
        /// <returns>A page node, or a non-page node holding its processed children.</returns>
        /// <exception cref="PdfDocumentFormatException">
        /// The node is malformed: under non-lenient parsing a missing or invalid Type, a missing or
        /// mismatched Parent, or a missing Kids array; under any parsing mode a Kids entry that is not
        /// an indirect reference or whose dictionary cannot be found.
        /// </exception>
        private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary,
                                                     IndirectReference parentReference,
                                                     bool isRoot,
                                                     IPdfTokenScanner pdfTokenScanner,
                                                     bool isLenientParsing,
                                                     ref int pageNumber)
        {
            bool isPage;

            if (nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type))
            {
                isPage = type.Equals(NameToken.Page);

                if (!isLenientParsing && !isPage && !type.Equals(NameToken.Pages))
                {
                    throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}.");
                }
            }
            else
            {
                if (!isLenientParsing)
                {
                    throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}.");
                }

                // Lenient parsing with no Type: a node without a Kids array is treated as a page.
                isPage = !nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _);
            }

            bool mustValidateParent = !isLenientParsing && !isRoot;
            if (mustValidateParent)
            {
                if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken))
                {
                    throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}.");
                }

                if (!parentReferenceToken.Data.Equals(parentReference))
                {
                    throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}.");
                }
            }

            if (isPage)
            {
                // Pages are numbered in the order the recursion reaches them.
                pageNumber += 1;

                return new PageTreeNode(nodeDictionary, reference, true,
                                        pageNumber,
                                        EmptyArray<PageTreeNode>.Instance);
            }

            if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
            {
                if (!isLenientParsing)
                {
                    throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}.");
                }

                // Lenient parsing: a missing Kids array is treated as an empty one.
                kids = new ArrayToken(EmptyArray<IToken>.Instance);
            }

            var children = new List<PageTreeNode>();

            foreach (var kid in kids.Data)
            {
                // These two checks throw regardless of isLenientParsing.
                if (!(kid is IndirectReferenceToken kidRef))
                {
                    throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}.");
                }

                if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken))
                {
                    throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
                }

                children.Add(ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, ref pageNumber));
            }

            return new PageTreeNode(nodeDictionary, reference, false, null, children);
        }
예제 #19
0
        /// <summary>
        /// Lazily walks the tree below <paramref name="node"/> depth-first and yields, for every page,
        /// its dictionary together with the dictionaries of its non-page ancestors.
        /// </summary>
        /// <param name="node">Node to start from.</param>
        /// <param name="parents">Ancestor dictionaries collected so far; <c>null</c> means none.</param>
        /// <returns>One tuple per page: the page dictionary and its ancestor dictionaries, outermost first.</returns>
        internal static IEnumerable<(DictionaryToken, IReadOnlyList<DictionaryToken>)> WalkTree(PageTreeNode node, List<DictionaryToken> parents = null)
        {
            var ancestors = parents ?? new List<DictionaryToken>();

            if (!node.IsPage)
            {
                // Copy before appending so the list handed to other branches is never mutated.
                var ancestorsWithSelf = new List<DictionaryToken>(ancestors) { node.NodeDictionary };

                foreach (var child in node.Children)
                {
                    foreach (var pageEntry in WalkTree(child, ancestorsWithSelf))
                    {
                        yield return pageEntry;
                    }
                }
            }
            else
            {
                yield return (node.NodeDictionary, ancestors);
            }
        }