Exemplo n.º 1
0
        /// <summary>
        /// Builds a new HTML document for a wiki page: a minimal html/head/body skeleton whose title is the
        /// record's Id (underscores replaced by spaces), with the kept top-level nodes of the source
        /// document's body appended to the new body.
        /// </summary>
        /// <remarks>
        /// A top-level node is skipped when it is a <c>style</c> element, or when any of its descendants
        /// carries an attribute whose value is exactly "navigation" or "vertical-navbox nowraplinks hlist".
        /// Every kept node is passed to ChangeHyperLinks and ChangeDownloadLinks before being appended.
        /// </remarks>
        /// <param name="wikiPageRecord">Record supplying the page Id, the source document (htmlDoc) and the SrcMap.</param>
        /// <returns>
        /// The newly built document. When the source document has no body element, the skeleton is
        /// returned with an empty body.
        /// </returns>
        public HtmlDocument CreateHtml(WikiPageRecord wikiPageRecord)
        {
            HtmlDocument newDocument = new HtmlDocument();
            var          initNode    =
                HtmlNode.CreateNode($"<html><head><meta charset=\"utf-8\"><title>{wikiPageRecord.Id.Replace('_', ' ')}</title></head><body></body></html>");

            newDocument.DocumentNode.AppendChild(initNode);
            var bodyNode = newDocument.DocumentNode.SelectSingleNode("/html/body");

            // Keep everything except style elements and nodes that contain navigation boxes.
            bool ShouldKeep(HtmlNode node) =>
                node.Name != "style" &&
                !node.Descendants().Any(d => d.Attributes.Any(a => a.Value == "navigation" ||
                                                                   a.Value == "vertical-navbox nowraplinks hlist"));

            var sourceBody = wikiPageRecord.htmlDoc
                             .DocumentNode
                             .SelectSingleNode("//html/body");

            // SelectSingleNode yields null when nothing matches; there is then nothing to copy.
            if (sourceBody is null)
            {
                return newDocument;
            }

            // Iterate over a snapshot so that re-parenting nodes cannot disturb the enumeration.
            // The work is sequential on purpose: the nodes are appended to a single shared body node.
            foreach (var node in sourceBody.ChildNodes.ToList())
            {
                if (!ShouldKeep(node))
                {
                    continue;
                }

                ChangeHyperLinks(node);
                ChangeDownloadLinks(node, wikiPageRecord.SrcMap);
                bodyNode.AppendChild(node);
            }

            return newDocument;
        }
Exemplo n.º 2
0
 /// <summary>
 /// Builds the HTML for the given record on a thread-pool task and wraps it in an <c>HtmlDoc</c>.
 /// </summary>
 /// <param name="wikiPageRecord">Record passed to CreateHtml and to the resulting HtmlDoc.</param>
 /// <param name="filePath">Path handed to the HtmlDoc constructor.</param>
 /// <returns>A task whose result is the constructed document.</returns>
 public async Task <IDocument> CreateHtmlDoc(WikiPageRecord wikiPageRecord, string filePath)
 {
     // The HTML construction and the HtmlDoc constructor both run inside Task.Run.
     var builtDocument = await Task.Run(
         () => new HtmlDoc(CreateHtml(wikiPageRecord), filePath, wikiPageRecord));

     return builtDocument;
 }
Exemplo n.º 3
0
        /// <summary>
        /// Test set-up: loads five Wikipedia pages through <c>webGet</c>, converts each one into a page
        /// record with <c>GetWikiPageRecords.From</c> (using <c>imageDir</c>), and stores the records both
        /// in their individual fields and in <c>wikiPages</c>.
        /// </summary>
        public void InitialiseTest()
        {
            // All pages are fetched first; the records are built only afterwards.
            var connerySource      = webGet.Load("https://en.wikipedia.org/wiki/Sean_Connery");
            var physicalTherapySrc = webGet.Load("https://en.wikipedia.org/wiki/Physical_therapy");
            var physiologySource   = webGet.Load("https://en.wikipedia.org/wiki/Physiology");
            var paperPageSource    = webGet.Load("https://en.wikipedia.org/wiki/Page_(paper)");
            var lawrenceSource     = webGet.Load("https://en.wikipedia.org/wiki/Mark_Lawrence_(cricketer)");

            seanRecord         = GetWikiPageRecords.From(connerySource, imageDir);
            physioRecord       = GetWikiPageRecords.From(physicalTherapySrc, imageDir);
            physiologyRecord   = GetWikiPageRecords.From(physiologySource, imageDir);
            paperRecord        = GetWikiPageRecords.From(paperPageSource, imageDir);
            markLawrenceRecord = GetWikiPageRecords.From(lawrenceSource, imageDir);

            // Collection order matches the order in which the pages were loaded.
            wikiPages = new()
            {
                seanRecord,
                physioRecord,
                physiologyRecord,
                paperRecord,
                markLawrenceRecord,
            };
        }
Exemplo n.º 4
0
 public async Task <(HtmlDocument doc, WikiPageRecord record)> ParseAsync(HtmlDocument htmlDocument, WikiPageRecord wikiPageRecord) =>
Exemplo n.º 5
0
 /// <summary>
 /// Creates an <c>HtmlDoc</c>: forwards the document and directory to the base constructor and keeps
 /// the wiki record in <c>_wikiRecord</c>.
 /// </summary>
 /// <param name="document">HTML document passed to the base constructor.</param>
 /// <param name="directory">Directory passed to the base constructor.</param>
 /// <param name="wikiRecord">Record stored on this instance.</param>
 public HtmlDoc(HtmlDocument document, string directory, WikiPageRecord wikiRecord)
     : base(document, directory) => _wikiRecord = wikiRecord;
Exemplo n.º 6
0
 /// <summary>
 /// Downloads each image from the html file, and saves it to the source specified in the records src mapping.
 /// </summary>
 /// <remarks>
 /// Src mapping from record includes: old src (download url) -> new src (local file directory).
 /// Switch statement handles the various image sources and returns an appropriate url.
 /// If an unknown src is encountered, no image is downloaded and it is written out to the console.
 /// </remarks>
 /// <returns>Task which represents a completed download for each image in a record</returns>
 public IEnumerable <Task> DownloadImages(WikiPageRecord pageRecord, Dictionary <Directories, string> directories) =>
 pageRecord
 .SrcMap
 .AsParallel()
 .WithDegreeOfParallelism(10)
 .Select(async imgSrc =>