/// <summary>
/// Builds a new HTML document for a wiki page: a fresh html/head/body skeleton whose title is the
/// record's Id (underscores shown as spaces), with the filtered top-level body nodes of the
/// record's source document appended to the new body.
/// </summary>
/// <remarks>
/// A top-level node is dropped when it is a <c>style</c> element, or when any of its descendants
/// carries an attribute whose value is exactly <c>navigation</c> or
/// <c>vertical-navbox nowraplinks hlist</c>. Every kept node is passed through
/// <c>ChangeHyperLinks</c> and <c>ChangeDownloadLinks</c> (with the record's SrcMap) before it is
/// appended, in source order.
/// </remarks>
/// <param name="wikiPageRecord">Record supplying the page Id, the source document and the SrcMap.</param>
/// <returns>The newly created document.</returns>
public HtmlDocument CreateHtml(WikiPageRecord wikiPageRecord)
{
    // The title is spliced into markup, so it must be HTML-encoded; otherwise an Id containing
    // '&' or '<' would yield a malformed (or injected) <title> element.
    string pageTitle = System.Net.WebUtility.HtmlEncode(wikiPageRecord.Id.Replace('_', ' '));

    HtmlDocument newDocument = new HtmlDocument();
    var initNode = HtmlNode.CreateNode(
        $"<html><head><meta charset=\"utf-8\"><title>{pageTitle}</title></head><body></body></html>");
    newDocument.DocumentNode.AppendChild(initNode);
    var bodyNode = newDocument.DocumentNode.SelectSingleNode("/html/body");

    // Only the node's descendants are inspected for the navigation markers, not the node itself.
    bool ShouldKeep(HtmlNode node) =>
        node.Name != "style"
        && !node.Descendants().Any(d => d.Attributes.Any(a =>
            a.Value == "navigation" || a.Value == "vertical-navbox nowraplinks hlist"));

    // Snapshot the child list before iterating so that appending nodes to the new body
    // cannot disturb the enumeration. The work is sequential and order-preserving.
    var sourceNodes = wikiPageRecord.htmlDoc
        .DocumentNode
        .SelectSingleNode("//html/body")
        .ChildNodes
        .ToList();

    foreach (var node in sourceNodes)
    {
        if (!ShouldKeep(node))
        {
            continue;
        }

        ChangeHyperLinks(node);
        ChangeDownloadLinks(node, wikiPageRecord.SrcMap);
        bodyNode.AppendChild(node);
    }

    return newDocument;
}
/// <summary>
/// Asynchronously creates the document wrapper for a wiki page by running
/// <c>CreateHtml</c> and the <c>HtmlDoc</c> construction inside <c>Task.Run</c>.
/// </summary>
/// <param name="wikiPageRecord">Record the HTML is generated from; also handed to the new <c>HtmlDoc</c>.</param>
/// <param name="filePath">Path value forwarded to the <c>HtmlDoc</c> constructor.</param>
/// <returns>A task whose result is the created <c>HtmlDoc</c>, exposed as <c>IDocument</c>.</returns>
public async Task<IDocument> CreateHtmlDoc(WikiPageRecord wikiPageRecord, string filePath)
{
    HtmlDoc builtDocument = await Task.Run(
        () => new HtmlDoc(CreateHtml(wikiPageRecord), filePath, wikiPageRecord));

    return builtDocument;
}
/// <summary>
/// Prepares the test fixtures: loads five Wikipedia articles through <c>webGet</c>, converts each
/// into a page record via <c>GetWikiPageRecords.From</c> using <c>imageDir</c>, and collects the
/// records in <c>wikiPages</c>.
/// </summary>
public void InitialiseTest()
{
    // Step 1: fetch every source page before any record is built.
    var connerySource = webGet.Load("https://en.wikipedia.org/wiki/Sean_Connery");
    var physicalTherapySource = webGet.Load("https://en.wikipedia.org/wiki/Physical_therapy");
    var physiologySource = webGet.Load("https://en.wikipedia.org/wiki/Physiology");
    var pagePaperSource = webGet.Load("https://en.wikipedia.org/wiki/Page_(paper)");
    var lawrenceSource = webGet.Load("https://en.wikipedia.org/wiki/Mark_Lawrence_(cricketer)");

    // Step 2: turn each loaded page into its record, in the same order as the loads.
    seanRecord = GetWikiPageRecords.From(connerySource, imageDir);
    physioRecord = GetWikiPageRecords.From(physicalTherapySource, imageDir);
    physiologyRecord = GetWikiPageRecords.From(physiologySource, imageDir);
    paperRecord = GetWikiPageRecords.From(pagePaperSource, imageDir);
    markLawrenceRecord = GetWikiPageRecords.From(lawrenceSource, imageDir);

    // Step 3: expose all records as one list.
    wikiPages = new()
    {
        seanRecord,
        physioRecord,
        physiologyRecord,
        paperRecord,
        markLawrenceRecord
    };

    // Debug aid (disabled): dumps every src mapping of every page to the console.
    //wikiPages.ForEach(page => page.SrcMap.ToList().ForEach(item => Console.WriteLine($"{item.Key} -- {item.Value}")));
}
public async Task <(HtmlDocument doc, WikiPageRecord record)> ParseAsync(HtmlDocument htmlDocument, WikiPageRecord wikiPageRecord) =>
/// <summary>
/// Creates an <c>HtmlDoc</c>, forwarding the document and directory to the base constructor
/// and keeping the wiki record in <c>_wikiRecord</c>.
/// </summary>
/// <param name="document">HTML document passed to the base constructor.</param>
/// <param name="directory">Directory value passed to the base constructor.</param>
/// <param name="wikiRecord">Wiki page record stored on this instance.</param>
public HtmlDoc(HtmlDocument document, string directory, WikiPageRecord wikiRecord)
    : base(document, directory)
    => _wikiRecord = wikiRecord;
/// <summary> /// Downloads each image from the html file, and saves it to the source specified in the records src mapping. /// </summary> /// <remarks> /// Src mapping from record includes: old src (download url) -> new src (local file directory). /// Switch statement handles the various image sources and returns an appropriate url. /// If an unknown src is encountered, no image is downloaded and it is written out to the console. /// </remarks> /// <returns>Task which represents a completed download for each image in a record</returns> public IEnumerable <Task> DownloadImages(WikiPageRecord pageRecord, Dictionary <Directories, string> directories) => pageRecord .SrcMap .AsParallel() .WithDegreeOfParallelism(10) .Select(async imgSrc =>