// Read one HTML page and return a slide show public SlideShow ReadSlideShow(string aSlideFile, out string aDiagnostic) { aDiagnostic = null; // Determine the name of the future XML slide show file string xmlFilePath = aSlideFile.Replace(".htm", ".xml"); SlideShow slideShow = new SlideShow(xmlFilePath); //Console.WriteLine(" HtmlReader ReadSlideShow: parsing " + aSlideFile); string html = ReadFile(aSlideFile); if (html == null) { aDiagnostic = "ReadSlideShow: bad HTML slideshow file " + aSlideFile; return(null); } else { ParseHTML parse = new ParseHTML(); parse.Source = html; HtmlPreprocess htmlPreprocess = new HtmlPreprocess(); // Default overall title for the slide show, hopefully replaced with something better string title = "A most peculiar day"; bool collectingTitle = false; bool collectingCaption = false; Caption caption = new Caption(); string link = ""; while (!parse.Eof()) { char ch = parse.Parse(); if (ch == 0) { AttributeList tag = parse.GetTag(); if (tag.Name.Equals("title", StringComparison.CurrentCultureIgnoreCase)) { collectingTitle = true; // Start collecting title title = string.Empty; } else if (tag.Name.Equals("/title", StringComparison.CurrentCultureIgnoreCase)) { collectingTitle = false; // Title now complete slideShow.Title = title.Trim(); } if (tag.Name.Equals("td", StringComparison.CurrentCultureIgnoreCase)) { collectingCaption = true; // Start collecting new caption caption = new Caption(); } else if (tag.Name.Equals("/td", StringComparison.CurrentCultureIgnoreCase)) { collectingCaption = false; // Any caption is now complete if (!link.Equals("")) { // Got a link to go with the caption slideShow.Add(link, caption); link = ""; } } else if (collectingCaption && tag.Name.Equals("p", StringComparison.CurrentCultureIgnoreCase)) { // HTML paragraph tag within caption caption.NewLine(); } else if (tag["href"] != null) { string href = tag["href"].Value.Replace('/', '\\'); if (IsPhoto(href)) { //Console.WriteLine(" + HtmlReader ReadSlideShow: add " + href + // " from tag " + tag.Name); link = href; } } // Preprocessing of regular character stream starts with clean sheet after tag htmlPreprocess.Reset(); } else { // Got a character ch = htmlPreprocess.Add(ch); if (ch != HtmlPreprocess.NullChar) { if (collectingTitle) { title += ch; } else if (collectingCaption) { caption.AddChar(ch); } } } } return(slideShow); } }
// Read one HTML page and return an Event list public EventList ReadEvents(string aEventsFile, out string aDiagnostic) { aDiagnostic = null; string xmlFilePath = aEventsFile.Replace(".htm", ".xml"); EventList events = new EventList(xmlFilePath); string html = ReadFile(aEventsFile); if (html == null) { aDiagnostic = "ReadEvents: bad HTML Events file " + aEventsFile; return(null); } else { // Determine the events directory string eventsDirectory = GetDirectory(aEventsFile); //Console.WriteLine(" HtmlReader ReadEvents: parsing " + aEventsFile); ParseHTML parse = new ParseHTML(); parse.Source = html; HtmlPreprocess htmlPreprocess = new HtmlPreprocess(); string name = ""; // Collect stream of characters in HTML source int indent = 0; SlideShow slideShow = null; // Collect slide show from href while (!parse.Eof()) { char ch = parse.Parse(); if (ch == 0) { AttributeList tag = parse.GetTag(); if (tag.Name.Equals("h2", StringComparison.CurrentCultureIgnoreCase)) { // Start collecting title name = ""; indent = 0; } else if (tag.Name.Equals("/h2", StringComparison.CurrentCultureIgnoreCase)) { // Title is now complete events.Title = name.Trim(); name = ""; indent = 0; } else if ((tag.Name.Equals("p", StringComparison.CurrentCultureIgnoreCase)) || (tag.Name.Equals("br", StringComparison.CurrentCultureIgnoreCase)) || (tag.Name.Equals("hr", StringComparison.CurrentCultureIgnoreCase))) { // End of line, check whether we have an event if (name.Length > 0) { // Use indent as the level for now events.Add(indent, name.Trim(), slideShow); } // Reset for next event name = ""; indent = 0; slideShow = null; } else if (tag["href"] != null) { string href = tag["href"].Value.Replace('/', '\\'); //Console.WriteLine(" + HtmlReader ReadEvents: add event " + href + " to events XML file"); // Strip any anchor: we cannot handle it if (href.Contains('#')) { href = href.Remove(href.IndexOf('#')); } // Process child events file slideShow = ReadSlideShow(eventsDirectory + href, out aDiagnostic); if (slideShow == null) { return(null); } } // Preprocessing of regular character stream starts with clean sheet after tag htmlPreprocess.Reset(); } else { // Preprocess ch = htmlPreprocess.Add(ch); if (ch == HtmlPreprocess.NullChar) { // Nothing to do continue; } else if (ch.Equals(' ')) { if (name.Length == 0) { // Leading space: count the indent indent++; } else { // Count all non-leading spaces returned by the preprocessor name += ch; } } else if (ch.Equals('+')) { if (name.Length == 0) { // Initial plus marks a subevent - count it in the indent indent++; } else { // Transcribe other plus symbols into the event name name += ch; } } else { // Add regular character name += ch; } } } // End of event list, check for any outstanding event if (name.Length > 0) { events.Add(indent, name.Trim(), slideShow); } // The event levels were arbitrarily set as a measure of the indentation // of each event name in the HTML. Reassign sequential levels. events.Relevel(); return(events); } }