/// <summary>
/// Derive the structure of a Wikimedia page by its sections, subsections, and down the hierarchy.
/// </summary>
/// <remarks>
/// Headings at <paramref name="level"/> split <paramref name="markup"/> into subsections, and each
/// subsection body is parsed recursively at <c>level + 1</c>. Text that precedes the first heading
/// becomes this section's own content; when the markup starts directly with a heading, the content
/// is left unassigned. When no heading of the requested level exists, the whole markup becomes the
/// content and the subsection list is empty. Heading names are returned exactly as captured
/// (they are not trimmed).
/// </remarks>
/// <param name="title">The title of the section being parsed</param>
/// <param name="markup">The Wikimedia markup to be parsed</param>
/// <param name="level">The numeric level of the hierarchy (eg, "==History==" is at the second level)</param>
/// <returns>A tree of WikiSections</returns>
internal WikiSection ParseSection(string title, string markup, int level = 2 /* language starts at two equals signs (eg, '==English==') */)
{
    // After string.Format, "{{{0}}}" collapses to the regex quantifier "{<level>}", so the pattern
    // demands exactly <level> '=' characters on each side of the heading name:
    //   - the name may not contain '=' (a deeper heading such as "===X===" never matches a
    //     shallower level) nor a line break (a heading always sits on a single line);
    //   - trailing spaces/tabs after the closing '=' run are tolerated;
    //   - the heading ends at a run of line breaks or at the very end of the input, so a heading
    //     on the last line without a trailing newline is still recognised.
    const string RawPattern = "^={{{0}}}([^=\r\n]+?)={{{0}}}[ \t]*(?:[\r\n]+|\\z)";
    var pattern = string.Format(RawPattern, level);
    var matches = Regex.Matches(markup, pattern, RegexOptions.Multiline);

    if (matches.Count == 0)
    {
        // Leaf section: no headings at this level, everything is content.
        return new WikiSection
        {
            SectionName = title,
            Content = markup,
            SubSections = new List<WikiSection>()
        };
    }

    var section = new WikiSection
    {
        SectionName = title,
        SubSections = new List<WikiSection>()
    };

    // Anything before the first heading belongs to this section itself.
    if (matches[0].Index > 0)
    {
        section.Content = markup.Substring(0, matches[0].Index);
    }

    for (var i = 0; i < matches.Count; i++)
    {
        var heading = matches[i];
        var subsectionName = heading.Groups[1].Value;

        // The subsection body runs from just after this heading (including its line breaks)
        // up to the start of the next heading, or to the end of the markup for the last one.
        var contentStartIndex = heading.Index + heading.Length;
        var contentEndIndex = i + 1 < matches.Count ? matches[i + 1].Index : markup.Length;
        var subsectionContent = markup.Substring(contentStartIndex, contentEndIndex - contentStartIndex);

        section.SubSections.Add(ParseSection(subsectionName, subsectionContent, level + 1));
    }

    return section;
}
/// <summary>
/// Derive the structure of a Wikimedia page by its sections, subsections, and down the hierarchy.
/// </summary>
/// <remarks>
/// Headings at <paramref name="level"/> split <paramref name="markup"/> into subsections, and each
/// subsection body is parsed recursively at <c>level + 1</c>. Text that precedes the first heading
/// becomes this section's own content; when the markup starts directly with a heading, the content
/// is left unassigned. When no heading of the requested level exists, the whole markup becomes the
/// content and the subsection list is empty. Heading names are returned exactly as captured
/// (they are not trimmed).
/// </remarks>
/// <param name="title">The title of the section being parsed</param>
/// <param name="markup">The Wikimedia markup to be parsed</param>
/// <param name="level">The numeric level of the hierarchy (eg, "==History==" is at the second level)</param>
/// <returns>A tree of WikiSections</returns>
internal WikiSection ParseSection(string title, string markup, int level = 2 /* language starts at two equals signs (eg, '==English==') */)
{
    // After string.Format, "{{{0}}}" collapses to the regex quantifier "{<level>}", so the pattern
    // demands exactly <level> '=' characters on each side of the heading name:
    //   - the name may not contain '=' (a deeper heading such as "===X===" never matches a
    //     shallower level) nor a line break (a heading always sits on a single line);
    //   - trailing spaces/tabs after the closing '=' run are tolerated;
    //   - the heading ends at a run of line breaks or at the very end of the input, so a heading
    //     on the last line without a trailing newline is still recognised.
    const string RawPattern = "^={{{0}}}([^=\r\n]+?)={{{0}}}[ \t]*(?:[\r\n]+|\\z)";
    var pattern = string.Format(RawPattern, level);
    var matches = Regex.Matches(markup, pattern, RegexOptions.Multiline);

    if (matches.Count == 0)
    {
        // Leaf section: no headings at this level, everything is content.
        return new WikiSection
        {
            SectionName = title,
            Content = markup,
            SubSections = new List<WikiSection>()
        };
    }

    var section = new WikiSection
    {
        SectionName = title,
        SubSections = new List<WikiSection>()
    };

    // Anything before the first heading belongs to this section itself.
    if (matches[0].Index > 0)
    {
        section.Content = markup.Substring(0, matches[0].Index);
    }

    for (var i = 0; i < matches.Count; i++)
    {
        var heading = matches[i];
        var subsectionName = heading.Groups[1].Value;

        // The subsection body runs from just after this heading (including its line breaks)
        // up to the start of the next heading, or to the end of the markup for the last one.
        var contentStartIndex = heading.Index + heading.Length;
        var contentEndIndex = i + 1 < matches.Count ? matches[i + 1].Index : markup.Length;
        var subsectionContent = markup.Substring(contentStartIndex, contentEndIndex - contentStartIndex);

        section.SubSections.Add(ParseSection(subsectionName, subsectionContent, level + 1));
    }

    return section;
}