/// <summary>
/// Builds one Section per "section" element found in a layout description.
/// Each element's "start" and "end" attributes are shifted by the parent's
/// SectionStart (or by 0 when there is no parent) and the resulting range is
/// handed to Section.ExtractLines to pick that section's lines out of
/// <paramref name="lines"/>.
/// </summary>
/// <param name="lines">Lines to extract from</param>
/// <param name="parent">Parent section for top-level sections; may be null</param>
/// <param name="containingPage">Page that contains these sections</param>
/// <param name="sectionLayout">Layout of sections, as produced by Section.Layout.</param>
/// <returns>List of Section instances, in the order the layout parser yields them</returns>
public static List<Section> ExtractSections(List<string> lines, Section parent, Page containingPage, string sectionLayout)
{
    List<Section> result = new List<Section>();
    XmlParser layoutParser = new XmlParser(sectionLayout);

    // OuterXML returns null once no further "section" element is available.
    for (string sectionXml = layoutParser.OuterXML("section");
         sectionXml != null;
         sectionXml = layoutParser.OuterXML("section"))
    {
        XmlParser sectionParser = new XmlParser(sectionXml);

        // Layout positions are rebased against the parent's start position.
        int offset = 0;
        if (parent != null)
        {
            offset = parent.SectionStart;
        }

        int layoutStart = int.Parse(sectionParser.AttributeValue("section", "start"));
        int layoutEnd = int.Parse(sectionParser.AttributeValue("section", "end"));

        // Same range as start .. start + (end - start + 1) - 1, written directly.
        int firstLine = layoutStart - offset;
        int lastLine = layoutEnd - offset;

        List<string> sectionLines = Section.ExtractLines(lines, firstLine, lastLine);
        result.Add(new Section(sectionLines, sectionXml, parent, containingPage));
    }

    return result;
}
/// <summary>
/// Constructor. Cleans the wiki text, splits it into lines, builds the header
/// and section objects, optionally resolves a redirect, processes the remaining
/// markup and finally counts term frequencies over all section lines.
/// </summary>
/// <param name="title">Title of page</param>
/// <param name="ns">Namespace of this page</param>
/// <param name="ID">ID for this page</param>
/// <param name="wikiText">Wiki text for page</param>
/// <param name="database">Database containing this page; used to look up the redirect target</param>
/// <param name="followRedirection">Whether or not to follow redirection</param>
public Page(string title, WikiDB.Namespace ns, int ID, string wikiText, WikiDB database, bool followRedirection)
    : this(title, ns, ID)
{
    // remove irrelevant markup
    wikiText = WikiMarkup.RemoveIrrelevantMarkup(wikiText);

    // split page up into lines
    _lines = Section.GetLines(wikiText);

    // lines that precede the first section start form the page header
    int firstSectionStart = Section.GetNextSectionStart(_lines, 0);
    List<string> headerLines = Section.ExtractLines(_lines, 0, firstSectionStart - 1);
    if (headerLines.Count > 0)
    {
        Header h = new Header(headerLines, this);
        _sections.Add(h);
    }

    // get sections
    _sections.AddRange(Section.ExtractSections(_lines, null, this));

    // check for redirect page
    string firstLine = "";
    if (_lines.Count > 0)
    {
        firstLine = _lines[0];
    }

    // The keyword is matched with an ordinal, case-insensitive comparison so the
    // result does not depend on the current culture (a culture-sensitive ToLower()
    // turns "I" into a dotless "ı" under Turkish/Azeri cultures and would miss
    // "#REDIRECT").
    string redirect = "#redirect";
    if (firstLine.StartsWith(redirect, System.StringComparison.OrdinalIgnoreCase) &&
        WikiLinks.Count == 1 &&
        followRedirection)
    {
        // get redirect page
        string redirectURL = WikiLinks[0].DestPageURL;
        _redirectsTo = database.LookupPage(ns, redirectURL, followRedirection);
    }

    // process markup
    WikiMarkup.ProcessMarkup(_lines);

    // set line information for the page
    SetLineInfo();

    // get TF information: count every token that survives trimming and is not a stop word
    foreach (Section s in _sections)
    {
        foreach (string line in s.Lines)
        {
            string[] tokens = line.Split(' ');
            foreach (string token in tokens)
            {
                // ignore case
                // NOTE(review): ToLower() uses the current culture. Left unchanged because
                // these keys may be matched by code outside this constructor - confirm
                // before switching to ToLowerInvariant().
                string lowerToken = token.ToLower().Trim();
                lowerToken = WikiMarkup.TrimPunctuation(lowerToken);

                if (lowerToken == "" || WikiMarkup.IsStopWord(lowerToken, false))
                {
                    continue;
                }

                if (!_termFrequencies.ContainsKey(lowerToken))
                {
                    _termFrequencies[lowerToken] = 1.0F;
                }
                else
                {
                    _termFrequencies[lowerToken] = _termFrequencies[lowerToken] + 1;
                }
            }
        }
    }
}