/// <summary>
/// Gets WikiLinks from a section
/// </summary>
/// <param name="lines">Lines of wiki text to scan for links</param>
/// <param name="source">Section recorded as the source of every link that is found</param>
/// <returns>List of links for section, sorted with the default ordering of WikiLink</returns>
public static List<WikiLink> GetWikiLinks(List<string> lines, Section source)
{
    List<WikiLink> wikiLinks = new List<WikiLink>();

    // only the link pattern is needed here; the subsection and emphasis
    // patterns that used to be fetched alongside it were never read
    Regex wikiLinkRE = WikiMarkup.GetWikiLinkRE();

    foreach (string line in lines)
    {
        // a single line may contain any number of links
        foreach (Match linkMatch in wikiLinkRE.Matches(line))
        {
            // create wiki link from the named groups of the match
            string rawLink = linkMatch.Value;
            string page = linkMatch.Groups["LinkDest"].ToString().Trim();
            string section = linkMatch.Groups["Section"].ToString().Trim();
            string displayText = linkMatch.Groups["DisplayText"].ToString().Trim();

            wikiLinks.Add(new WikiLink(rawLink, page, section, displayText, source));
        }
    }

    wikiLinks.Sort();
    return wikiLinks;
}
/// <summary>
/// Locates the last line that belongs to the section whose heading is at <paramref name="start"/>.
/// </summary>
/// <param name="lines">Lines to search</param>
/// <param name="start">Index of the heading line. It must match the subsection pattern (e.g., ==Section X==)</param>
/// <returns>Index of the last line in the section</returns>
public static int GetSectionEnd(List<string> lines, int start)
{
    Regex headingRE = WikiMarkup.GetSubsectionRE();

    // the heading line determines how deep this section is nested
    Match heading = headingRE.Match(lines[start]);
    if (!heading.Success)
    {
        throw new Exception("Invalid starting line for section");
    }

    int ownDepth = heading.Groups["SectionStart"].Length;

    // walk forward until a heading of the same or a shallower depth shows up;
    // deeper headings are subsections and stay inside this section
    int cursor = start + 1;
    while (cursor < lines.Count)
    {
        Match candidate = headingRE.Match(lines[cursor]);
        if (candidate.Success && candidate.Groups["SectionStart"].Length <= ownDepth)
        {
            break;
        }

        ++cursor;
    }

    // cursor sits on the next sibling heading (or one past the end of the list)
    return cursor - 1;
}
/// <summary>
/// Runs markup processing over this page's lines and then asks every
/// section of the page to remove its own markup
/// </summary>
public void RemoveMarkup()
{
    // page-level lines first
    WikiMarkup.ProcessMarkup(_lines);

    // then each section in turn
    foreach (Section section in _sections)
    {
        section.RemoveMarkup();
    }
}
/// <summary>
/// Writes the titleindex table: for every non-stop word of every title, a
/// (word, namespace, title) row is inserted unless an identical row already exists.
/// </summary>
/// <param name="ns">Namespace to read/write from/to</param>
/// <param name="titles">List of titles to write. Titles that are blank after trimming are skipped</param>
/// <returns>
/// Last title written, in its escaped form. This is the value of the
/// _lastTitleWritten field, which is left untouched when no word was processed.
/// </returns>
/// <exception cref="Exception">
/// A query failed. The message names the title; the original error is kept as the inner exception.
/// </exception>
public string WriteTitleIndex(Namespace ns, List<string> titles)
{
    int nsVal = NamespaceValue(ns);

    // process titles
    foreach (string title in titles)
    {
        if (title.Trim() == "")
        {
            continue;
        }

        // Escape the title exactly once. Escaping it again for every word (as
        // happens when the escaped value is written back into the loop variable)
        // double-escapes titles that contain more than one indexed word.
        string escapedTitle = MySQLEscape(title);

        string[] words = title.Split(new char[] { '_', '\\', '/' }, StringSplitOptions.RemoveEmptyEntries);

        // process words in title
        foreach (string rawWord in words)
        {
            string word = WikiMarkup.TrimPunctuation(rawWord.Trim().ToLower());
            if (WikiMarkup.IsStopWord(word, true))
            {
                continue;
            }

            word = MySQLEscape(word);

            try
            {
                // NOTE(review): the SQL is assembled by concatenation and relies on
                // MySQLEscape. SubmitQuery/ExecuteNonQuery are only seen taking a SQL
                // string here, so parameterizing would need changes outside this method.
                bool alreadyIndexed;
                using (IDataReader reader = SubmitQuery("SELECT word FROM titleindex " +
                                                        "WHERE word=\"" + word + "\" AND page_namespace=" + nsVal + " AND page_title=\"" + escapedTitle + "\""))
                {
                    alreadyIndexed = reader.Read();
                }

                // the reader is closed before the insert is issued, as before
                if (!alreadyIndexed)
                {
                    ExecuteNonQuery("INSERT INTO titleindex (word, page_namespace, page_title) " +
                                    "VALUES (\"" + word + "\", " + nsVal + ", \"" + escapedTitle + "\")");
                }

                _lastTitleWritten = escapedTitle;
            }
            catch (Exception ex)
            {
                // same exception type and message as before, now with the cause attached
                throw new Exception("Failed at title " + escapedTitle + ", error: " + ex, ex);
            }
        }
    }

    return _lastTitleWritten;
}
/// <summary>
/// Creates a link from its parsed parts
/// </summary>
/// <param name="rawLink">Raw text for link</param>
/// <param name="pageTitle">Title of page being linked to</param>
/// <param name="pageSection">Section of page being linked to (empty string when the link names no section)</param>
/// <param name="displayText">Display text for link. When empty, the text falls back to the page title, followed by "#" and the section if one is given</param>
/// <param name="sourceSection">Section of source page that contains this link</param>
public WikiLink(string rawLink, string pageTitle, string pageSection, string displayText, Section sourceSection)
{
    _rawLink = rawLink;
    _destPageTitle = pageTitle;
    _destPageSection = pageSection;

    // choose the text to show: the explicit display text wins, otherwise "Title" or "Title#Section"
    string shownText;
    if (displayText != "")
    {
        shownText = displayText;
    }
    else
    {
        string anchor = _destPageSection != "" ? "#" + _destPageSection : "";
        shownText = pageTitle + anchor;
    }

    _displayText = WikiMarkup.ProcessMarkup(shownText);

    _destPageNamespace = WikiDB.Namespace.Main;

    _sourcePage = sourceSection.ContainingPage;
    _sourceSection = sourceSection;

    _weight = 0;
}
/// <summary>
/// Finds the index of the first section heading at or after a given position in a list of lines
/// </summary>
/// <param name="lines">List of lines to search</param>
/// <param name="start">Where to start search</param>
/// <returns>
/// Index of the first line from <paramref name="start"/> onwards that matches the
/// subsection pattern. When no such line exists the scan stops at the end of the list,
/// so the result is not a valid index.
/// </returns>
public static int GetNextSectionStart(List<string> lines, int start)
{
    Regex headingRE = WikiMarkup.GetSubsectionRE();

    int index = start;
    for (; index < lines.Count; ++index)
    {
        if (headingRE.IsMatch(lines[index]))
        {
            break;
        }
    }

    return index;
}
/// <summary>
/// Constructor for marked up section. Builds the subsection tree, collects the
/// links that appear before the first subsection, and then processes the markup
/// of the section's lines.
/// </summary>
/// <param name="name">Name of section</param>
/// <param name="wikiTextLines">List of lines for the section. The list is stored as-is and is passed to markup processing at the end of construction</param>
/// <param name="parentSection">Parent section</param>
/// <param name="containingPage">Page that contains this section</param>
/// <exception cref="Exception"><paramref name="wikiTextLines"/> is null or empty</exception>
public Section(string name, List<string> wikiTextLines, Section parentSection, Page containingPage)
    : this()
{
    if (wikiTextLines == null || wikiTextLines.Count == 0)
    {
        throw new Exception("Cannot create Section with blank text");
    }

    _lines = wikiTextLines;
    _parentSection = parentSection;
    _name = name;
    _containingPage = containingPage;

    // include the first line if it's not a section line (the case for headers)
    Regex sectionRE = WikiMarkup.GetSubsectionRE();
    int secLinesStart = sectionRE.Match(_lines[0]).Success ? 1 : 0;

    // Every other call to ExtractLines passes the index of the last line wanted
    // (e.g. the result of GetSectionEnd), so pass the last index here as well.
    // The previous argument, Count - secLinesStart, pointed one past the end
    // whenever the first line was not a heading.
    // NOTE(review): ExtractLines is not visible from here - confirm its third parameter is an inclusive end index.
    List<string> subsectionLines = ExtractLines(_lines, secLinesStart, _lines.Count - 1);
    List<Section> subsections = ExtractSections(subsectionLines, this, _containingPage);
    _subSections.AddRange(subsections);

    // get links prior to first subsection (the section's own first line is included)
    int subStart = GetNextSectionStart(_lines, secLinesStart);
    List<string> preSubsectionLines = ExtractLines(_lines, 0, subStart - 1);
    _wikiLinks = WikiLink.GetWikiLinks(preSubsectionLines, this);

    // remove markup; done last so that link extraction above sees the raw text
    WikiMarkup.ProcessMarkup(_lines);
}
/// <summary>
/// Builds a Section for every top-level section found in a list of lines. Each
/// Section is constructed from its own range of lines, which in turn extracts
/// its subsections (and their subsections, etc.).
/// </summary>
/// <param name="lines">Lines to extract from</param>
/// <param name="parent">Parent section for top-level sections</param>
/// <param name="containingPage">Page that contains these sections</param>
/// <returns>List of Section instances; empty when <paramref name="lines"/> is null or has no lines</returns>
public static List<Section> ExtractSections(List<string> lines, Section parent, Page containingPage)
{
    List<Section> found = new List<Section>();

    if (lines != null && lines.Count > 0)
    {
        Regex headingRE = WikiMarkup.GetSubsectionRE();

        int first = GetNextSectionStart(lines, 0);
        while (first < lines.Count)
        {
            // the line we landed on has to be a heading
            Match heading = headingRE.Match(lines[first]);
            if (!heading.Success)
            {
                throw new Exception("Invalid starting line for section");
            }

            // carve out the lines from the heading to the end of the section
            int last = GetSectionEnd(lines, first);
            List<string> body = ExtractLines(lines, first, last);

            // the Section constructor takes care of nested subsections
            string title = WikiMarkup.ProcessMarkup(heading.Groups["Name"].Value);
            found.Add(new Section(title, body, parent, containingPage));

            // continue with whatever follows this section
            first = GetNextSectionStart(lines, last + 1);
        }
    }

    return found;
}
/// <summary>
/// Constructor. Splits the wiki text into lines and sections, resolves a
/// redirect if requested, processes the markup and counts term frequencies.
/// </summary>
/// <param name="title">Title of page</param>
/// <param name="ns">Namespace of this page</param>
/// <param name="ID">ID for this page</param>
/// <param name="wikiText">Wiki text for page</param>
/// <param name="database">Database containing this page. Only used to look up the target of a redirect</param>
/// <param name="followRedirection">Whether or not to follow redirection</param>
public Page(string title, WikiDB.Namespace ns, int ID, string wikiText, WikiDB database, bool followRedirection)
    : this(title, ns, ID)
{
    // remove irrelevant markup
    wikiText = WikiMarkup.RemoveIrrelevantMarkup(wikiText);

    // split page up into lines
    _lines = Section.GetLines(wikiText);

    // everything before the first section heading becomes the header
    int firstSectionStart = Section.GetNextSectionStart(_lines, 0);
    List<string> headerLines = Section.ExtractLines(_lines, 0, firstSectionStart - 1);
    if (headerLines.Count > 0)
    {
        Header h = new Header(headerLines, this);
        _sections.Add(h);
    }

    // get sections
    _sections.AddRange(Section.ExtractSections(_lines, null, this));

    // check for redirect page
    string firstLine = _lines.Count > 0 ? _lines[0] : "";
    string redirect = "#redirect";

    // The keyword is compared ordinally and case-insensitively. Lower-casing with
    // the current culture breaks under cultures such as tr-TR, where "I" does not
    // lower-case to "i" and "#REDIRECT" would not be recognised.
    if (firstLine.StartsWith(redirect, StringComparison.OrdinalIgnoreCase) &&
        WikiLinks.Count == 1 &&
        followRedirection)
    {
        // get redirect page; a redirect is only followed when the page has exactly one link
        string redirectURL = WikiLinks[0].DestPageURL;
        _redirectsTo = database.LookupPage(ns, redirectURL, followRedirection);
    }

    // process markup
    WikiMarkup.ProcessMarkup(_lines);

    // set line information for the page
    SetLineInfo();

    // get TF information
    foreach (Section s in _sections)
    {
        foreach (string line in s.Lines)
        {
            string[] tokens = line.Split(' ');
            foreach (string token in tokens)
            {
                // ignore case
                string lowerToken = token.ToLower().Trim();
                lowerToken = WikiMarkup.TrimPunctuation(lowerToken);

                // tokens that are empty after trimming, and stop words, are not counted
                if (lowerToken == "" || WikiMarkup.IsStopWord(lowerToken, false))
                {
                    continue;
                }

                if (!_termFrequencies.ContainsKey(lowerToken))
                {
                    _termFrequencies[lowerToken] = 1.0F;
                }
                else
                {
                    _termFrequencies[lowerToken] = _termFrequencies[lowerToken] + 1;
                }
            }
        }
    }
}