/// <summary>
/// Gets WikiLinks from a section
/// </summary>
/// <param name="lines">Lines of wiki markup to scan for links</param>
/// <param name="source">Section the lines belong to; recorded as each link's source</param>
/// <returns>Sorted list of links for section</returns>
public static List<WikiLink> GetWikiLinks(List<string> lines, Section source)
{
    List<WikiLink> wikiLinks = new List<WikiLink>();

    // only the wiki-link pattern is needed here; the subsection and emphasis
    // regexes fetched by the original code were never used and have been removed
    Regex wikiLinkRE = WikiMarkup.GetWikiLinkRE();

    foreach (string line in lines)
    {
        foreach (Match linkMatch in wikiLinkRE.Matches(line))
        {
            // build a link from the named capture groups of the match
            string rawLink = linkMatch.Value;
            string page = linkMatch.Groups["LinkDest"].Value.Trim();
            string section = linkMatch.Groups["Section"].Value.Trim();
            string displayText = linkMatch.Groups["DisplayText"].Value.Trim();

            wikiLinks.Add(new WikiLink(rawLink, page, section, displayText, source));
        }
    }

    // callers rely on the returned list being sorted (see RemoveDuplicateLinks)
    wikiLinks.Sort();

    return wikiLinks;
}
/// <summary>
/// CompareTo function; orders by weight, breaking ties by destination page URL
/// </summary>
/// <param name="obj">Object to compare this WikiLink to</param>
/// <returns>1 if this gt obj, -1 if this lt obj, and 0 o.w.; any non-WikiLink sorts before this instance</returns>
public int CompareTo(object obj)
{
    if (!(obj is WikiLink))
    {
        return 1;
    }

    WikiLink wl = (WikiLink)obj;

    // primary key: weight
    int weightCmp = _weight.CompareTo(wl.Weight);
    if (weightCmp != 0)
    {
        return weightCmp;
    }

    // tie-breaker: destination URL — only computed when actually needed,
    // unlike the original which compared URLs unconditionally
    return DestPageURL.CompareTo(wl.DestPageURL);
}
/// <summary>
/// Remove duplicate links from a list of links
/// </summary>
/// <param name="links">List to dedupe in place; it is sorted as a side effect</param>
public static void RemoveDuplicateLinks(List<WikiLink> links)
{
    // sort so that equal links end up adjacent, then collapse each run
    links.Sort();

    for (int i = 0; i < links.Count; ++i)
    {
        WikiLink link1 = links[i];

        // remove following entries while they equal links[i]; sorted order lets
        // us stop at the first mismatch. Use the overridden Equals rather than
        // ==, which falls back to reference comparison absent an operator
        // overload and would leave semantically-equal duplicates in the list.
        // NOTE(review): CompareTo keys on (weight, DestPageURL) while Equals
        // also checks the source page/section — confirm duplicates always
        // sort adjacently under this ordering.
        for (int j = i + 1; j < links.Count;)
        {
            if (link1.Equals(links[j]))
            {
                links.RemoveAt(j);
            }
            else
            {
                break;
            }
        }
    }
}
/// <summary>
/// Constructor for marked up section. Stores the raw lines, recursively builds
/// the subsection tree, harvests wiki links that appear before the first
/// subsection, and finally strips the markup from the stored lines.
/// </summary>
/// <param name="name">Name of section</param>
/// <param name="wikiTextLines">List of lines for the page; must be non-null and non-empty</param>
/// <param name="parentSection">Parent section (null for a top-level section)</param>
/// <param name="containingPage">Page that contains this section</param>
public Section(string name, List <string> wikiTextLines, Section parentSection, Page containingPage) : this()
{
    // reject empty sections up front
    if (wikiTextLines == null || wikiTextLines.Count == 0)
    {
        throw new Exception("Cannot create Section with blank text");
    }

    _lines = wikiTextLines;
    _parentSection = parentSection;
    _name = name;
    _containingPage = containingPage;

    // include the first line if it's not a section line (the case for headers):
    // skip a leading "== heading ==" line when recursing for subsections
    Regex sectionRE = WikiMarkup.GetSubsectionRE();
    int secLinesStart;
    if (sectionRE.Match(_lines[0]).Success)
    {
        secLinesStart = 1;
    }
    else
    {
        secLinesStart = 0;
    }

    // recursively build subsection tree from everything after the heading line
    List <string> subsectionLines = ExtractLines(_lines, secLinesStart, _lines.Count - secLinesStart);
    List <Section> subsections = ExtractSections(subsectionLines, this, _containingPage);
    _subSections.AddRange(subsections);

    // get links prior to first subsection — lines at and beyond the first
    // subsection belong to (and are linked from) the child sections
    int subStart = GetNextSectionStart(_lines, secLinesStart);
    List <string> preSubsectionLines = ExtractLines(_lines, 0, subStart - 1);
    _wikiLinks = WikiLink.GetWikiLinks(preSubsectionLines, this);

    // remove markup — must run AFTER link extraction, since GetWikiLinks
    // matches against the raw markup that ProcessMarkup strips from _lines
    WikiMarkup.ProcessMarkup(_lines);
}
/// <summary>
/// Equality check: two links are equal when they come from the same source
/// page and section and point at the same destination (compared by loaded
/// Page objects when both are available, otherwise by destination URL).
/// </summary>
/// <param name="obj">Object to compare to</param>
/// <returns>True if obj equals this WikiLink, False otherwise</returns>
public override bool Equals(object obj)
{
    if (!(obj is WikiLink))
    {
        return(false);
    }

    WikiLink other = (WikiLink)obj;

    // sources must match on both page and section ID
    bool sameSource = SourcePage == other.SourcePage &&
                      SourceSection.ID == other.SourceSection.ID;
    if (!sameSource)
    {
        return(false);
    }

    // prefer comparing loaded destination pages; fall back to URLs
    bool bothLoaded = DestPage != null && other.DestPage != null;
    return(bothLoaded ? DestPage == other.DestPage : DestPageURL == other.DestPageURL);
}
/// <summary>
/// Looks up page in database, serving from the in-memory cache when the cached
/// copy already has everything the caller asked for. Optionally follows
/// redirects (recursively), reads the page's outgoing links, and reads its
/// term-frequency table.
/// </summary>
/// <param name="ns">Namespace to look page up in</param>
/// <param name="title">Title of page to look up</param>
/// <param name="followRedirection">Whether or not to look up destination of redirect pages</param>
/// <param name="recoverSections">Whether or not to recover the section structure of the page</param>
/// <param name="readLinks">Read link information</param>
/// <param name="readTFTable">Read term frequency table</param>
/// <returns>Page instance, or null when no row matches the title</returns>
public Page LookupPage(WikiDB.Namespace ns, string title, bool followRedirection, bool recoverSections, bool readLinks, bool readTFTable)
{
    string url = GetURLFromPageTitle(title);

    // check page cache
    if (_pageCache.ContainsKey(url))
    {
        // see if page is dirty: cached copy lacks something this call requires
        Page cached = (Page)_pageCache[url];
        bool dirty = (recoverSections && !cached.SectionsRecovered) ||
                     (readLinks && cached.WikiLinks.Count == 0) ||
                     (readTFTable && cached.TermFrequencies.Count == 0);
        if (!dirty)
        {
            return(cached);
        }
        else
        {
            // evict stale entry and re-read from the database below
            _pageCache.Remove(url);
        }
    }

    if (!CheckConnection(true))
    {
        throw new Exception("Could not establish connection with Wikipedia database");
    }

    Page p = null;
    // NOTE(review): queries below are built by string concatenation; url is
    // escaped via MySQLEscape and nsVal is an int, but parameterized queries
    // would be safer — confirm MySQLEscape covers all metacharacters.
    url = MySQLEscape(url);
    int nsVal = NamespaceValue(ns);

    // get text and redirect page
    string selectCols = "page_id, page_text, redirects_to";
    if (recoverSections)
    {
        // section layout column only needed when rebuilding section structure
        selectCols += ", section_layout";
    }

    string query = "SELECT " + selectCols + " FROM page " +
                   "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

    // NOTE(review): reader is not wrapped in using, so an exception mid-read
    // leaks it; also the success path calls reader.Close() twice (inside the
    // if-branch and again after it) — harmless for most providers but worth
    // cleaning up.
    IDataReader reader = SubmitQuery(query);
    if (reader.Read())
    {
        int id = int.Parse(reader["page_id"].ToString());
        // page text is stored as a UTF-8 byte blob
        string text = Encoding.UTF8.GetString((byte[])reader["page_text"]);
        string redirectsTo = reader["redirects_to"].ToString();

        // split into lines
        List <string> lines = new List <string>(text.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries));

        if (recoverSections)
        {
            // rebuild the stored section tree from its serialized layout
            string sectionLayout = reader["section_layout"].ToString();
            p = new Page(title, ns, id, lines, sectionLayout);
        }
        else
        {
            p = new Page(title, ns, id);
            p.Lines = lines;

            // add a single section to the page covering all lines
            Section s = new Section("full page section", lines, null, p, 0, lines.Count - 1);
            p.Sections.Add(s);
        }

        // close before the (potentially recursive) redirect lookup below,
        // which issues its own queries
        reader.Close();

        // check for page redirection
        Page redirectPage = null;
        if (redirectsTo != "" && followRedirection)
        {
            // recursive lookup of the redirect target with the same options
            redirectPage = LookupPage(ns, redirectsTo, followRedirection, recoverSections, readLinks, readTFTable);
        }

        p.RedirectsTo = redirectPage;
    }
    else
    {
        // no such page
        reader.Close();
        return(null);
    }

    reader.Close();

    // get links
    if (readLinks)
    {
        query = "SELECT link_list FROM pagelinks " +
                "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";
        reader = SubmitQuery(query);
        if (reader.Read())
        {
            // link_list is a newline-separated list of "destPage sectionID" pairs
            string linkList = Encoding.UTF8.GetString((byte[])reader["link_list"]);
            string[] splitList = linkList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string link in splitList)
            {
                string[] entry = link.Split(' ');
                if (entry.Length != 2)
                {
                    throw new Exception("Invalid link entry");
                }

                string destPage = entry[0];
                // without recovered sections everything hangs off section "1"
                string sourceSectionID = recoverSections ? entry[1] : "1";
                Section s = p.GetSection(sourceSectionID);
                WikiLink wl = new WikiLink("[[" + destPage + "]]", destPage, "", destPage, s);
                s.AddLink(wl);
            }
        }

        reader.Close();
    }

    // get TFs
    if (readTFTable)
    {
        query = "SELECT freq_list FROM termfreqs WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";
        reader = SubmitQuery(query);
        if (reader.Read())
        {
            // freq_list is a newline-separated list of "word frequency" pairs
            string freqList = Encoding.UTF8.GetString((byte[])reader["freq_list"]);
            string[] splitList = freqList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

            // get freqs
            for (int i = 0; i < splitList.Length; ++i)
            {
                string[] entry = splitList[i].Split(' ');
                if (entry.Length != 2)
                {
                    throw new Exception("Invalid frequency entry");
                }

                string word = entry[0];
                float freq = float.Parse(entry[1]);
                if (p.TermFrequencies.ContainsKey(word))
                {
                    // the stored table should never repeat a word
                    throw new Exception("Duplicate TF entry");
                }

                p.TermFrequencies[word] = freq;
            }
        }

        reader.Close();
    }

    // add page to cache
    CachePage(p);

    return(p);
}
/// <summary>
/// Adds a link to this section's link collection
/// </summary>
/// <param name="wl">Link to record for this section</param>
public void AddLink(WikiLink wl) => _wikiLinks.Add(wl);