/// <summary>
/// Constructor
/// </summary>
/// <param name="title">Title of page</param>
/// <param name="ns">Namespace of this page</param>
/// <param name="ID">ID for this page</param>
/// <param name="lines">Lines contained in this page</param>
/// <param name="sectionLayout">Layout of sections (see the Section class for documentation)</param>
public Page(string title, WikiDB.Namespace ns, int ID, List<string> lines, string sectionLayout)
    : this(title, ns, ID)
{
    _lines = lines;
    _sections = Section.ExtractSections(lines, null, this, sectionLayout);
    _sectionsRecovered = true;
}
/// <summary>
/// Constructor
/// </summary>
/// <param name="title">Title of page</param>
/// <param name="ns">Namespace of this page</param>
/// <param name="ID">ID for this page</param>
public Page(string title, WikiDB.Namespace ns, int ID)
    : this()
{
    _title = title;
    _namespace = ns;
    _ID = ID;
}
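// Usage sketch (illustrative only; the title, ID, and lines below are made-up
// values, and "storedLayout" stands in for a real layout string whose format
// is documented in the Section class): the three-argument constructor builds
// an empty shell, while the five-argument form rebuilds the section tree from
// stored lines, as LookupPage does when recoverSections is true.
List<string> storedLines = new List<string> { "==History==", "Jazz originated in..." };
Page fromStore = new Page("Jazz", WikiDB.Namespace.Main, 42, storedLines, storedLayout);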
/// <summary>
/// Gets titles containing a word
/// </summary>
/// <param name="ns">Namespace to search</param>
/// <param name="s">Word to look for</param>
/// <returns>List of page titles (strings) containing the word</returns>
public override List<string> GetTitlesContaining(WikiDB.Namespace ns, string s)
{
    if (!Connected)
    {
        throw new Exception("Not connected to mirror DB");
    }

    s = MySQLEscape(s);

    List<string> titles = new List<string>();
    if (s == "")
    {
        return titles;
    }

    int nsVal = NamespaceValue(ns);
    IDataReader reader = SubmitQuery("SELECT page_title FROM titleindex " +
                                     "WHERE word=\"" + s + "\" AND page_namespace=" + nsVal);

    while (reader.Read())
    {
        titles.Add(reader["page_title"].ToString());
    }

    reader.Close();

    return titles;
}
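// Usage sketch ("mirror" is a hypothetical connected mirror-DB instance, and
// the search word is an example): list every main-namespace title whose
// indexed words include "jazz".
List<string> hits = mirror.GetTitlesContaining(WikiDB.Namespace.Main, "jazz");
foreach (string hit in hits)
{
    Console.WriteLine(hit);
}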
/// <summary>
/// Constructor
/// </summary>
/// <param name="rawLink">Raw text for link</param>
/// <param name="pageTitle">Title of page being linked to</param>
/// <param name="pageSection">Section of page being linked to</param>
/// <param name="displayText">Display text for link</param>
/// <param name="sourceSection">Section of source page that contains this link</param>
public WikiLink(string rawLink, string pageTitle, string pageSection, string displayText, Section sourceSection)
{
    _rawLink = rawLink;
    _destPageTitle = pageTitle;
    _destPageSection = pageSection;

    // when no display text is given, fall back to the "Title#Section" form
    _displayText = displayText != "" ? displayText
                                     : (pageTitle + (_destPageSection != "" ? "#" + _destPageSection : ""));
    _displayText = WikiMarkup.ProcessMarkup(_displayText);

    _destPageNamespace = WikiDB.Namespace.Main;
    _sourcePage = sourceSection.ContainingPage;
    _sourceSection = sourceSection;
    _weight = 0;
}
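// Usage sketch (values are illustrative; "sourceSection" is assumed to be a
// Section from a parsed page): with an empty display text, the link falls
// back to the "Title#Section" form, so this one displays as "Jazz#History".
WikiLink link = new WikiLink("[[Jazz#History]]", "Jazz", "History", "", sourceSection);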
/// <summary>
/// Writes the page index table
/// </summary>
/// <param name="startID">Where to start writing</param>
/// <param name="numPages">Total number of pages to write</param>
/// <param name="blockSize">Number of pages to read at a time</param>
/// <param name="ns">Namespace to read/write from/to</param>
/// <param name="includeRedirects">Whether or not to include redirect pages</param>
/// <returns>Last title written</returns>
public string WritePageIndex(int startID, int numPages, int blockSize, WikiDB.Namespace ns, bool includeRedirects)
{
    if (_mainDB == null)
    {
        throw new Exception("Main DB not connected. Use the right constructor.");
    }

    if (!_mainDB.CheckConnection(true))
    {
        throw new Exception("Not connected to main DB");
    }

    if (!CheckConnection(true))
    {
        throw new Exception("Not connected to mirror DB");
    }

    _stopWriting = false;

    int end = startID + numPages - 1;
    int i;

    // check block size
    if (blockSize > numPages)
    {
        blockSize = numPages;
    }

    for (i = startID; i <= end; i += blockSize)
    {
        if (_stopWriting)
        {
            break;
        }

        // get titles
        List<string> titles = null;
        try
        {
            titles = _mainDB.GetTitleRange(ns, i, blockSize, includeRedirects);
            WritePageIndex(ns, titles);
        }
        catch (Exception ex)
        {
            throw new Exception("Failed at start ID " + i + ". Error: " + ex);
        }
    }

    return _lastTitleWritten;
}
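// Usage sketch (the ID range and block size are arbitrary examples; "mirror"
// is a hypothetical instance built with a main-DB connection): index 10000
// pages starting at ID 1, reading 500 titles per round trip, skipping redirects.
string lastIndexed = mirror.WritePageIndex(1, 10000, 500, WikiDB.Namespace.Main, false);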
/// <summary>
/// Gets whether or not a page is a redirect page
/// </summary>
/// <param name="ns">Namespace to search</param>
/// <param name="title">Title to search</param>
/// <returns>True if the page is a redirect page, false otherwise</returns>
public override bool IsRedirect(WikiDB.Namespace ns, string title)
{
    title = MySQLEscape(GetURLFromPageTitle(title));
    if (title == "")
    {
        return false;
    }

    int nsVal = NamespaceValue(ns);
    IDataReader reader = SubmitQuery("SELECT redirects_to FROM page " +
                                     "WHERE page_namespace=" + nsVal + " AND page_title=\"" + title + "\"");

    if (reader.Read())
    {
        string redirectsTo = reader["redirects_to"].ToString();
        reader.Close();
        return redirectsTo != "";
    }
    else
    {
        reader.Close();
        return false;
    }
}
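// Usage sketch ("mirror" is a hypothetical connected instance): test whether
// "Colour" is a redirect stub before fetching its full text.
bool isStub = mirror.IsRedirect(WikiDB.Namespace.Main, "Colour");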
/// <summary>
/// Constructor
/// </summary>
/// <param name="title">Title of page</param>
/// <param name="ns">Namespace of this page</param>
/// <param name="ID">ID for this page</param>
/// <param name="wikiText">Wiki text for page</param>
/// <param name="database">Database containing this page</param>
/// <param name="followRedirection">Whether or not to follow redirection</param>
public Page(string title, WikiDB.Namespace ns, int ID, string wikiText, WikiDB database, bool followRedirection)
    : this(title, ns, ID)
{
    // remove irrelevant markup
    wikiText = WikiMarkup.RemoveIrrelevantMarkup(wikiText);

    // split page up into lines
    _lines = Section.GetLines(wikiText);

    // everything before the first section heading becomes the page header
    int firstSectionStart = Section.GetNextSectionStart(_lines, 0);
    List<string> headerLines = Section.ExtractLines(_lines, 0, firstSectionStart - 1);
    if (headerLines.Count > 0)
    {
        Header h = new Header(headerLines, this);
        _sections.Add(h);
    }

    // get sections
    _sections.AddRange(Section.ExtractSections(_lines, null, this));

    // check for redirect page
    string firstLine = "";
    if (_lines.Count > 0)
    {
        firstLine = _lines[0];
    }

    string redirect = "#redirect";
    if (firstLine.Length >= redirect.Length &&
        firstLine.Substring(0, redirect.Length).ToLower() == redirect &&
        WikiLinks.Count == 1 &&
        followRedirection)
    {
        // get redirect page
        string redirectURL = WikiLinks[0].DestPageURL;
        _redirectsTo = database.LookupPage(ns, redirectURL, followRedirection);
    }

    // process markup
    WikiMarkup.ProcessMarkup(_lines);

    // set line information for the page
    SetLineInfo();

    // build term frequency information, ignoring case, punctuation, and stop words
    foreach (Section s in _sections)
    {
        foreach (string line in s.Lines)
        {
            string[] tokens = line.Split(' ');
            foreach (string token in tokens)
            {
                string lowerToken = token.ToLower().Trim();
                lowerToken = WikiMarkup.TrimPunctuation(lowerToken);
                if (lowerToken == "" || WikiMarkup.IsStopWord(lowerToken, false))
                {
                    continue;
                }

                if (!_termFrequencies.ContainsKey(lowerToken))
                {
                    _termFrequencies[lowerToken] = 1.0F;
                }
                else
                {
                    _termFrequencies[lowerToken] = _termFrequencies[lowerToken] + 1.0F;
                }
            }
        }
    }
}
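// Usage sketch (the wiki text, title, and ID are made-up values; "db" is an
// assumed WikiDB connection): parsing raw wiki text builds the header,
// sections, and term-frequency table in one pass, and because this text is a
// #redirect line, the constructor resolves the target through the database.
string wikiText = "#redirect [[Jazz]]";
Page redirectStub = new Page("Jass", WikiDB.Namespace.Main, 99, wikiText, db, true);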
/// <summary>
/// Dumps a range of pages to a file
/// </summary>
/// <param name="startID">Where to start dumping</param>
/// <param name="numPages">Total number of pages to dump</param>
/// <param name="blockSize">Number of pages to read at a time</param>
/// <param name="ns">Namespace to read/write from/to</param>
/// <param name="dumpDir">Directory to save dump files in</param>
/// <param name="includeRedirects">Whether or not to dump redirect pages</param>
/// <returns>Last title dumped</returns>
public string DumpToLemur(int startID, int numPages, int blockSize, WikiDB.Namespace ns, string dumpDir, bool includeRedirects)
{
    if (_mainDB == null)
    {
        throw new Exception("Main DB not connected. Use the right constructor.");
    }

    if (!_mainDB.Connected)
    {
        throw new Exception("Not connected to main DB");
    }

    if (!Connected)
    {
        throw new Exception("Not connected to mirror DB");
    }

    if (dumpDir[dumpDir.Length - 1] != '\\')
    {
        dumpDir += @"\";
    }

    _stopWriting = false;

    string dumpFile = dumpDir + "dump_" + DateTime.Now.Ticks + ".xml";
    StreamWriter dumpWriter = new StreamWriter(dumpFile);
    dumpWriter.AutoFlush = true;

    // bytes in a MB
    int mb = 1024 * 1024;

    // file size limit in MB
    float mbLimit = 200;

    int end = startID + numPages - 1;
    int i;
    for (i = startID; i <= end; i += blockSize)
    {
        if (_stopWriting)
        {
            break;
        }

        // check file size, start another one if needed
        FileInfo fi = new FileInfo(dumpFile);
        float sizeMB = fi.Length / (float)mb;
        if (sizeMB > mbLimit)
        {
            dumpWriter.Close();
            dumpFile = dumpDir + "dump_" + DateTime.Now.Ticks + ".xml";
            dumpWriter = new StreamWriter(dumpFile);

            // keep flushing so the on-disk size check above stays accurate
            dumpWriter.AutoFlush = true;
        }

        // get titles
        List<string> titles = null;
        try
        {
            titles = GetTitleRange(ns, i, blockSize, includeRedirects);
        }
        catch (Exception ex)
        {
            dumpWriter.Close();
            throw new Exception("Failed at start ID " + i + ". Error: " + ex);
        }

        // process titles
        foreach (string title in titles)
        {
            if (_stopWriting)
            {
                break;
            }

            try
            {
                // get page from mirror and dump to file
                Page p = LookupPage(ns, title, false, false, false, false);
                if (p == null)
                {
                    StreamWriter logWriter = new StreamWriter("lemur_dump_log.txt", true);
                    logWriter.WriteLine("Dump to Lemur: could not find page to write: " + title);
                    logWriter.Close();
                }
                else
                {
                    dumpWriter.WriteLine(p.LemurDump);

                    // only record progress for pages that were actually written;
                    // touching p here when the lookup failed would throw
                    _lastTitleWritten = p.Title;
                    _lastIDWritten = p.ID;
                }
            }
            catch (Exception ex)
            {
                dumpWriter.Close();
                throw new Exception("Failed at start ID " + i + ", title \"" + title + "\". Error: " + ex);
            }
        }
    }

    dumpWriter.Close();

    return _lastTitleWritten;
}
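// Usage sketch (paths and counts are examples; "mirror" is a hypothetical
// instance): dump 50000 pages in blocks of 1000 to timestamped XML files in
// C:\dumps, rolling over to a new file past the 200 MB limit.
string lastDumped = mirror.DumpToLemur(1, 50000, 1000, WikiDB.Namespace.Main, @"C:\dumps", true);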
/// <summary>
/// Looks up page in database
/// </summary>
/// <param name="ns">Namespace to look page up in</param>
/// <param name="title">Title of page to look up</param>
/// <param name="followRedirection">Whether or not to look up destination of redirect pages</param>
/// <param name="recoverSections">Whether or not to recover the section structure of the page</param>
/// <param name="readLinks">Read link information</param>
/// <param name="readTFTable">Read term frequency table</param>
/// <returns>Page instance</returns>
public Page LookupPage(WikiDB.Namespace ns, string title, bool followRedirection, bool recoverSections, bool readLinks, bool readTFTable)
{
    string url = GetURLFromPageTitle(title);

    // check page cache
    if (_pageCache.ContainsKey(url))
    {
        // the cached copy is dirty if it is missing anything this call needs
        Page cached = (Page)_pageCache[url];
        bool dirty = (recoverSections && !cached.SectionsRecovered) ||
                     (readLinks && cached.WikiLinks.Count == 0) ||
                     (readTFTable && cached.TermFrequencies.Count == 0);

        if (!dirty)
        {
            return cached;
        }
        else
        {
            _pageCache.Remove(url);
        }
    }

    if (!CheckConnection(true))
    {
        throw new Exception("Could not establish connection with Wikipedia database");
    }

    Page p = null;
    url = MySQLEscape(url);
    int nsVal = NamespaceValue(ns);

    // get text and redirect page
    string selectCols = "page_id, page_text, redirects_to";
    if (recoverSections)
    {
        selectCols += ", section_layout";
    }

    string query = "SELECT " + selectCols + " FROM page " +
                   "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

    IDataReader reader = SubmitQuery(query);
    if (reader.Read())
    {
        int id = int.Parse(reader["page_id"].ToString());
        string text = Encoding.UTF8.GetString((byte[])reader["page_text"]);
        string redirectsTo = reader["redirects_to"].ToString();

        // split into lines
        List<string> lines = new List<string>(text.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries));

        if (recoverSections)
        {
            string sectionLayout = reader["section_layout"].ToString();
            p = new Page(title, ns, id, lines, sectionLayout);
        }
        else
        {
            p = new Page(title, ns, id);
            p.Lines = lines;

            // add a single section to the page
            Section s = new Section("full page section", lines, null, p, 0, lines.Count - 1);
            p.Sections.Add(s);
        }

        reader.Close();

        // check for page redirection
        Page redirectPage = null;
        if (redirectsTo != "" && followRedirection)
        {
            redirectPage = LookupPage(ns, redirectsTo, followRedirection, recoverSections, readLinks, readTFTable);
        }

        p.RedirectsTo = redirectPage;
    }
    else
    {
        reader.Close();
        return null;
    }

    // get links
    if (readLinks)
    {
        query = "SELECT link_list FROM pagelinks " +
                "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

        reader = SubmitQuery(query);
        if (reader.Read())
        {
            string linkList = Encoding.UTF8.GetString((byte[])reader["link_list"]);
            string[] splitList = linkList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string link in splitList)
            {
                string[] entry = link.Split(' ');
                if (entry.Length != 2)
                {
                    throw new Exception("Invalid link entry");
                }

                string destPage = entry[0];

                // when sections weren't recovered, every link attaches to the
                // single full-page section (ID "1")
                string sourceSectionID = recoverSections ? entry[1] : "1";
                Section s = p.GetSection(sourceSectionID);
                WikiLink wl = new WikiLink("[[" + destPage + "]]", destPage, "", destPage, s);
                s.AddLink(wl);
            }
        }

        reader.Close();
    }

    // get TFs
    if (readTFTable)
    {
        query = "SELECT freq_list FROM termfreqs WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

        reader = SubmitQuery(query);
        if (reader.Read())
        {
            string freqList = Encoding.UTF8.GetString((byte[])reader["freq_list"]);
            string[] splitList = freqList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

            // get freqs
            for (int i = 0; i < splitList.Length; ++i)
            {
                string[] entry = splitList[i].Split(' ');
                if (entry.Length != 2)
                {
                    throw new Exception("Invalid frequency entry");
                }

                string word = entry[0];
                float freq = float.Parse(entry[1]);
                if (p.TermFrequencies.ContainsKey(word))
                {
                    throw new Exception("Duplicate TF entry");
                }

                p.TermFrequencies[word] = freq;
            }
        }

        reader.Close();
    }

    // add page to cache
    CachePage(p);

    return p;
}
/// <summary>
/// Looks up a page in the database (recovers sections, links, and term frequencies)
/// </summary>
/// <param name="ns">Namespace to look page up in</param>
/// <param name="title">Title of page to look up</param>
/// <param name="followRedirection">Whether or not to look up destination of redirect pages</param>
/// <returns>Page instance</returns>
public override Page LookupPage(WikiDB.Namespace ns, string title, bool followRedirection)
{
    return LookupPage(ns, title, followRedirection, true, true, true);
}
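// Usage sketch ("mirror" is a hypothetical instance): the override is the
// convenient full recovery path, loading sections, links, and term frequencies.
Page full = mirror.LookupPage(WikiDB.Namespace.Main, "Jazz", true);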