Exemple #1
0
 /// <summary>
 /// Creates a page from lines that have already been split, rebuilding its
 /// section structure from a stored section layout.
 /// </summary>
 /// <param name="title">Title of page</param>
 /// <param name="ns">Namespace of this page</param>
 /// <param name="ID">ID for this page</param>
 /// <param name="lines">Lines contained in this page (the list is stored as-is, not copied)</param>
 /// <param name="sectionLayout">Layout of sections, handed unchanged to Section.ExtractSections</param>
 public Page(string title, WikiDB.Namespace ns, int ID, List <string> lines, string sectionLayout)
     : this(title, ns, ID)
 {
     // keep a reference to the caller's list
     this._lines = lines;

     // rebuild the sections from the stored layout, then flag them as recovered
     this._sections          = Section.ExtractSections(lines, null, this, sectionLayout);
     this._sectionsRecovered = true;
 }
Exemple #2
0
 /// <summary>
 /// Creates a page that only carries its identifying information
 /// (title, namespace and ID); everything else is left as set up by the
 /// parameterless constructor.
 /// </summary>
 /// <param name="title">Title of page</param>
 /// <param name="ns">Namespace of this page</param>
 /// <param name="ID">ID for this page</param>
 public Page(string title, WikiDB.Namespace ns, int ID)
     : this()
 {
     this._ID        = ID;
     this._namespace = ns;
     this._title     = title;
 }
Exemple #3
0
        /// <summary>
        /// Get titles containing a word, as recorded in the titleindex table
        /// </summary>
        /// <param name="ns">Namespace to search</param>
        /// <param name="s">Word to look for (escaped with MySQLEscape before it is used in the query)</param>
        /// <returns>
        /// List of page titles (strings) containing word; empty if the escaped
        /// word is the empty string or no row matches
        /// </returns>
        /// <exception cref="Exception">Thrown when the mirror DB is not connected</exception>
        public override List <string> GetTitlesContaining(WikiDB.Namespace ns, string s)
        {
            if (!Connected)
            {
                throw new Exception("Not connected to mirror DB");
            }

            s = MySQLEscape(s);
            List <string> titles = new List <string>();

            // nothing to search for
            if (s == "")
            {
                return(titles);
            }

            int nsVal = NamespaceValue(ns);

            // the using block guarantees the reader is released even if reading a row throws
            using (IDataReader reader = SubmitQuery("SELECT page_title FROM titleindex " +
                                                    "WHERE word=\"" + s + "\" AND page_namespace=" + nsVal))
            {
                while (reader.Read())
                {
                    titles.Add(reader["page_title"].ToString());
                }
            }

            return(titles);
        }
Exemple #4
0
 /// <summary>
 /// Creates a link that originates in the given section and points at a
 /// page in the Main namespace.
 /// </summary>
 /// <param name="rawLink">Raw text for link</param>
 /// <param name="pageTitle">Title of page being linked to</param>
 /// <param name="pageSection">Section of page being linked to</param>
 /// <param name="displayText">
 /// Display text for link; when empty, "title" or "title#section" is used instead
 /// </param>
 /// <param name="sourceSection">Section of source page that contains this link</param>
 public WikiLink(string rawLink, string pageTitle, string pageSection, string displayText, Section sourceSection)
 {
     _rawLink         = rawLink;
     _destPageTitle   = pageTitle;
     _destPageSection = pageSection;

     // fall back to "title" or "title#section" when no display text was supplied
     string shownText = displayText;
     if (shownText == "")
     {
         string sectionSuffix = "";
         if (pageSection != "")
         {
             sectionSuffix = "#" + pageSection;
         }

         shownText = pageTitle + sectionSuffix;
     }

     _displayText = WikiMarkup.ProcessMarkup(shownText);

     _destPageNamespace = WikiDB.Namespace.Main;
     _sourcePage        = sourceSection.ContainingPage;
     _sourceSection     = sourceSection;
     _weight            = 0;
 }
Exemple #5
0
        /// <summary>
        /// Write page index table, reading titles from the main DB in blocks
        /// and handing each block to WritePageIndex(ns, titles)
        /// </summary>
        /// <param name="startID">Where to start writing</param>
        /// <param name="numPages">Total number of pages to write</param>
        /// <param name="blockSize">Number of pages to read at a time (must be positive when there are pages to write)</param>
        /// <param name="ns">Namespace to read/write from/to</param>
        /// <param name="includeRedirects">Whether or not to include redirect pages</param>
        /// <returns>Last title written</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown when numPages is positive but blockSize is not (the loop could never advance)
        /// </exception>
        /// <exception cref="Exception">
        /// Thrown when either database is not connected, or when reading/writing
        /// a block fails (the original error is attached as the inner exception)
        /// </exception>
        public string WritePageIndex(int startID, int numPages, int blockSize, WikiDB.Namespace ns, bool includeRedirects)
        {
            if (_mainDB == null)
            {
                throw new Exception("Main DB not connected.  Use the right constructor.");
            }

            if (!_mainDB.CheckConnection(true))
            {
                throw new Exception("Not connected to main DB");
            }

            if (!CheckConnection(true))
            {
                throw new Exception("Not connected to mirror DB");
            }

            // a non-positive block size would never move the loop forward
            if (numPages > 0 && blockSize <= 0)
            {
                throw new ArgumentOutOfRangeException("blockSize", "Block size must be positive");
            }

            _stopWriting = false;

            // last ID (inclusive) covered by this call
            int end = startID + numPages - 1;

            // check block size
            if (blockSize > numPages)
            {
                blockSize = numPages;
            }

            for (int i = startID; i <= end; i += blockSize)
            {
                // _stopWriting is reset above and re-checked before every block
                if (_stopWriting)
                {
                    break;
                }

                // shrink the final block so no more than numPages pages are requested in total
                int count = Math.Min(blockSize, end - i + 1);

                try
                {
                    // get titles and write them
                    List <string> titles = _mainDB.GetTitleRange(ns, i, count, includeRedirects);
                    WritePageIndex(ns, titles);
                }
                catch (Exception ex)
                {
                    // keep the original exception reachable for callers that inspect InnerException
                    throw new Exception("Failed at start ID " + i + ".  Error:  " + ex, ex);
                }
            }

            return(_lastTitleWritten);
        }
Exemple #6
0
        /// <summary>
        /// Gets whether or not a page is a redirect page, i.e. whether its
        /// redirects_to column in the page table is non-empty
        /// </summary>
        /// <param name="ns">Namespace to search</param>
        /// <param name="title">Title to search (converted to its URL form and escaped before querying)</param>
        /// <returns>
        /// True if page is a redirect page, False otherwise (including when the
        /// escaped title is empty or no matching page row exists)
        /// </returns>
        public override bool IsRedirect(WikiDB.Namespace ns, string title)
        {
            title = MySQLEscape(GetURLFromPageTitle(title));
            if (title == "")
            {
                return(false);
            }

            int nsVal = NamespaceValue(ns);

            // the using block releases the reader on every path, including exceptions
            using (IDataReader reader = SubmitQuery("SELECT redirects_to FROM page " +
                                                    "WHERE page_namespace=" + nsVal + " AND page_title=\"" + title + "\""))
            {
                // no row: the page is unknown, so it cannot be a redirect
                if (!reader.Read())
                {
                    return(false);
                }

                string redirectsTo = reader["redirects_to"].ToString();
                return(redirectsTo != "");
            }
        }
Exemple #7
0
        /// <summary>
        /// Creates a page from raw wiki text: strips irrelevant markup, splits
        /// the text into lines, builds the header and sections, optionally
        /// resolves a redirect, and counts term frequencies.
        /// </summary>
        /// <param name="title">Title of page</param>
        /// <param name="ns">Namespace of this page</param>
        /// <param name="ID">ID for this page</param>
        /// <param name="wikiText">Wiki text for page</param>
        /// <param name="database">Database containing this page (used only to look up a redirect target)</param>
        /// <param name="followRedirection">Whether or not to follow redirection</param>
        public Page(string title, WikiDB.Namespace ns, int ID, string wikiText, WikiDB database, bool followRedirection)
            : this(title, ns, ID)
        {
            // remove irrelevant markup
            wikiText = WikiMarkup.RemoveIrrelevantMarkup(wikiText);

            // split page up into lines
            _lines = Section.GetLines(wikiText);

            // lines before the first section start form the page header
            int           firstSectionStart = Section.GetNextSectionStart(_lines, 0);
            List <string> headerLines       = Section.ExtractLines(_lines, 0, firstSectionStart - 1);

            if (headerLines.Count > 0)
            {
                Header h = new Header(headerLines, this);
                _sections.Add(h);
            }

            // get sections
            _sections.AddRange(Section.ExtractSections(_lines, null, this));

            // check for redirect page
            string firstLine = "";

            if (_lines.Count > 0)
            {
                firstLine = _lines[0];
            }
            string redirect = "#redirect";

            // Ordinal, case-insensitive match: the keyword is fixed ASCII text, so the
            // result must not depend on the machine's culture (e.g. Turkish casing of "I").
            if (firstLine.StartsWith(redirect, StringComparison.OrdinalIgnoreCase) &&
                WikiLinks.Count == 1 &&
                followRedirection)
            {
                // get redirect page: the single link on the page is the redirect target
                string redirectURL = WikiLinks[0].DestPageURL;
                _redirectsTo = database.LookupPage(ns, redirectURL, followRedirection);
            }

            // process markup
            WikiMarkup.ProcessMarkup(_lines);

            // set line information for the page
            SetLineInfo();

            // get TF information: count every non-empty, non-stop-word token per page
            foreach (Section s in _sections)
            {
                foreach (string line in s.Lines)
                {
                    string[] tokens = line.Split(' ');

                    foreach (string token in tokens)
                    {
                        // ignore case
                        string lowerToken = token.ToLower().Trim();
                        lowerToken = WikiMarkup.TrimPunctuation(lowerToken);

                        if (lowerToken == "" ||
                            WikiMarkup.IsStopWord(lowerToken, false))
                        {
                            continue;
                        }

                        if (!_termFrequencies.ContainsKey(lowerToken))
                        {
                            _termFrequencies[lowerToken] = 1.0F;
                        }
                        else
                        {
                            _termFrequencies[lowerToken] = _termFrequencies[lowerToken] + 1;
                        }
                    }
                }
            }
        }
Exemple #8
0
        /// <summary>
        /// Dumps a range of pages to one or more XML files in the given
        /// directory. A new file is started once the current one exceeds
        /// 200 MB. Pages that cannot be found are recorded in
        /// lemur_dump_log.txt and skipped.
        /// </summary>
        /// <param name="startID">Where to start dumping</param>
        /// <param name="numPages">Total number of pages to dump</param>
        /// <param name="blockSize">Number of pages to read at a time (must be positive when there are pages to dump)</param>
        /// <param name="ns">Namespace to read/write from/to</param>
        /// <param name="dumpDir">Directory to save dump files in</param>
        /// <param name="includeRedirects">Whether or not to dump redirect pages</param>
        /// <returns>Last title dumped</returns>
        /// <exception cref="ArgumentException">Thrown when dumpDir is null or empty</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown when numPages is positive but blockSize is not (the loop could never advance)
        /// </exception>
        /// <exception cref="Exception">
        /// Thrown when a database is not connected, or when reading or dumping
        /// fails (the original error is attached as the inner exception)
        /// </exception>
        public string DumpToLemur(int startID, int numPages, int blockSize, WikiDB.Namespace ns, string dumpDir, bool includeRedirects)
        {
            if (_mainDB == null)
            {
                throw new Exception("Main DB not connected.  Use the right constructor.");
            }

            if (!_mainDB.Connected)
            {
                throw new Exception("Not connected to main DB");
            }

            if (!Connected)
            {
                throw new Exception("Not connected to mirror DB");
            }

            if (string.IsNullOrEmpty(dumpDir))
            {
                throw new ArgumentException("Dump directory must be specified", "dumpDir");
            }

            // a non-positive block size would never move the loop forward
            if (numPages > 0 && blockSize <= 0)
            {
                throw new ArgumentOutOfRangeException("blockSize", "Block size must be positive");
            }

            // make sure the directory ends with a separator before file names are appended
            if (dumpDir[dumpDir.Length - 1] != '\\')
            {
                dumpDir += @"\";
            }

            _stopWriting = false;

            // bytes in a MB
            const int mb = 1024 * 1024;

            // file size limit in MB
            const float mbLimit = 200;

            // last ID (inclusive) covered by this call
            int end = startID + numPages - 1;

            string       dumpFile   = dumpDir + "dump_" + DateTime.Now.Ticks + ".xml";
            StreamWriter dumpWriter = new StreamWriter(dumpFile);

            try
            {
                // flush on every write so the on-disk size checked below is current
                dumpWriter.AutoFlush = true;

                for (int i = startID; i <= end; i += blockSize)
                {
                    if (_stopWriting)
                    {
                        break;
                    }

                    // check file size, start another one if needed
                    FileInfo fi     = new FileInfo(dumpFile);
                    float    sizeMB = fi.Length / (float)mb;
                    if (sizeMB > mbLimit)
                    {
                        dumpWriter.Close();
                        dumpFile   = dumpDir + "dump_" + DateTime.Now.Ticks + ".xml";
                        dumpWriter = new StreamWriter(dumpFile);

                        // the replacement writer needs the same flushing behavior as the first one
                        dumpWriter.AutoFlush = true;
                    }

                    // shrink the final block so no more than numPages pages are requested in total
                    int count = Math.Min(blockSize, end - i + 1);

                    // get titles
                    List <string> titles;
                    try
                    {
                        titles = GetTitleRange(ns, i, count, includeRedirects);
                    }
                    catch (Exception ex)
                    {
                        throw new Exception("Failed at start ID " + i + ".  Error:  " + ex, ex);
                    }

                    // process titles
                    foreach (string title in titles)
                    {
                        if (_stopWriting)
                        {
                            break;
                        }

                        try
                        {
                            // get page from mirror and dump to file
                            Page p = LookupPage(ns, title, false, false, false, false);
                            if (p == null)
                            {
                                // record the missing page and move on; there is nothing to dump
                                using (StreamWriter logWriter = new StreamWriter("lemur_dump_log.txt", true))
                                {
                                    logWriter.WriteLine("Dump to Lemur:  could not find page to write:  " + title);
                                }

                                continue;
                            }

                            dumpWriter.WriteLine(p.LemurDump);

                            // progress markers only advance for pages that were actually written
                            _lastTitleWritten = p.Title;
                            _lastIDWritten    = p.ID;
                        }
                        catch (Exception ex)
                        {
                            throw new Exception("Failed at start ID " + i + ", title \"" + title + "\".  Error: " + ex, ex);
                        }
                    }
                }
            }
            finally
            {
                // runs on success, on stop, and on every failure path
                dumpWriter.Close();
            }

            return(_lastTitleWritten);
        }
Exemple #9
0
        /// <summary>
        /// Looks up page in database, using the page cache when the cached
        /// copy already holds everything that was asked for
        /// </summary>
        /// <param name="ns">Namespace to look page up in</param>
        /// <param name="title">Title of page to look up</param>
        /// <param name="followRedirection">Whether or not to look up destination of redirect pages</param>
        /// <param name="recoverSections">Whether or not to recover the section structure of the page</param>
        /// <param name="readLinks">Read link information</param>
        /// <param name="readTFTable">Read term frequency table</param>
        /// <returns>Page instance, or null if no matching row exists in the page table</returns>
        /// <exception cref="Exception">
        /// Thrown when the database connection cannot be established, or when a
        /// stored link or term-frequency entry is malformed or duplicated
        /// </exception>
        public Page LookupPage(WikiDB.Namespace ns, string title, bool followRedirection,
                               bool recoverSections, bool readLinks, bool readTFTable)
        {
            string url = GetURLFromPageTitle(title);

            // check page cache
            // NOTE(review): the cache is keyed by URL only; the namespace is not part of the key here - confirm that is intended
            if (_pageCache.ContainsKey(url))
            {
                // see if page is dirty, i.e. lacks something this call asks for
                // (a cached page with no links / no term frequencies at all also counts as dirty)
                Page cached = (Page)_pageCache[url];
                bool dirty  = (recoverSections && !cached.SectionsRecovered) ||
                              (readLinks && cached.WikiLinks.Count == 0) ||
                              (readTFTable && cached.TermFrequencies.Count == 0);
                if (!dirty)
                {
                    return(cached);
                }

                _pageCache.Remove(url);
            }

            if (!CheckConnection(true))
            {
                throw new Exception("Could not establish connection with Wikipedia database");
            }

            url = MySQLEscape(url);
            int nsVal = NamespaceValue(ns);

            // get text and redirect page
            string selectCols = "page_id, page_text, redirects_to";

            if (recoverSections)
            {
                selectCols += ", section_layout";
            }

            string query = "SELECT " + selectCols + " FROM page " +
                           "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

            Page   p           = null;
            string redirectsTo = "";

            // each reader lives in its own using block so it is released even when parsing throws
            using (IDataReader reader = SubmitQuery(query))
            {
                if (!reader.Read())
                {
                    return(null);
                }

                int    id   = int.Parse(reader["page_id"].ToString());
                string text = Encoding.UTF8.GetString((byte[])reader["page_text"]);
                redirectsTo = reader["redirects_to"].ToString();

                // split into lines
                List <string> lines = new List <string>(text.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries));
                if (recoverSections)
                {
                    string sectionLayout = reader["section_layout"].ToString();
                    p = new Page(title, ns, id, lines, sectionLayout);
                }
                else
                {
                    p       = new Page(title, ns, id);
                    p.Lines = lines;

                    // add a single section to the page
                    Section s = new Section("full page section", lines, null, p, 0, lines.Count - 1);
                    p.Sections.Add(s);
                }
            }

            // check for page redirection (the page reader is already closed here,
            // so the recursive lookup can issue its own queries)
            Page redirectPage = null;
            if (redirectsTo != "" && followRedirection)
            {
                redirectPage = LookupPage(ns, redirectsTo, followRedirection, recoverSections, readLinks, readTFTable);
            }

            p.RedirectsTo = redirectPage;

            // get links
            if (readLinks)
            {
                query = "SELECT link_list FROM pagelinks " +
                        "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

                using (IDataReader reader = SubmitQuery(query))
                {
                    if (reader.Read())
                    {
                        // one link per line: "<destination page> <source section ID>"
                        string   linkList  = Encoding.UTF8.GetString((byte[])reader["link_list"]);
                        string[] splitList = linkList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

                        foreach (string link in splitList)
                        {
                            string[] entry = link.Split(' ');
                            if (entry.Length != 2)
                            {
                                throw new Exception("Invalid link entry");
                            }

                            // without recovered sections every link is attached to section "1"
                            string   destPage        = entry[0];
                            string   sourceSectionID = recoverSections ? entry[1] : "1";
                            Section  s  = p.GetSection(sourceSectionID);
                            WikiLink wl = new WikiLink("[[" + destPage + "]]", destPage, "", destPage, s);
                            s.AddLink(wl);
                        }
                    }
                }
            }

            // get TFs
            if (readTFTable)
            {
                query = "SELECT freq_list FROM termfreqs WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

                using (IDataReader reader = SubmitQuery(query))
                {
                    if (reader.Read())
                    {
                        // one term per line: "<word> <frequency>"
                        string   freqList  = Encoding.UTF8.GetString((byte[])reader["freq_list"]);
                        string[] splitList = freqList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

                        // get freqs
                        for (int i = 0; i < splitList.Length; ++i)
                        {
                            string[] entry = splitList[i].Split(' ');
                            if (entry.Length != 2)
                            {
                                throw new Exception("Invalid frequency entry");
                            }

                            string word = entry[0];

                            // NOTE(review): float.Parse uses the current culture; confirm the code that
                            // writes freq_list formats numbers the same way before changing this
                            float freq = float.Parse(entry[1]);

                            if (p.TermFrequencies.ContainsKey(word))
                            {
                                throw new Exception("Duplicate TF entry");
                            }

                            p.TermFrequencies[word] = freq;
                        }
                    }
                }
            }

            // add page to cache
            CachePage(p);

            return(p);
        }
Exemple #10
0
 /// <summary>
 /// Looks up a page in the database with everything loaded: sections are
 /// recovered and both the link information and the term frequency table
 /// are read.
 /// </summary>
 /// <param name="ns">Namespace to look page up in</param>
 /// <param name="title">Title of page to look up</param>
 /// <param name="followRedirection">Whether or not to look up destination of redirect pages</param>
 /// <returns>Page instance</returns>
 public override Page LookupPage(WikiDB.Namespace ns, string title, bool followRedirection)
 {
     // this overload always asks for the complete page
     const bool recoverSections = true;
     const bool readLinks       = true;
     const bool readTFTable     = true;

     Page page = LookupPage(ns, title, followRedirection, recoverSections, readLinks, readTFTable);
     return(page);
 }