Ejemplo n.º 1
0
        /// <summary>
        /// Looks up a page in the database, returning a cached copy when one exists that
        /// already carries all of the requested information.
        /// </summary>
        /// <param name="ns">Namespace to look page up in</param>
        /// <param name="title">Title of page to look up</param>
        /// <param name="followRedirection">Whether or not to look up destination of redirect pages</param>
        /// <param name="recoverSections">Whether or not to recover the section structure of the page</param>
        /// <param name="readLinks">Read link information</param>
        /// <param name="readTFTable">Read term frequency table</param>
        /// <returns>Page instance, or null if no matching row exists in the page table</returns>
        /// <exception cref="Exception">
        /// Thrown when the database connection cannot be established, or when a stored link or
        /// term-frequency entry is malformed or duplicated.
        /// </exception>
        public Page LookupPage(WikiDB.Namespace ns, string title, bool followRedirection,
                               bool recoverSections, bool readLinks, bool readTFTable)
        {
            string url = GetURLFromPageTitle(title);

            // check page cache
            // NOTE(review): the cache is keyed on the URL only, so the namespace and
            // followRedirection arguments do not take part in the cache hit - confirm intended.
            if (_pageCache.ContainsKey(url))
            {
                // a cached page is "dirty" when it lacks information requested by this call
                // NOTE(review): a page that genuinely has no links / no term frequencies is
                // always treated as dirty and re-read - confirm this is acceptable.
                Page cached = (Page)_pageCache[url];
                bool dirty  = (recoverSections && !cached.SectionsRecovered) ||
                              (readLinks && cached.WikiLinks.Count == 0) ||
                              (readTFTable && cached.TermFrequencies.Count == 0);
                if (!dirty)
                {
                    return cached;
                }

                _pageCache.Remove(url);
            }

            if (!CheckConnection(true))
            {
                throw new Exception("Could not establish connection with Wikipedia database");
            }

            // the escaped URL is embedded directly in the SQL text of every query below
            url = MySQLEscape(url);
            int nsVal = NamespaceValue(ns);

            // get text and redirect page
            string selectCols = "page_id, page_text, redirects_to";

            if (recoverSections)
            {
                selectCols += ", section_layout";
            }

            string query = "SELECT " + selectCols + " FROM page " +
                           "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";

            Page   p;
            string redirectsTo;

            // The reader is closed in a finally block so that a failure while decoding the row
            // cannot leave it open; it must also be closed before the recursive redirect
            // lookup below issues further queries.
            IDataReader reader = SubmitQuery(query);
            try
            {
                if (!reader.Read())
                {
                    // no such page
                    return null;
                }

                int    id   = int.Parse(reader["page_id"].ToString());
                string text = Encoding.UTF8.GetString((byte[])reader["page_text"]);
                redirectsTo = reader["redirects_to"].ToString();

                // split into lines
                List <string> lines = new List <string>(text.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries));
                if (recoverSections)
                {
                    string sectionLayout = reader["section_layout"].ToString();
                    p = new Page(title, ns, id, lines, sectionLayout);
                }
                else
                {
                    p       = new Page(title, ns, id);
                    p.Lines = lines;

                    // add a single section spanning every line of the page
                    Section s = new Section("full page section", lines, null, p, 0, lines.Count - 1);
                    p.Sections.Add(s);
                }
            }
            finally
            {
                reader.Close();
            }

            // check for page redirection
            // NOTE(review): the page is only cached at the end of this method, so a redirect
            // cycle (A -> B -> A) would presumably recurse without bound - confirm the data
            // cannot contain cycles.
            Page redirectPage = null;
            if (redirectsTo != "" && followRedirection)
            {
                redirectPage = LookupPage(ns, redirectsTo, followRedirection, recoverSections, readLinks, readTFTable);
            }

            p.RedirectsTo = redirectPage;

            // get links
            if (readLinks)
            {
                query = "SELECT link_list FROM pagelinks " +
                        "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";
                reader = SubmitQuery(query);
                try
                {
                    if (reader.Read())
                    {
                        // one "<destination> <source section ID>" entry per line
                        string   linkList  = Encoding.UTF8.GetString((byte[])reader["link_list"]);
                        string[] splitList = linkList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

                        foreach (string link in splitList)
                        {
                            string[] entry = link.Split(' ');
                            if (entry.Length != 2)
                            {
                                throw new Exception("Invalid link entry");
                            }

                            string destPage = entry[0];

                            // without recovered sections every link is attached to section "1"
                            // (presumably the single full-page section added above - confirm
                            // against Page.GetSection's ID scheme)
                            string   sourceSectionID = recoverSections ? entry[1] : "1";
                            Section  s  = p.GetSection(sourceSectionID);
                            WikiLink wl = new WikiLink("[[" + destPage + "]]", destPage, "", destPage, s);
                            s.AddLink(wl);
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }
            }

            // get TFs
            if (readTFTable)
            {
                query  = "SELECT freq_list FROM termfreqs WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";
                reader = SubmitQuery(query);
                try
                {
                    if (reader.Read())
                    {
                        // one "<word> <frequency>" entry per line
                        string   freqList  = Encoding.UTF8.GetString((byte[])reader["freq_list"]);
                        string[] splitList = freqList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

                        // get freqs
                        foreach (string line in splitList)
                        {
                            string[] entry = line.Split(' ');
                            if (entry.Length != 2)
                            {
                                throw new Exception("Invalid frequency entry");
                            }

                            string word = entry[0];

                            // NOTE(review): parsed with the current culture; if the table was
                            // written culture-invariantly this should pass
                            // CultureInfo.InvariantCulture - confirm against the writer.
                            float freq = float.Parse(entry[1]);

                            if (p.TermFrequencies.ContainsKey(word))
                            {
                                throw new Exception("Duplicate TF entry");
                            }

                            p.TermFrequencies[word] = freq;
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }
            }

            // add page to cache
            CachePage(p);

            return p;
        }