Ejemplo n.º 1
0
        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="title">Title of page</param>
        /// <param name="ns">Namespace of this page</param>
        /// <param name="ID">ID for this page</param>
        /// <param name="wikiText">Wiki text for page</param>
        /// <param name="database">Database containing this page</param>
        /// <param name="followRedirection">Whether or not to follow redirection</param>
        public Page(string title, WikiDB.Namespace ns, int ID, string wikiText, WikiDB database, bool followRedirection)
            : this(title, ns, ID)
        {
            // remove irrelevant markup
            wikiText = WikiMarkup.RemoveIrrelevantMarkup(wikiText);

            // split page up into lines
            _lines = Section.GetLines(wikiText);

            int           firstSectionStart = Section.GetNextSectionStart(_lines, 0);
            List <string> headerLines       = Section.ExtractLines(_lines, 0, firstSectionStart - 1);

            if (headerLines.Count > 0)
            {
                Header h = new Header(headerLines, this);
                _sections.Add(h);
            }

            // get sections
            _sections.AddRange(Section.ExtractSections(_lines, null, this));

            // check for redirect page
            string firstLine = "";

            if (_lines.Count > 0)
            {
                firstLine = _lines[0];
            }
            string redirect = "#redirect";

            if (firstLine.Length >= redirect.Length &&
                firstLine.Substring(0, redirect.Length).ToLower() == redirect &&
                WikiLinks.Count == 1 &&
                followRedirection)
            {
                // get redirect page
                string redirectURL = WikiLinks[0].DestPageURL;
                _redirectsTo = database.LookupPage(ns, redirectURL, followRedirection);
            }

            // process markup
            WikiMarkup.ProcessMarkup(_lines);

            // set line information for the page
            SetLineInfo();

            // get TF information
            foreach (Section s in _sections)
            {
                foreach (string line in s.Lines)
                {
                    string[] tokens = line.Split(' ');

                    foreach (string token in tokens)
                    {
                        // ignore case
                        string lowerToken = token.ToLower().Trim();
                        lowerToken = WikiMarkup.TrimPunctuation(lowerToken);

                        if (lowerToken == "" ||
                            WikiMarkup.IsStopWord(lowerToken, false))
                        {
                            continue;
                        }

                        if (!_termFrequencies.ContainsKey(lowerToken))
                        {
                            _termFrequencies[lowerToken] = 1.0F;
                        }
                        else
                        {
                            _termFrequencies[lowerToken] = _termFrequencies[lowerToken] + 1;
                        }
                    }
                }
            }
        }