예제 #1
0
        /// <summary>
        /// Builds one <see cref="Section"/> for every "section" element found in a layout string.
        /// </summary>
        /// <remarks>
        /// The "start" and "end" attributes of each element are parsed as integers and rebased
        /// against the parent's <c>SectionStart</c> (0 when there is no parent) before being used
        /// to slice <paramref name="lines"/>. Both ends of the slice are passed to
        /// <c>Section.ExtractLines</c> as computed here.
        /// NOTE(review): nested sections are presumably handled by the Section constructor, which
        /// receives the element's full XML - confirm against Section.
        /// </remarks>
        /// <param name="lines">Lines to slice section content from</param>
        /// <param name="parent">Parent of the extracted sections, or null for top-level sections</param>
        /// <param name="containingPage">Page that owns the extracted sections</param>
        /// <param name="sectionLayout">Section layout XML, as produced by Section.Layout.</param>
        /// <returns>The extracted sections, in the order their definitions appear in the layout</returns>
        public static List <Section> ExtractSections(List <string> lines, Section parent, Page containingPage, string sectionLayout)
        {
            List <Section> result       = new List <Section>();
            XmlParser      layoutParser = new XmlParser(sectionLayout);

            // OuterXML returns null once no further "section" element is available.
            for (string sectionXml = layoutParser.OuterXML("section");
                 sectionXml != null;
                 sectionXml = layoutParser.OuterXML("section"))
            {
                XmlParser sectionParser = new XmlParser(sectionXml);

                // Offsets in the layout are rebased against the parent's start line.
                int parentOffset = 0;
                if (parent != null)
                {
                    parentOffset = parent.SectionStart;
                }

                int absoluteStart = int.Parse(sectionParser.AttributeValue("section", "start"));
                int absoluteEnd   = int.Parse(sectionParser.AttributeValue("section", "end"));
                int lineCount     = absoluteEnd - absoluteStart + 1;

                int relativeStart = absoluteStart - parentOffset;
                int relativeEnd   = relativeStart + lineCount - 1;

                List <string> sectionLines = Section.ExtractLines(lines, relativeStart, relativeEnd);
                Section       extracted    = new Section(sectionLines, sectionXml, parent, containingPage);
                result.Add(extracted);
            }

            return result;
        }
예제 #2
0
        /// <summary>
        /// Constructor. Builds a page from raw wiki text: strips irrelevant markup, splits the
        /// text into lines, collects the header and sections, optionally resolves a redirect,
        /// processes the remaining markup, sets line information and counts term frequencies.
        /// </summary>
        /// <param name="title">Title of page</param>
        /// <param name="ns">Namespace of this page</param>
        /// <param name="ID">ID for this page</param>
        /// <param name="wikiText">Wiki text for page</param>
        /// <param name="database">Database containing this page; used to look up the redirect target</param>
        /// <param name="followRedirection">Whether or not to follow redirection</param>
        public Page(string title, WikiDB.Namespace ns, int ID, string wikiText, WikiDB database, bool followRedirection)
            : this(title, ns, ID)
        {
            // remove irrelevant markup
            wikiText = WikiMarkup.RemoveIrrelevantMarkup(wikiText);

            // split page up into lines
            _lines = Section.GetLines(wikiText);

            // everything before the first section start is treated as the page header
            int           firstSectionStart = Section.GetNextSectionStart(_lines, 0);
            List <string> headerLines       = Section.ExtractLines(_lines, 0, firstSectionStart - 1);

            if (headerLines.Count > 0)
            {
                Header h = new Header(headerLines, this);
                _sections.Add(h);
            }

            // get sections
            _sections.AddRange(Section.ExtractSections(_lines, null, this));

            // check for redirect page
            string firstLine = "";

            if (_lines.Count > 0)
            {
                firstLine = _lines[0];
            }
            const string redirect = "#redirect";

            // Ordinal, case-insensitive prefix match: the previous Substring(...).ToLower()
            // comparison depended on the current culture (e.g. "#REDIRECT" lower-cases to
            // "#redırect" under a Turkish culture), so redirects could be missed.
            // StartsWith also copes with lines shorter than the marker.
            if (firstLine.StartsWith(redirect, System.StringComparison.OrdinalIgnoreCase) &&
                WikiLinks.Count == 1 &&
                followRedirection)
            {
                // get redirect page
                string redirectURL = WikiLinks[0].DestPageURL;
                _redirectsTo = database.LookupPage(ns, redirectURL, followRedirection);
            }

            // process markup
            WikiMarkup.ProcessMarkup(_lines);

            // set line information for the page
            SetLineInfo();

            // get TF information
            foreach (Section s in _sections)
            {
                foreach (string line in s.Lines)
                {
                    string[] tokens = line.Split(' ');

                    foreach (string token in tokens)
                    {
                        // ignore case
                        // NOTE(review): ToLower() is culture-sensitive; left unchanged because term
                        // keys may need to match lower-casing done elsewhere - confirm before
                        // switching to ToLowerInvariant().
                        string lowerToken = token.ToLower().Trim();
                        lowerToken = WikiMarkup.TrimPunctuation(lowerToken);

                        // skip empty tokens (e.g. from consecutive spaces) and stop words
                        if (lowerToken == "" ||
                            WikiMarkup.IsStopWord(lowerToken, false))
                        {
                            continue;
                        }

                        if (!_termFrequencies.ContainsKey(lowerToken))
                        {
                            _termFrequencies[lowerToken] = 1.0F;
                        }
                        else
                        {
                            _termFrequencies[lowerToken] = _termFrequencies[lowerToken] + 1;
                        }
                    }
                }
            }
        }