Example #1
0
        /// <summary>
        /// Collects every wiki link found in a list of lines.
        /// </summary>
        /// <param name="lines">Lines of wiki text to scan for links</param>
        /// <param name="source">Section passed to each created link as its source section</param>
        /// <returns>
        /// List with one WikiLink per match of the wiki-link pattern, sorted with the
        /// default WikiLink ordering (empty if no line contains a link)
        /// </returns>
        public static List <WikiLink> GetWikiLinks(List <string> lines, Section source)
        {
            List <WikiLink> wikiLinks = new List <WikiLink>();

            // Only the link pattern is needed here; the subsection and emphasis patterns
            // that used to be fetched alongside it were never read.
            Regex wikiLinkRE = WikiMarkup.GetWikiLinkRE();

            foreach (string line in lines)
            {
                foreach (Match linkMatch in wikiLinkRE.Matches(line))
                {
                    // A named group that did not take part in the match has an empty
                    // value, so none of these strings is ever null.
                    string rawLink     = linkMatch.Value;
                    string page        = linkMatch.Groups["LinkDest"].ToString().Trim();
                    string section     = linkMatch.Groups["Section"].ToString().Trim();
                    string displayText = linkMatch.Groups["DisplayText"].ToString().Trim();

                    wikiLinks.Add(new WikiLink(rawLink, page, section, displayText, source));
                }
            }

            wikiLinks.Sort();
            return(wikiLinks);
        }
Example #2
0
        /// <summary>
        /// Locates the last line that belongs to the section starting at a given line.
        /// </summary>
        /// <param name="lines">Lines to search</param>
        /// <param name="start">Index of the section's heading line. This line must match the subsection pattern (e.g., ==Section X==)</param>
        /// <returns>
        /// Index of the last line of the section: the line just before the next heading whose
        /// "SectionStart" group is no longer than this section's, or the final line if there is no such heading
        /// </returns>
        /// <exception cref="Exception">The line at <paramref name="start"/> does not match the subsection pattern</exception>
        public static int GetSectionEnd(List <string> lines, int start)
        {
            Regex headingRE = WikiMarkup.GetSubsectionRE();

            Match startHeading = headingRE.Match(lines[start]);
            if (!startHeading.Success)
            {
                throw new Exception("Invalid starting line for section");
            }

            // the length of the "SectionStart" group serves as the nesting depth of a heading
            int startDepth = startHeading.Groups["SectionStart"].Length;

            int current = start + 1;
            while (current < lines.Count)
            {
                Match heading = headingRE.Match(lines[current]);

                // a heading at the same or a shallower depth closes the section
                if (heading.Success && heading.Groups["SectionStart"].Length <= startDepth)
                {
                    return(current - 1);
                }

                ++current;
            }

            // no closing heading found: the section runs to the last scanned line
            return(current - 1);
        }
Example #3
0
 /// <summary>
 /// Strips markup from this page: first from the page's own lines, then from each of its sections.
 /// </summary>
 public void RemoveMarkup()
 {
     // page-level lines are processed in place
     WikiMarkup.ProcessMarkup(_lines);

     // each section is responsible for cleaning its own content
     foreach (Section section in _sections)
     {
         section.RemoveMarkup();
     }
 }
Example #4
0
        /// <summary>
        /// Writes the titleindex table: for every non-stop word of every title, inserts a
        /// (word, namespace, title) row unless an identical row already exists.
        /// </summary>
        /// <param name="ns">Namespace to read/write from/to</param>
        /// <param name="titles">List of titles to write. Null or blank titles are skipped</param>
        /// <returns>
        /// Last title processed (in its escaped form), or the previous value of the
        /// last-title field if no word was indexed by this call
        /// </returns>
        /// <exception cref="Exception">A query or insert failed; the original error is attached as the inner exception</exception>
        public string WriteTitleIndex(Namespace ns, List <string> titles)
        {
            int nsVal = NamespaceValue(ns);

            foreach (string title in titles)
            {
                if (title == null || title.Trim() == "")
                {
                    continue;
                }

                // words are split from the raw (unescaped) title
                string[] words = title.Split(new char[] { '_', '\\', '/' }, StringSplitOptions.RemoveEmptyEntries);

                // Escape the title exactly once. Escaping it inside the word loop (as was done
                // before) re-escaped the already-escaped text for every additional word.
                string escapedTitle = MySQLEscape(title);

                foreach (string rawWord in words)
                {
                    string word = WikiMarkup.TrimPunctuation(rawWord.Trim().ToLower());

                    if (WikiMarkup.IsStopWord(word, true))
                    {
                        continue;
                    }

                    word = MySQLEscape(word);

                    // NOTE(review): the statements are built by concatenation and rely on MySQLEscape
                    // for safety; parameterized commands would be preferable if the query helpers allow it.
                    try
                    {
                        bool alreadyIndexed;

                        // the reader is released before the insert is issued, and also when Read throws
                        using (IDataReader reader = SubmitQuery("SELECT word FROM titleindex " +
                                                                "WHERE word=\"" + word + "\" AND page_namespace=" + nsVal + " AND page_title=\"" + escapedTitle + "\""))
                        {
                            alreadyIndexed = reader.Read();
                        }

                        if (!alreadyIndexed)
                        {
                            ExecuteNonQuery("INSERT INTO titleindex (word, page_namespace, page_title) " +
                                            "VALUES (\"" + word + "\", " + nsVal + ", \"" + escapedTitle + "\")");
                        }

                        // NOTE(review): as before, the escaped title is what gets remembered and returned;
                        // confirm callers do not expect the raw title.
                        _lastTitleWritten = escapedTitle;
                    }
                    catch (Exception ex)
                    {
                        // keep the original message format, but preserve the cause for callers and logs
                        throw new Exception("Failed at title " + escapedTitle + ", error:  " + ex, ex);
                    }
                }
            }

            return(_lastTitleWritten);
        }
Example #5
0
 /// <summary>
 /// Creates a link to a page of the main namespace, with an initial weight of zero.
 /// </summary>
 /// <param name="rawLink">Raw text for link</param>
 /// <param name="pageTitle">Title of page being linked to</param>
 /// <param name="pageSection">Section of page being linked to ("" if the link names no section)</param>
 /// <param name="displayText">Display text for link. If "", the text defaults to the page title, followed by "#" and the section when a section is given</param>
 /// <param name="sourceSection">Section of source page that contains this link; its containing page becomes the link's source page</param>
 public WikiLink(string rawLink, string pageTitle, string pageSection, string displayText, Section sourceSection)
 {
     _rawLink           = rawLink;
     _destPageTitle     = pageTitle;
     _destPageSection   = pageSection;
     _destPageNamespace = WikiDB.Namespace.Main;
     _weight            = 0;

     // fall back to "Title" or "Title#Section" when the link carries no display text
     string text = displayText;
     if (text == "")
     {
         string sectionSuffix = "";
         if (pageSection != "")
         {
             sectionSuffix = "#" + pageSection;
         }

         text = pageTitle + sectionSuffix;
     }

     // the display text is stored with its markup already processed
     _displayText = WikiMarkup.ProcessMarkup(text);

     _sourcePage    = sourceSection.ContainingPage;
     _sourceSection = sourceSection;
 }
Example #6
0
        /// <summary>
        /// Finds the first line at or after a given position that matches the subsection pattern.
        /// </summary>
        /// <param name="lines">List of lines to search</param>
        /// <param name="start">Index at which the search begins (this line itself is tested)</param>
        /// <returns>
        /// Index of the first matching line. If no line matches, the index at which the scan
        /// stopped: the number of lines, or <paramref name="start"/> itself when it is already past the end
        /// </returns>
        public static int GetNextSectionStart(List <string> lines, int start)
        {
            Regex headingRE = WikiMarkup.GetSubsectionRE();

            int index = start;
            for (; index < lines.Count; ++index)
            {
                if (headingRE.IsMatch(lines[index]))
                {
                    break;
                }
            }

            return(index);
        }
Example #7
0
        /// <summary>
        /// Constructor for marked up section. Stores the section's lines, recursively builds its
        /// subsections, collects the links that appear before the first subsection, and finally
        /// processes the markup of the stored lines.
        /// </summary>
        /// <param name="name">Name of section</param>
        /// <param name="wikiTextLines">Lines of the section; the first line may or may not be a section heading</param>
        /// <param name="parentSection">Parent section</param>
        /// <param name="containingPage">Page that contains this section</param>
        /// <exception cref="Exception"><paramref name="wikiTextLines"/> is null or empty</exception>
        public Section(string name, List <string> wikiTextLines, Section parentSection, Page containingPage)
            : this()
        {
            if (wikiTextLines == null || wikiTextLines.Count == 0)
            {
                throw new Exception("Cannot create Section with blank text");
            }

            _lines          = wikiTextLines;
            _parentSection  = parentSection;
            _name           = name;
            _containingPage = containingPage;

            // Skip the first line when it is this section's own heading; keep it when it is
            // not a section line (the case for headers).
            Regex sectionRE     = WikiMarkup.GetSubsectionRE();
            int   secLinesStart = sectionRE.Match(_lines[0]).Success ? 1 : 0;

            // The third argument of ExtractLines is used as the index of the last line to extract
            // (see the pre-subsection extraction below), so the body ends at Count - 1. The former
            // "Count - secLinesStart" was one past the last line whenever the first line was kept.
            List <string>  subsectionLines = ExtractLines(_lines, secLinesStart, _lines.Count - 1);
            List <Section> subsections     = ExtractSections(subsectionLines, this, _containingPage);

            _subSections.AddRange(subsections);

            // get links prior to first subsection (the section's first line is included in the scan)
            int           subStart           = GetNextSectionStart(_lines, secLinesStart);
            List <string> preSubsectionLines = ExtractLines(_lines, 0, subStart - 1);

            _wikiLinks = WikiLink.GetWikiLinks(preSubsectionLines, this);

            // remove markup only after links and subsections have been read from the raw text
            WikiMarkup.ProcessMarkup(_lines);
        }
Example #8
0
        /// <summary>
        /// Builds a Section for every top-level section found in a list of lines. Each Section
        /// constructor in turn extracts its own subsections, so the whole hierarchy is created.
        /// </summary>
        /// <param name="lines">Lines to extract from; lines before the first section heading are ignored</param>
        /// <param name="parent">Parent section for top-level sections</param>
        /// <param name="containingPage">Page that contains these sections</param>
        /// <returns>List of Section instances in order of appearance (empty if lines is null, empty, or has no heading)</returns>
        /// <exception cref="Exception">A line reported as a section start does not match the subsection pattern</exception>
        public static List <Section> ExtractSections(List <string> lines, Section parent, Page containingPage)
        {
            List <Section> result = new List <Section>();

            if (lines == null || lines.Count == 0)
            {
                return(result);
            }

            Regex headingRE = WikiMarkup.GetSubsectionRE();

            // walk from heading to heading; each iteration consumes one complete section
            int headingLine = GetNextSectionStart(lines, 0);
            while (headingLine < lines.Count)
            {
                Match heading = headingRE.Match(lines[headingLine]);
                if (!heading.Success)
                {
                    throw new Exception("Invalid starting line for section");
                }

                // lines from the heading through the last line of the section
                int           lastLine     = GetSectionEnd(lines, headingLine);
                List <string> sectionLines = ExtractLines(lines, headingLine, lastLine);

                // the section name is the heading's "Name" group with markup processed
                string sectionName = WikiMarkup.ProcessMarkup(heading.Groups["Name"].ToString());

                // constructing the section recursively extracts its subsections
                result.Add(new Section(sectionName, sectionLines, parent, containingPage));

                // continue after the section that was just consumed
                headingLine = GetNextSectionStart(lines, lastLine + 1);
            }

            return(result);
        }
Example #9
0
        /// <summary>
        /// Constructor. Parses the wiki text into a header and sections, optionally resolves a
        /// redirect, processes the markup of the page lines, and counts term frequencies.
        /// </summary>
        /// <param name="title">Title of page</param>
        /// <param name="ns">Namespace of this page</param>
        /// <param name="ID">ID for this page</param>
        /// <param name="wikiText">Wiki text for page</param>
        /// <param name="database">Database containing this page; used to look up the redirect target</param>
        /// <param name="followRedirection">Whether or not to follow redirection</param>
        public Page(string title, WikiDB.Namespace ns, int ID, string wikiText, WikiDB database, bool followRedirection)
            : this(title, ns, ID)
        {
            // remove irrelevant markup
            wikiText = WikiMarkup.RemoveIrrelevantMarkup(wikiText);

            // split page up into lines
            _lines = Section.GetLines(wikiText);

            // everything before the first section heading forms the page header
            int           firstSectionStart = Section.GetNextSectionStart(_lines, 0);
            List <string> headerLines       = Section.ExtractLines(_lines, 0, firstSectionStart - 1);

            if (headerLines.Count > 0)
            {
                Header h = new Header(headerLines, this);
                _sections.Add(h);
            }

            // get sections
            _sections.AddRange(Section.ExtractSections(_lines, null, this));

            // check for redirect page: the first line starts with "#redirect" and the page has exactly one link
            string firstLine = "";

            if (_lines.Count > 0)
            {
                firstLine = _lines[0];
            }
            string redirect = "#redirect";

            // Ordinal, case-insensitive prefix test: the keyword is markup, not natural language, so
            // the match must not depend on the current culture (ToLower() maps 'I' differently under
            // Turkish culture, for example, which made "#REDIRECT" go unrecognized).
            if (firstLine.StartsWith(redirect, StringComparison.OrdinalIgnoreCase) &&
                WikiLinks.Count == 1 &&
                followRedirection)
            {
                // get redirect page
                string redirectURL = WikiLinks[0].DestPageURL;
                _redirectsTo = database.LookupPage(ns, redirectURL, followRedirection);
            }

            // process markup
            WikiMarkup.ProcessMarkup(_lines);

            // set line information for the page
            SetLineInfo();

            // get TF information: count every non-stop word of every section line
            foreach (Section s in _sections)
            {
                foreach (string line in s.Lines)
                {
                    string[] tokens = line.Split(' ');

                    foreach (string token in tokens)
                    {
                        // ignore case
                        string lowerToken = token.ToLower().Trim();
                        lowerToken = WikiMarkup.TrimPunctuation(lowerToken);

                        if (lowerToken == "" ||
                            WikiMarkup.IsStopWord(lowerToken, false))
                        {
                            continue;
                        }

                        if (!_termFrequencies.ContainsKey(lowerToken))
                        {
                            _termFrequencies[lowerToken] = 1.0F;
                        }
                        else
                        {
                            _termFrequencies[lowerToken] = _termFrequencies[lowerToken] + 1;
                        }
                    }
                }
            }
        }