Ejemplo n.º 1
0
        /// <summary>
        /// Scans the given lines for wiki links and returns one WikiLink per match, sorted.
        /// </summary>
        /// <param name="lines">Lines of wiki text to scan for links</param>
        /// <param name="source">Section handed to every created WikiLink as its source</param>
        /// <returns>Sorted list of the links found in the lines (empty when nothing matches)</returns>
        public static List <WikiLink> GetWikiLinks(List <string> lines, Section source)
        {
            List <WikiLink> wikiLinks = new List <WikiLink>();

            // only the link pattern is needed here; fetch it once for all lines
            Regex wikiLinkRE = WikiMarkup.GetWikiLinkRE();

            foreach (string line in lines)
            {
                foreach (Match linkMatch in wikiLinkRE.Matches(line))
                {
                    // pull the named parts of the link; a group that did not take part in the
                    // match has an empty Value, so the trimmed result is "" in that case
                    string page        = linkMatch.Groups["LinkDest"].Value.Trim();
                    string section     = linkMatch.Groups["Section"].Value.Trim();
                    string displayText = linkMatch.Groups["DisplayText"].Value.Trim();

                    // the full matched text is kept as the raw link
                    wikiLinks.Add(new WikiLink(linkMatch.Value, page, section, displayText, source));
                }
            }

            // default List.Sort ordering for WikiLink elements
            wikiLinks.Sort();
            return(wikiLinks);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Orders this WikiLink relative to another object: by weight first, then by
        /// destination page URL when the weights compare equal.
        /// </summary>
        /// <param name="obj">Object to compare this WikiLink to</param>
        /// <returns>
        /// 1 when obj is not a WikiLink; otherwise the weight comparison result, or the
        /// destination URL comparison result when the weights are equal
        /// </returns>
        public int CompareTo(object obj)
        {
            bool isLink = obj is WikiLink;

            if (!isLink)
            {
                // anything that is not a WikiLink (null included) sorts before this link
                return 1;
            }

            WikiLink other = (WikiLink)obj;

            // both keys are evaluated up front; the URL only decides ties on weight
            int byWeight = _weight.CompareTo(other.Weight);
            int byUrl    = DestPageURL.CompareTo(other.DestPageURL);

            if (byWeight == 0)
            {
                return byUrl;
            }

            return byWeight;
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Sorts the list in place and drops every link that compares equal (via ==) to the
 /// link directly before it, keeping the first of each run.
 /// </summary>
 /// <param name="links">List to sort and de-duplicate; modified in place</param>
 public static void RemoveDuplicateLinks(List <WikiLink> links)
 {
     // sorting is what brings candidate duplicates next to each other
     links.Sort();

     int keep = 0;

     while (keep < links.Count)
     {
         WikiLink kept = links[keep];
         int      next = keep + 1;

         // Removing at 'next' shifts the following element into that slot, so the index
         // stays put; the scan ends at the first element that differs from 'kept'.
         // NOTE(review): '==' is reference equality unless WikiLink overloads it - confirm
         // that is the intended notion of "duplicate" here.
         while (next < links.Count && kept == links[next])
         {
             links.RemoveAt(next);
         }

         ++keep;
     }
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds a section from marked-up wiki text: extracts its subsections, collects
        /// the links that appear before the first subsection, then processes the markup
        /// of the stored lines.
        /// </summary>
        /// <param name="name">Name of section</param>
        /// <param name="wikiTextLines">Lines of wiki text for this section; must be non-null and non-empty</param>
        /// <param name="parentSection">Parent section</param>
        /// <param name="containingPage">Page that contains this section</param>
        /// <exception cref="Exception">Thrown when wikiTextLines is null or empty</exception>
        public Section(string name, List <string> wikiTextLines, Section parentSection, Page containingPage)
            : this()
        {
            bool hasText = wikiTextLines != null && wikiTextLines.Count != 0;

            if (!hasText)
            {
                throw new Exception("Cannot create Section with blank text");
            }

            _name           = name;
            _lines          = wikiTextLines;
            _parentSection  = parentSection;
            _containingPage = containingPage;

            // When the first line is itself a section line it is skipped; otherwise
            // (the case for headers) the content starts at line 0.
            int contentStart = WikiMarkup.GetSubsectionRE().Match(_lines[0]).Success ? 1 : 0;

            // everything from contentStart onward is searched for nested sections
            int           contentLength = _lines.Count - contentStart;
            List <string> contentLines  = ExtractLines(_lines, contentStart, contentLength);

            _subSections.AddRange(ExtractSections(contentLines, this, _containingPage));

            // Links are taken only from the lines that precede the first subsection.
            // NOTE(review): the meaning of ExtractLines' third argument and the value
            // GetNextSectionStart yields when no subsection exists are not visible here -
            // confirm that 'firstSubsection - 1' selects the intended range.
            int           firstSubsection = GetNextSectionStart(_lines, contentStart);
            List <string> leadingLines    = ExtractLines(_lines, 0, firstSubsection - 1);

            _wikiLinks = WikiLink.GetWikiLinks(leadingLines, this);

            // markup is processed last, after links have been read from the raw lines
            WikiMarkup.ProcessMarkup(_lines);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Determines whether another object is a WikiLink with the same source page,
        /// the same source section ID and the same destination.
        /// </summary>
        /// <param name="obj">Object to compare to</param>
        /// <returns>True if obj equals this WikiLink, False otherwise</returns>
        public override bool Equals(object obj)
        {
            bool isLink = obj is WikiLink;

            if (!isLink)
            {
                return false;
            }

            WikiLink other = (WikiLink)obj;

            // links originating from different pages or sections are never equal
            bool differentOrigin = SourcePage != other.SourcePage ||
                                   SourceSection.ID != other.SourceSection.ID;
            if (differentOrigin)
            {
                return false;
            }

            // destination pages are compared only when both links have one set;
            // otherwise the destination URLs decide
            bool bothDestinationsSet = DestPage != null && other.DestPage != null;

            return bothDestinationsSet
                   ? DestPage == other.DestPage
                   : DestPageURL == other.DestPageURL;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Looks up a page in the database. A cached copy is returned when it already
        /// holds everything this call asks for; otherwise the page is (re)read from the
        /// database and cached.
        /// </summary>
        /// <param name="ns">Namespace to look page up in</param>
        /// <param name="title">Title of page to look up</param>
        /// <param name="followRedirection">Whether or not to look up destination of redirect pages</param>
        /// <param name="recoverSections">Whether or not to recover the section structure of the page</param>
        /// <param name="readLinks">Read link information</param>
        /// <param name="readTFTable">Read term frequency table</param>
        /// <returns>Page instance, or null when the page query returns no row</returns>
        /// <exception cref="Exception">
        /// Thrown when no database connection can be established, when a stored link or
        /// frequency entry does not consist of exactly two space-separated fields, or when
        /// a term occurs twice in the stored frequency list
        /// </exception>
        public Page LookupPage(WikiDB.Namespace ns, string title, bool followRedirection,
                               bool recoverSections, bool readLinks, bool readTFTable)
        {
            string url = GetURLFromPageTitle(title);

            // check page cache
            if (_pageCache.ContainsKey(url))
            {
                // a cached page is dirty when it lacks data this call requires
                Page cached = (Page)_pageCache[url];
                bool dirty  = (recoverSections && !cached.SectionsRecovered) ||
                              (readLinks && cached.WikiLinks.Count == 0) ||
                              (readTFTable && cached.TermFrequencies.Count == 0);
                if (!dirty)
                {
                    return(cached);
                }

                // drop the incomplete copy; it is rebuilt and re-cached below
                _pageCache.Remove(url);
            }

            if (!CheckConnection(true))
            {
                throw new Exception("Could not establish connection with Wikipedia database");
            }

            Page p = null;

            // from here on the URL is only used inside query strings
            url = MySQLEscape(url);
            int nsVal = NamespaceValue(ns);

            // get text and redirect page
            string selectCols = "page_id, page_text, redirects_to";

            if (recoverSections)
            {
                selectCols += ", section_layout";
            }

            string query = "SELECT " + selectCols + " FROM page " +
                           "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";
            string      redirectsTo;
            IDataReader reader = SubmitQuery(query);

            // The reader is closed in 'finally' so that a failure while decoding the row
            // (bad cast, parse error, constructor exception) cannot leave it open.
            try
            {
                if (!reader.Read())
                {
                    return(null);
                }

                int    id   = int.Parse(reader["page_id"].ToString());
                string text = Encoding.UTF8.GetString((byte[])reader["page_text"]);
                redirectsTo = reader["redirects_to"].ToString();

                // split into lines
                List <string> lines = new List <string>(text.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries));
                if (recoverSections)
                {
                    string sectionLayout = reader["section_layout"].ToString();
                    p = new Page(title, ns, id, lines, sectionLayout);
                }
                else
                {
                    p       = new Page(title, ns, id);
                    p.Lines = lines;

                    // add a single section to the page
                    Section s = new Section("full page section", lines, null, p, 0, lines.Count - 1);
                    p.Sections.Add(s);
                }
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }
            }

            // Check for page redirection. This runs only after the reader above has been
            // closed, because the recursive lookup submits queries of its own.
            // NOTE(review): the page is cached only at the end of this method, so a
            // redirect cycle in the data would recurse without bound - confirm the data
            // cannot contain one.
            Page redirectPage = null;
            if (redirectsTo != "" && followRedirection)
            {
                redirectPage = LookupPage(ns, redirectsTo, followRedirection, recoverSections, readLinks, readTFTable);
            }

            p.RedirectsTo = redirectPage;

            // get links
            if (readLinks)
            {
                query = "SELECT link_list FROM pagelinks " +
                        "WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";
                reader = SubmitQuery(query);

                try
                {
                    if (reader.Read())
                    {
                        // one "<destination page> <source section id>" entry per line
                        string   linkList  = Encoding.UTF8.GetString((byte[])reader["link_list"]);
                        string[] splitList = linkList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

                        foreach (string link in splitList)
                        {
                            string[] entry = link.Split(' ');
                            if (entry.Length != 2)
                            {
                                throw new Exception("Invalid link entry");
                            }

                            // without recovered sections every link is attached to section "1"
                            string   destPage        = entry[0];
                            string   sourceSectionID = recoverSections ? entry[1] : "1";
                            Section  s  = p.GetSection(sourceSectionID);
                            WikiLink wl = new WikiLink("[[" + destPage + "]]", destPage, "", destPage, s);
                            s.AddLink(wl);
                        }
                    }
                }
                finally
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
            }

            // get TFs
            if (readTFTable)
            {
                query  = "SELECT freq_list FROM termfreqs WHERE page_namespace=" + nsVal + " AND page_title=\"" + url + "\"";
                reader = SubmitQuery(query);

                try
                {
                    if (reader.Read())
                    {
                        // one "<word> <frequency>" entry per line
                        string   freqList  = Encoding.UTF8.GetString((byte[])reader["freq_list"]);
                        string[] splitList = freqList.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

                        // get freqs
                        for (int i = 0; i < splitList.Length; ++i)
                        {
                            string[] entry = splitList[i].Split(' ');
                            if (entry.Length != 2)
                            {
                                throw new Exception("Invalid frequency entry");
                            }

                            // NOTE(review): float.Parse uses the current culture - confirm the
                            // stored values were written with the same culture.
                            string word = entry[0];
                            float  freq = float.Parse(entry[1]);

                            if (p.TermFrequencies.ContainsKey(word))
                            {
                                throw new Exception("Duplicate TF entry");
                            }

                            p.TermFrequencies[word] = freq;
                        }
                    }
                }
                finally
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
            }

            // add page to cache
            CachePage(p);

            return(p);
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Appends a link to this section's list of wiki links.
 /// </summary>
 /// <param name="wl">Link to append; stored as given, without any checks</param>
 public void AddLink(WikiLink wl)
 {
     this._wikiLinks.Add(wl);
 }