コード例 #1
0
ファイル: Parser.cs プロジェクト: horvathpeter/wikipedia
        /// <summary>
        /// Scans a raw wiki dump for disambiguation pages and collects the links listed on each one.
        /// </summary>
        /// <remarks>
        /// A <c>title</c> element whose value contains "(rozlišovacia stránka)" marks a
        /// disambiguation page. The next <c>text</c> element after such a title is searched for
        /// an asterisk followed, on the same line, by a <c>[[...]]</c> link. Every hit yields a
        /// (link target, description) tuple, where the description is the matched text with the
        /// link brackets removed. A piped target such as <c>A|B</c> is stored once as a whole and
        /// once more for each part.
        /// </remarks>
        /// <param name="rawInput">URI or path of the XML dump; passed to <c>XmlReader.Create</c>.</param>
        /// <param name="disambiguationPages">List that receives one entry per disambiguation page found.</param>
        private void FindDisambiguationPages(string rawInput, List <DisambiguationPageInfo> disambiguationPages)
        {
            using (XmlReader reader = XmlReader.Create(rawInput))
            {
                // Title of a disambiguation page whose <text> element has not been reached yet.
                string pendingTitle = null;

                // Step past the root element, then walk the document node by node.
                reader.MoveToContent();
                reader.Read();
                while (!reader.EOF)
                {
                    bool isElement = reader.NodeType == XmlNodeType.Element;

                    if (isElement && pendingTitle == null && reader.Name == "title")
                    {
                        // XNode.ReadFrom leaves the reader on the node that follows </title>.
                        // Calling Read() afterwards would skip that node, so this branch
                        // deliberately does not advance the reader a second time.
                        XElement titleElement = (XElement)XNode.ReadFrom(reader);
                        if (titleElement.Value.Contains("(rozlišovacia stránka)"))
                        {
                            pendingTitle = titleElement.Value;
                        }
                    }
                    else if (isElement && pendingTitle != null && reader.Name == "text")
                    {
                        XElement textElement = (XElement)XNode.ReadFrom(reader);

                        List<Tuple<string, string>> pagesTitlesDescs = new List<Tuple<string, string>>();
                        foreach (Match lineMatch in Regex.Matches(textElement.Value, @"\*(.*?)\[\[(.*?)\]\]"))
                        {
                            string desc  = lineMatch.Value.Replace("[[", "").Replace("]]", "");
                            string title = lineMatch.Groups[2].Value;
                            pagesTitlesDescs.Add(new Tuple<string, string>(title, desc));

                            // Piped link: additionally register every part under the same description.
                            if (title.Contains("|"))
                            {
                                foreach (string part in title.Split('|'))
                                {
                                    pagesTitlesDescs.Add(new Tuple<string, string>(part, desc));
                                }
                            }
                        }

                        disambiguationPages.Add(new DisambiguationPageInfo(pendingTitle, pagesTitlesDescs));
                        pendingTitle = null;
                    }
                    else
                    {
                        reader.Read();
                    }
                }
            }
        }
コード例 #2
0
ファイル: ParserTest.cs プロジェクト: irfiit/wikipedia
        public void RemoveDuplicatePages_test()
        {
            // Five entries go in; RemoveDuplicatePages is expected to leave two of them.
            // NOTE(review): the entries carry only two distinct second arguments, which is
            // presumably the key used for de-duplication - confirm against Parser.

            // arrange
            int expected = 2;
            Parser subject = new Parser();
            List<DisambiguationPageInfo> allPages = new List<DisambiguationPageInfo>();
            DisambiguationPageInfo ambiguous = new DisambiguationPageInfo
            {
                title = "Muz",
                pages =
                {
                    new PageInfo("Peter Kis", "toto som ja", "ano toto som naozaj ja"),
                    new PageInfo("Peter", "toto som ja", ""),
                    new PageInfo("Kis", "toto som ja", ""),
                    new PageInfo("Karol Rastocny", "toto je on", "toto je naozaj on"),
                    new PageInfo("Karol", "toto je on", "")
                }
            };
            allPages.Add(ambiguous);

            // act
            subject.RemoveDuplicatePages(allPages);
            int remaining = allPages[0].pages.Count;

            // assert
            Assert.AreEqual(expected, remaining, "Wrong number of distinct parsed pages!");
        }
コード例 #3
0
ファイル: Parser.cs プロジェクト: irfiit/wikipedia
        /// <summary>
        /// Scans a raw wiki dump for disambiguation pages and collects the links listed on each one.
        /// </summary>
        /// <remarks>
        /// A <c>title</c> element whose value contains "(rozlišovacia stránka)" marks a
        /// disambiguation page. The next <c>text</c> element after such a title is searched for
        /// an asterisk followed, on the same line, by a <c>[[...]]</c> link. Every hit yields a
        /// (link target, description) tuple, where the description is the matched text with the
        /// link brackets removed. A piped target such as <c>A|B</c> is stored once as a whole and
        /// once more for each part.
        /// </remarks>
        /// <param name="rawInput">URI or path of the XML dump; passed to <c>XmlReader.Create</c>.</param>
        /// <param name="disambiguationPages">List that receives one entry per disambiguation page found.</param>
        private void FindDisambiguationPages(string rawInput, List<DisambiguationPageInfo> disambiguationPages)
        {
            using (XmlReader reader = XmlReader.Create(rawInput))
            {
                // Title of a disambiguation page whose <text> element has not been reached yet.
                string pendingTitle = null;

                // Step past the root element, then walk the document node by node.
                reader.MoveToContent();
                reader.Read();
                while (!reader.EOF)
                {
                    bool isElement = reader.NodeType == XmlNodeType.Element;

                    if (isElement && pendingTitle == null && reader.Name == "title")
                    {
                        // XNode.ReadFrom leaves the reader on the node that follows </title>.
                        // Calling Read() afterwards would skip that node, so this branch
                        // deliberately does not advance the reader a second time.
                        XElement titleElement = (XElement)XNode.ReadFrom(reader);
                        if (titleElement.Value.Contains("(rozlišovacia stránka)"))
                        {
                            pendingTitle = titleElement.Value;
                        }
                    }
                    else if (isElement && pendingTitle != null && reader.Name == "text")
                    {
                        XElement textElement = (XElement)XNode.ReadFrom(reader);

                        List<Tuple<string, string>> pagesTitlesDescs = new List<Tuple<string, string>>();
                        foreach (Match lineMatch in Regex.Matches(textElement.Value, @"\*(.*?)\[\[(.*?)\]\]"))
                        {
                            string desc = lineMatch.Value.Replace("[[", "").Replace("]]", "");
                            string title = lineMatch.Groups[2].Value;
                            pagesTitlesDescs.Add(new Tuple<string, string>(title, desc));

                            // Piped link: additionally register every part under the same description.
                            if (title.Contains("|"))
                            {
                                foreach (string part in title.Split('|'))
                                {
                                    pagesTitlesDescs.Add(new Tuple<string, string>(part, desc));
                                }
                            }
                        }

                        disambiguationPages.Add(new DisambiguationPageInfo(pendingTitle, pagesTitlesDescs));
                        pendingTitle = null;
                    }
                    else
                    {
                        reader.Read();
                    }
                }
            }
        }
コード例 #4
0
ファイル: Parser.cs プロジェクト: horvathpeter/wikipedia
        /// <summary>
        /// Loads previously parsed disambiguation pages from an XML file.
        /// </summary>
        /// <remarks>
        /// Every <c>dspage</c> element starts a new <see cref="DisambiguationPageInfo"/>. A
        /// <c>title</c> element seen before the first <c>page</c> element of that entry becomes
        /// the entry's title. After a <c>page</c> element, <c>title</c>, <c>anchor</c> and
        /// <c>shortDescription</c> are collected, and a <see cref="PageInfo"/> is appended as
        /// soon as <c>longDescription</c> has been read. Collected values are reset at each
        /// <c>page</c> element so a missing child cannot inherit the previous page's value.
        /// Entries are appended in document order, including entries without any page.
        /// </remarks>
        /// <param name="parsedInput">URI or path of the parsed XML file; passed to <c>XmlReader.Create</c>.</param>
        /// <param name="disambiguationPages">List that receives the loaded entries.</param>
        private void LoadDisambiguationPages(string parsedInput, List <DisambiguationPageInfo> disambiguationPages)
        {
            using (XmlReader reader = XmlReader.Create(parsedInput))
            {
                DisambiguationPageInfo current = null;   // entry being filled, null before the first <dspage>
                bool   insidePage       = false;         // true once a <page> was seen for the current entry
                string title            = "";
                string anchor           = "";
                string shortDescription = "";

                // Step past the root element, then walk the document node by node.
                reader.MoveToContent();
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType != XmlNodeType.Element)
                    {
                        reader.Read();
                        continue;
                    }

                    string name = reader.Name;
                    if (name == "dspage")
                    {
                        // A new entry begins: hand over the finished one first.
                        if (current != null)
                        {
                            disambiguationPages.Add(current);
                        }
                        current    = new DisambiguationPageInfo();
                        insidePage = false;
                        reader.Read();
                    }
                    else if (current == null)
                    {
                        // Nothing is collected before the first <dspage>.
                        reader.Read();
                    }
                    else if (name == "page")
                    {
                        insidePage       = true;
                        title            = "";
                        anchor           = "";
                        shortDescription = "";
                        reader.Read();
                    }
                    else if (name == "title")
                    {
                        // XNode.ReadFrom leaves the reader on the node after the element, so the
                        // branches that call it must not call Read() as well.
                        string value = ((XElement)XNode.ReadFrom(reader)).Value;
                        if (insidePage)
                        {
                            title = value;
                        }
                        else
                        {
                            current.title = value;
                        }
                    }
                    else if (insidePage && name == "anchor")
                    {
                        anchor = ((XElement)XNode.ReadFrom(reader)).Value;
                    }
                    else if (insidePage && name == "shortDescription")
                    {
                        shortDescription = ((XElement)XNode.ReadFrom(reader)).Value;
                    }
                    else if (insidePage && name == "longDescription")
                    {
                        // longDescription completes a page record.
                        string longDescription = ((XElement)XNode.ReadFrom(reader)).Value;
                        current.pages.Add(new PageInfo(title, anchor, shortDescription, longDescription));
                    }
                    else
                    {
                        reader.Read();
                    }
                }

                // The last entry has no following <dspage> to trigger its hand-over.
                if (current != null)
                {
                    disambiguationPages.Add(current);
                }
            }
        }
コード例 #5
0
ファイル: Parser.cs プロジェクト: irfiit/wikipedia
        /// <summary>
        /// Loads previously parsed disambiguation pages from an XML file.
        /// </summary>
        /// <remarks>
        /// Every <c>dspage</c> element starts a new <see cref="DisambiguationPageInfo"/>. A
        /// <c>title</c> element seen before the first <c>page</c> element of that entry becomes
        /// the entry's title. After a <c>page</c> element, <c>title</c>, <c>anchor</c> and
        /// <c>shortDescription</c> are collected, and a <see cref="PageInfo"/> is appended as
        /// soon as <c>longDescription</c> has been read. Collected values are reset at each
        /// <c>page</c> element so a missing child cannot inherit the previous page's value.
        /// Entries are appended in document order, including entries without any page.
        /// </remarks>
        /// <param name="parsedInput">URI or path of the parsed XML file; passed to <c>XmlReader.Create</c>.</param>
        /// <param name="disambiguationPages">List that receives the loaded entries.</param>
        private void LoadDisambiguationPages(string parsedInput, List<DisambiguationPageInfo> disambiguationPages)
        {
            using (XmlReader reader = XmlReader.Create(parsedInput))
            {
                DisambiguationPageInfo current = null; // entry being filled, null before the first <dspage>
                bool insidePage = false;               // true once a <page> was seen for the current entry
                string title = "";
                string anchor = "";
                string shortDescription = "";

                // Step past the root element, then walk the document node by node.
                reader.MoveToContent();
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType != XmlNodeType.Element)
                    {
                        reader.Read();
                        continue;
                    }

                    string name = reader.Name;
                    if (name == "dspage")
                    {
                        // A new entry begins: hand over the finished one first.
                        if (current != null)
                        {
                            disambiguationPages.Add(current);
                        }
                        current = new DisambiguationPageInfo();
                        insidePage = false;
                        reader.Read();
                    }
                    else if (current == null)
                    {
                        // Nothing is collected before the first <dspage>.
                        reader.Read();
                    }
                    else if (name == "page")
                    {
                        insidePage = true;
                        title = "";
                        anchor = "";
                        shortDescription = "";
                        reader.Read();
                    }
                    else if (name == "title")
                    {
                        // XNode.ReadFrom leaves the reader on the node after the element, so the
                        // branches that call it must not call Read() as well.
                        string value = ((XElement)XNode.ReadFrom(reader)).Value;
                        if (insidePage)
                        {
                            title = value;
                        }
                        else
                        {
                            current.title = value;
                        }
                    }
                    else if (insidePage && name == "anchor")
                    {
                        anchor = ((XElement)XNode.ReadFrom(reader)).Value;
                    }
                    else if (insidePage && name == "shortDescription")
                    {
                        shortDescription = ((XElement)XNode.ReadFrom(reader)).Value;
                    }
                    else if (insidePage && name == "longDescription")
                    {
                        // longDescription completes a page record.
                        string longDescription = ((XElement)XNode.ReadFrom(reader)).Value;
                        current.pages.Add(new PageInfo(title, anchor, shortDescription, longDescription));
                    }
                    else
                    {
                        reader.Read();
                    }
                }

                // The last entry has no following <dspage> to trigger its hand-over.
                if (current != null)
                {
                    disambiguationPages.Add(current);
                }
            }
        }