/// <summary>
/// Finds all disambiguation pages in a raw wiki dump and appends one
/// <see cref="DisambiguationPageInfo"/> per page to <paramref name="disambiguationPages"/>.
/// </summary>
/// <remarks>
/// A page is treated as a disambiguation page when the value of its <c>title</c> element
/// contains "(rozlišovacia stránka)". The first <c>text</c> element that follows such a
/// title is scanned for bullet entries containing a wiki link (<c>* ... [[target]]</c>).
/// Each entry yields a (title, description) pair: the title is the link target and the
/// description is the whole matched entry with the link brackets removed. A piped link
/// (<c>[[a|b]]</c>) additionally yields one pair per '|'-separated part.
/// </remarks>
/// <param name="rawInput">URI or file path of the XML dump, as accepted by <see cref="XmlReader.Create(string)"/>.</param>
/// <param name="disambiguationPages">List that receives the pages found; existing entries are kept.</param>
private void FindDisambiguationPages(string rawInput, List<DisambiguationPageInfo> disambiguationPages)
{
    using (XmlReader reader = XmlReader.Create(rawInput))
    {
        // Title of a disambiguation page whose <text> has not been seen yet; null when not waiting.
        string pendingTitle = null;

        reader.MoveToContent();
        reader.Read(); // step inside the root element; the root itself is never matched

        // XNode.ReadFrom leaves the reader positioned on the node AFTER the element it consumed,
        // so the loop only calls Read() when nothing was consumed. Calling Read() unconditionally
        // would skip the element that directly follows a consumed one.
        while (!reader.EOF)
        {
            bool isElement = reader.NodeType == XmlNodeType.Element;

            if (isElement && reader.Name == "title")
            {
                string pageTitle = ((XElement)XNode.ReadFrom(reader)).Value;

                // A new title always starts a new page: a previous disambiguation title that
                // never got a <text> must not pick up the text of this page.
                pendingTitle = pageTitle.Contains("(rozlišovacia stránka)") ? pageTitle : null;
            }
            else if (isElement && pendingTitle != null && reader.Name == "text")
            {
                string wikiText = ((XElement)XNode.ReadFrom(reader)).Value;
                List<Tuple<string, string>> pagesTitlesDescs = new List<Tuple<string, string>>();

                foreach (Match lineMatch in Regex.Matches(wikiText, @"\*(.*?)\[\[(.*?)\]\]"))
                {
                    string desc = lineMatch.Value.Replace("[[", "").Replace("]]", "");
                    string title = lineMatch.Groups[2].Value;

                    // The unsplit link target is always recorded first.
                    pagesTitlesDescs.Add(new Tuple<string, string>(title, desc));

                    if (title.Contains("|"))
                    {
                        // Piped link: also record every part under the same description.
                        foreach (string part in title.Split('|'))
                        {
                            pagesTitlesDescs.Add(new Tuple<string, string>(part, desc));
                        }
                    }
                }

                disambiguationPages.Add(new DisambiguationPageInfo(pendingTitle, pagesTitlesDescs));
                pendingTitle = null;
            }
            else
            {
                reader.Read();
            }
        }
    }
}
/// <summary>
/// Checks that <c>Parser.RemoveDuplicatePages</c> leaves two entries on a disambiguation
/// page that starts out with five.
/// </summary>
public void RemoveDuplicatePages_test()
{
    // arrange
    const int expectedDistinct = 2;
    var sut = new Parser();
    var muz = new DisambiguationPageInfo
    {
        title = "Muz",
        pages =
        {
            new PageInfo("Peter Kis", "toto som ja", "ano toto som naozaj ja"),
            new PageInfo("Peter", "toto som ja", ""),
            new PageInfo("Kis", "toto som ja", ""),
            new PageInfo("Karol Rastocny", "toto je on", "toto je naozaj on"),
            new PageInfo("Karol", "toto je on", "")
        }
    };
    var input = new List<DisambiguationPageInfo> { muz };

    // act
    sut.RemoveDuplicatePages(input);

    // assert
    Assert.AreEqual(expectedDistinct, input[0].pages.Count, "Wrong number of distinct parsed pages!");
}
/// <summary>
/// Finds all disambiguation pages in a raw wiki dump and appends one
/// <see cref="DisambiguationPageInfo"/> per page to <paramref name="disambiguationPages"/>.
/// </summary>
/// <remarks>
/// A page is treated as a disambiguation page when the value of its <c>title</c> element
/// contains "(rozlišovacia stránka)". The first <c>text</c> element that follows such a
/// title is scanned for bullet entries containing a wiki link (<c>* ... [[target]]</c>).
/// Each entry yields a (title, description) pair: the title is the link target and the
/// description is the whole matched entry with the link brackets removed. A piped link
/// (<c>[[a|b]]</c>) additionally yields one pair per '|'-separated part.
/// </remarks>
/// <param name="rawInput">URI or file path of the XML dump, as accepted by <see cref="XmlReader.Create(string)"/>.</param>
/// <param name="disambiguationPages">List that receives the pages found; existing entries are kept.</param>
private void FindDisambiguationPages(string rawInput, List<DisambiguationPageInfo> disambiguationPages)
{
    using (XmlReader reader = XmlReader.Create(rawInput))
    {
        // Title of a disambiguation page whose <text> has not been seen yet; null when not waiting.
        string pendingTitle = null;

        reader.MoveToContent();
        reader.Read(); // step inside the root element; the root itself is never matched

        // XNode.ReadFrom leaves the reader positioned on the node AFTER the element it consumed,
        // so the loop only calls Read() when nothing was consumed. Calling Read() unconditionally
        // would skip the element that directly follows a consumed one.
        while (!reader.EOF)
        {
            bool isElement = reader.NodeType == XmlNodeType.Element;

            if (isElement && reader.Name == "title")
            {
                string pageTitle = ((XElement)XNode.ReadFrom(reader)).Value;

                // A new title always starts a new page: a previous disambiguation title that
                // never got a <text> must not pick up the text of this page.
                pendingTitle = pageTitle.Contains("(rozlišovacia stránka)") ? pageTitle : null;
            }
            else if (isElement && pendingTitle != null && reader.Name == "text")
            {
                string wikiText = ((XElement)XNode.ReadFrom(reader)).Value;
                List<Tuple<string, string>> pagesTitlesDescs = new List<Tuple<string, string>>();

                foreach (Match lineMatch in Regex.Matches(wikiText, @"\*(.*?)\[\[(.*?)\]\]"))
                {
                    string desc = lineMatch.Value.Replace("[[", "").Replace("]]", "");
                    string title = lineMatch.Groups[2].Value;

                    // The unsplit link target is always recorded first.
                    pagesTitlesDescs.Add(new Tuple<string, string>(title, desc));

                    if (title.Contains("|"))
                    {
                        // Piped link: also record every part under the same description.
                        foreach (string part in title.Split('|'))
                        {
                            pagesTitlesDescs.Add(new Tuple<string, string>(part, desc));
                        }
                    }
                }

                disambiguationPages.Add(new DisambiguationPageInfo(pendingTitle, pagesTitlesDescs));
                pendingTitle = null;
            }
            else
            {
                reader.Read();
            }
        }
    }
}
/// <summary>
/// Loads disambiguation pages from a previously written XML file and appends them to
/// <paramref name="disambiguationPages"/>.
/// </summary>
/// <remarks>
/// Recognised elements: <c>dspage</c> starts a new disambiguation page; a <c>title</c> seen
/// before the first <c>page</c> of a <c>dspage</c> is the disambiguation page's title;
/// <c>page</c> starts a new entry whose <c>title</c>, <c>anchor</c> and
/// <c>shortDescription</c> are collected until <c>longDescription</c> is read, at which
/// point the entry is added. <c>longDescription</c> is therefore expected to be the last
/// field of each <c>page</c>. Everything before the first <c>dspage</c> is ignored.
/// </remarks>
/// <param name="parsedInput">URI or file path of the XML file, as accepted by <see cref="XmlReader.Create(string)"/>.</param>
/// <param name="disambiguationPages">List that receives the loaded pages; existing entries are kept.</param>
private void LoadDisambiguationPages(string parsedInput, List<DisambiguationPageInfo> disambiguationPages)
{
    using (XmlReader reader = XmlReader.Create(parsedInput))
    {
        DisambiguationPageInfo current = null; // dspage being filled; null before the first one
        bool insidePage = false;               // true once a <page> was seen in the current dspage
        string title = "";
        string anchor = "";
        string shortDescription = "";

        reader.MoveToContent();
        reader.Read(); // step inside the root element; the root itself is never matched

        // XNode.ReadFrom leaves the reader positioned on the node AFTER the element it consumed,
        // so Read() is only called on branches that did not consume anything. Calling it
        // unconditionally would skip the element that directly follows a consumed one.
        while (!reader.EOF)
        {
            if (reader.NodeType != XmlNodeType.Element
                || (current == null && reader.Name != "dspage"))
            {
                reader.Read();
                continue;
            }

            string name = reader.Name;

            if (name == "dspage")
            {
                // Close the previous dspage even when it had no <page> children.
                if (current != null)
                {
                    disambiguationPages.Add(current);
                }

                current = new DisambiguationPageInfo();
                insidePage = false;
                reader.Read();
            }
            else if (name == "page")
            {
                // Reset per-entry fields so a missing element cannot inherit the
                // value of the previous entry.
                insidePage = true;
                title = "";
                anchor = "";
                shortDescription = "";
                reader.Read();
            }
            else if (name == "title")
            {
                string value = ((XElement)XNode.ReadFrom(reader)).Value;

                if (insidePage)
                {
                    title = value;
                }
                else
                {
                    current.title = value;
                }
            }
            else if (insidePage && name == "anchor")
            {
                anchor = ((XElement)XNode.ReadFrom(reader)).Value;
            }
            else if (insidePage && name == "shortDescription")
            {
                shortDescription = ((XElement)XNode.ReadFrom(reader)).Value;
            }
            else if (insidePage && name == "longDescription")
            {
                string longDescription = ((XElement)XNode.ReadFrom(reader)).Value;
                current.pages.Add(new PageInfo(title, anchor, shortDescription, longDescription));
            }
            else
            {
                reader.Read();
            }
        }

        // The last dspage has no following <dspage> to close it.
        if (current != null)
        {
            disambiguationPages.Add(current);
        }
    }
}
/// <summary>
/// Loads disambiguation pages from a previously written XML file and appends them to
/// <paramref name="disambiguationPages"/>.
/// </summary>
/// <remarks>
/// Recognised elements: <c>dspage</c> starts a new disambiguation page; a <c>title</c> seen
/// before the first <c>page</c> of a <c>dspage</c> is the disambiguation page's title;
/// <c>page</c> starts a new entry whose <c>title</c>, <c>anchor</c> and
/// <c>shortDescription</c> are collected until <c>longDescription</c> is read, at which
/// point the entry is added. <c>longDescription</c> is therefore expected to be the last
/// field of each <c>page</c>. Everything before the first <c>dspage</c> is ignored.
/// </remarks>
/// <param name="parsedInput">URI or file path of the XML file, as accepted by <see cref="XmlReader.Create(string)"/>.</param>
/// <param name="disambiguationPages">List that receives the loaded pages; existing entries are kept.</param>
private void LoadDisambiguationPages(string parsedInput, List<DisambiguationPageInfo> disambiguationPages)
{
    using (XmlReader reader = XmlReader.Create(parsedInput))
    {
        DisambiguationPageInfo current = null; // dspage being filled; null before the first one
        bool insidePage = false;               // true once a <page> was seen in the current dspage
        string title = "";
        string anchor = "";
        string shortDescription = "";

        reader.MoveToContent();
        reader.Read(); // step inside the root element; the root itself is never matched

        // XNode.ReadFrom leaves the reader positioned on the node AFTER the element it consumed,
        // so Read() is only called on branches that did not consume anything. Calling it
        // unconditionally would skip the element that directly follows a consumed one.
        while (!reader.EOF)
        {
            if (reader.NodeType != XmlNodeType.Element
                || (current == null && reader.Name != "dspage"))
            {
                reader.Read();
                continue;
            }

            string name = reader.Name;

            if (name == "dspage")
            {
                // Close the previous dspage even when it had no <page> children.
                if (current != null)
                {
                    disambiguationPages.Add(current);
                }

                current = new DisambiguationPageInfo();
                insidePage = false;
                reader.Read();
            }
            else if (name == "page")
            {
                // Reset per-entry fields so a missing element cannot inherit the
                // value of the previous entry.
                insidePage = true;
                title = "";
                anchor = "";
                shortDescription = "";
                reader.Read();
            }
            else if (name == "title")
            {
                string value = ((XElement)XNode.ReadFrom(reader)).Value;

                if (insidePage)
                {
                    title = value;
                }
                else
                {
                    current.title = value;
                }
            }
            else if (insidePage && name == "anchor")
            {
                anchor = ((XElement)XNode.ReadFrom(reader)).Value;
            }
            else if (insidePage && name == "shortDescription")
            {
                shortDescription = ((XElement)XNode.ReadFrom(reader)).Value;
            }
            else if (insidePage && name == "longDescription")
            {
                string longDescription = ((XElement)XNode.ReadFrom(reader)).Value;
                current.pages.Add(new PageInfo(title, anchor, shortDescription, longDescription));
            }
            else
            {
                reader.Read();
            }
        }

        // The last dspage has no following <dspage> to close it.
        if (current != null)
        {
            disambiguationPages.Add(current);
        }
    }
}