/// <summary>
/// Extracts category links from the article text and returns them, preceded by any
/// DEFAULTSORT and (on en-wiki) followed by any Lifetime template, as a single string.
/// </summary>
/// <param name="articleText">The wiki text of the article; every extracted item is removed from it.</param>
/// <param name="articleTitle">Title of the article, handed to <c>CatKeyer</c> when <c>AddCatKey</c> is set.</param>
/// <returns>The DEFAULTSORT (if any), then the category list, then the Lifetime template (if any).</returns>
/// <exception cref="ArgumentException">The text contains more than one DEFAULTSORT match.</exception>
public string RemoveCats(ref string articleText, string articleTitle)
{
    List<string> categoryList = new List<string>();

    // http://en.wikipedia.org/wiki/Wikipedia_talk:AutoWikiBrowser/Bugs#Comments_get_removed_from_between_categories
    // allow comments between categories, and keep them in the same place, but don't grab any comment just after the last category
    // NOTE(review): the ⌊⌊⌊⌊n⌋⌋⌋⌋ tokens look like hidden-text placeholders inserted elsewhere - confirm
    Regex r = new Regex("<!-- ? ?\\[\\[" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                        + ".*?(\\]\\]|\\|.*?\\]\\]).*?-->|\\[\\[" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                        + ".*?(\\]\\]|\\|.*?\\]\\])( {0,4}⌊⌊⌊⌊[0-9]{1,4}⌋⌋⌋⌋|\\s*<!--.*?-->(?=\r\n\\[\\[))?",
                        RegexOptions.Singleline);

    MatchCollection matches = r.Matches(articleText);
    foreach (Match m in matches)
    {
        // "... for deletion" categories are not kept in the returned list, although the full
        // match collection (including them) is still passed to RemoveMatches below
        if (!Regex.IsMatch(m.Value, "\\[\\[Category:(Pages|Categories|Articles) for deletion\\]\\]"))
        {
            categoryList.Add(m.Value);
        }
    }

    articleText = Tools.RemoveMatches(articleText, matches);

    if (AddCatKey)
    {
        categoryList = CatKeyer(categoryList, articleTitle);
    }

    // run the comment regex once and reuse the result instead of IsMatch + Match
    Match catCommentMatch = CatCommentRegex.Match(articleText);
    if (catCommentMatch.Success)
    {
        string catComment = catCommentMatch.Value;
        articleText = articleText.Replace(catComment, "");
        categoryList.Insert(0, catComment);
    }

    MatchCollection mc = WikiRegexes.Defaultsort.Matches(articleText);
    if (mc.Count > 1)
    {
        throw new ArgumentException("Page contains multiple {{DEFAULTSORTS}} tags. Metadata sorting cancelled");
    }

    string defaultSort = "";
    if (mc.Count > 0)
    {
        defaultSort = mc[0].Value;
    }

    if (!string.IsNullOrEmpty(defaultSort))
    {
        articleText = articleText.Replace(defaultSort, "");

        // ordinal, case-insensitive test: no culture-dependent upper-casing, no temporary string
        if (defaultSort.IndexOf("DEFAULTSORT", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            defaultSort = TalkPageHeaders.FormatDefaultSort(defaultSort);
        }

        // re-check: the formatter's result is what decides whether a line break is appended
        if (!string.IsNullOrEmpty(defaultSort))
        {
            defaultSort += "\r\n";
        }
    }

    // on en-wiki find any {{Lifetime}} template and move directly after categories
    string lifetime = "";
    if (Variables.LangCode == LangCodeEnum.en)
    {
        lifetime = WikiRegexes.Lifetime.Match(articleText).Value;
        if (!string.IsNullOrEmpty(lifetime))
        {
            articleText = articleText.Replace(lifetime, "");

            // http://en.wikipedia.org/wiki/Wikipedia_talk:AutoWikiBrowser/Bugs#Blank_lines_after_Lifetime
            lifetime += "\r\n";
        }
    }

    return defaultSort + ListToString(categoryList) + lifetime;
}
/// <summary>
/// Extracts category links from the article text and returns them, preceded by any
/// DEFAULTSORT, as a single string.
/// </summary>
/// <param name="articleText">
/// The wiki text of the article. Extracted items are removed from it, unless the removal is
/// rejected by <c>UnformattedTextNotChanged</c>, in which case it is restored to its original value.
/// </param>
/// <param name="articleTitle">Title of the article, handed to <c>CatKeyer</c> when <c>AddCatKey</c> is set.</param>
/// <returns>
/// The DEFAULTSORT (if any) followed by the category list, or an empty string when the
/// category removal was rolled back.
/// </returns>
/// <exception cref="ArgumentException">The text contains more than one DEFAULTSORT match.</exception>
public string RemoveCats(ref string articleText, string articleTitle)
{
    List<string> categoryList = new List<string>();
    string originalArticleText = articleText;

    // allow comments between categories, and keep them in the same place, but don't grab any comment just after the last category
    // NOTE(review): the ⌊⌊⌊⌊n⌋⌋⌋⌋ tokens look like hidden-text placeholders inserted elsewhere - confirm
    Regex r = new Regex(@"<!-- [^<>]*?\[\[\s*" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                        + @".*?(\]\]|\|.*?\]\]).*?-->|\[\[" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                        + @".*?(\]\]|\|.*?\]\])(\s*⌊⌊⌊⌊\d{1,4}⌋⌋⌋⌋|\s*<!--.*?-->(?=\r\n\[\[\s*" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                        + @"))?",
                        RegexOptions.Singleline);

    MatchCollection matches = r.Matches(articleText);
    foreach (Match m in matches)
    {
        // "... for deletion" categories are not kept in the returned list, although the full
        // match collection (including them) is still passed to RemoveMatches below
        if (!Regex.IsMatch(m.Value, @"\[\[Category:(Pages|Categories|Articles) for deletion\]\]"))
        {
            categoryList.Add(m.Value);
        }
    }

    articleText = Tools.RemoveMatches(articleText, matches);

    // if category tidying has changed comments/nowikis return with no changes - we've pulled a cat from a comment
    if (!UnformattedTextNotChanged(originalArticleText, articleText))
    {
        articleText = originalArticleText;
        return "";
    }

    if (AddCatKey)
    {
        categoryList = CatKeyer(categoryList, articleTitle);
    }

    // run the comment regex once and reuse the result instead of IsMatch + Match
    Match catCommentMatch = CatCommentRegex.Match(articleText);
    if (catCommentMatch.Success)
    {
        string catComment = catCommentMatch.Value;
        articleText = articleText.Replace(catComment, "");
        categoryList.Insert(0, catComment);
    }

    // this count is taken on the text with comments still in place, so the check below
    // fires before the commented-out test further down is reached
    MatchCollection mc = WikiRegexes.Defaultsort.Matches(articleText);
    if (mc.Count > 1)
    {
        throw new ArgumentException("Page contains multiple {{DEFAULTSORTS}} tags. Metadata sorting cancelled");
    }

    string defaultSort = "";

    // ignore commented out DEFAULTSORT - http://en.wikipedia.org/wiki/Wikipedia_talk:AutoWikiBrowser/Bugs#Moving_DEFAULTSORT_in_HTML_comments
    // (the match count must be the same once comments have been stripped from the text)
    if (mc.Count > 0 && WikiRegexes.Defaultsort.Matches(WikiRegexes.Comments.Replace(articleText, "")).Count == mc.Count)
    {
        defaultSort = mc[0].Value;
    }

    if (!string.IsNullOrEmpty(defaultSort))
    {
        articleText = articleText.Replace(defaultSort, "");

        // ordinal, case-insensitive test: no culture-dependent upper-casing, no temporary string
        if (defaultSort.IndexOf("DEFAULTSORT", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            defaultSort = TalkPageHeaders.FormatDefaultSort(defaultSort);
        }

        // re-check: the formatter's result is what decides whether a line break is appended
        if (!string.IsNullOrEmpty(defaultSort))
        {
            defaultSort += "\r\n";
        }
    }

    return defaultSort + ListToString(categoryList);
}
/// <summary>
/// Pulls category links and any DEFAULTSORT out of the article text and returns them
/// joined into one string, DEFAULTSORT first.
/// </summary>
/// <param name="ArticleText">The wiki text of the article; every extracted item is removed from it.</param>
/// <param name="ArticleTitle">Title of the article, handed to <c>catKeyer</c> when <c>parser.addCatKey</c> is set.</param>
/// <returns>The DEFAULTSORT (if any) followed by the category list.</returns>
/// <exception cref="ArgumentException">The text contains more than one DEFAULTSORT match.</exception>
public string removeCats(ref string ArticleText, string ArticleTitle)
{
    List<string> categoryList = new List<string>();

    // first alternative: a category link inside an HTML comment;
    // second alternative: a plain category link, optionally followed by a ⌊⌊⌊⌊n⌋⌋⌋⌋ token
    Regex categoryPattern = new Regex(
        "<!-- ? ?\\[\\[" + Variables.NamespacesCaseInsensitive[14] + ".*?(\\]\\]|\\|.*?\\]\\]).*?-->"
        + "|\\[\\[" + Variables.NamespacesCaseInsensitive[14] + ".*?(\\]\\]|\\|.*?\\]\\])( {0,4}⌊⌊⌊⌊[0-9]{1,4}⌋⌋⌋⌋)?");

    MatchCollection found = categoryPattern.Matches(ArticleText);
    for (int i = 0; i < found.Count; i++)
    {
        string categoryText = found[i].Value;

        // "... for deletion" categories are left out of the returned list
        if (Regex.IsMatch(categoryText, "\\[\\[Category:(Pages|Categories|Articles) for deletion\\]\\]"))
        {
            continue;
        }

        // underscores become spaces before the entry is stored
        categoryList.Add(categoryText.Replace("_", " "));
    }

    ArticleText = Tools.RemoveMatches(ArticleText, found);

    if (parser.addCatKey)
    {
        categoryList = catKeyer(categoryList, ArticleTitle);
    }

    // a matching category comment is taken out of the text and placed at the head of the list
    Match commentMatch = CatCommentRegex.Match(ArticleText);
    if (commentMatch.Success)
    {
        string commentText = commentMatch.Value;
        ArticleText = ArticleText.Replace(commentText, "");
        categoryList.Insert(0, commentText);
    }

    MatchCollection sortMatches = WikiRegexes.Defaultsort.Matches(ArticleText);
    if (sortMatches.Count > 1)
    {
        throw new ArgumentException("Page contains multiple {{DEFAULTSORTS}} tags. Metadata sorting cancelled");
    }

    string sortKey = sortMatches.Count > 0 ? sortMatches[0].Value : "";
    if (sortKey.Length > 0)
    {
        ArticleText = ArticleText.Replace(sortKey, "");

        if (sortKey.ToUpper().Contains("DEFAULTSORT"))
        {
            sortKey = TalkPageHeaders.FormatDefaultSort(sortKey);
        }

        // only a non-empty (possibly reformatted) key gets its own line
        if (!string.IsNullOrEmpty(sortKey))
        {
            sortKey += "\r\n";
        }
    }

    return sortKey + ListToString(categoryList);
}