예제 #1
0
        /// <summary>
        /// Pulls the category links, the category comment, any DEFAULTSORT and (on en-wiki) any Lifetime template
        /// out of the article text and returns them as one string: DEFAULTSORT first, then the categories, then Lifetime.
        /// </summary>
        /// <remarks>
        /// All category matches are handed to Tools.RemoveMatches, whose result replaces <paramref name="articleText"/>;
        /// the category comment, DEFAULTSORT and Lifetime values are then stripped from the text with string.Replace.
        /// Matches containing a "[[Category:(Pages|Categories|Articles) for deletion]]" link are not added to the returned list.
        /// NOTE(review): duplicate removal and whitespace/underscore cleaning are not performed in this method itself;
        /// presumably ListToString and/or CatKeyer take care of that - confirm.
        /// </remarks>
        /// <param name="articleText">The wiki text of the article; updated in place with the extracted items removed.</param>
        /// <param name="articleTitle">Title of the article; only used as an argument to CatKeyer when AddCatKey is set.</param>
        /// <returns>The DEFAULTSORT (if any), the categories and the Lifetime template (if any) joined into a single string.</returns>
        /// <exception cref="ArgumentException">The article text contains more than one DEFAULTSORT match.</exception>
        public string RemoveCats(ref string articleText, string articleTitle)
        {
            List<string> categoryList = new List<string>();

            // http://en.wikipedia.org/wiki/Wikipedia_talk:AutoWikiBrowser/Bugs#Comments_get_removed_from_between_categories
            // allow comments between categories, and keep them in the same place, but don't grab any comment just after the last category
            Regex r = new Regex("<!-- ? ?\\[\\[" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                                + ".*?(\\]\\]|\\|.*?\\]\\]).*?-->|\\[\\["
                                + Variables.NamespacesCaseInsensitive[Namespace.Category]
                                + ".*?(\\]\\]|\\|.*?\\]\\])( {0,4}⌊⌊⌊⌊[0-9]{1,4}⌋⌋⌋⌋|\\s*<!--.*?-->(?=\r\n\\[\\[))?", RegexOptions.Singleline);

            MatchCollection matches = r.Matches(articleText);

            foreach (Match m in matches)
            {
                // "... for deletion" categories are deliberately left out of the list that gets written back
                if (!Regex.IsMatch(m.Value, "\\[\\[Category:(Pages|Categories|Articles) for deletion\\]\\]"))
                {
                    categoryList.Add(m.Value);
                }
            }

            articleText = Tools.RemoveMatches(articleText, matches);

            if (AddCatKey)
            {
                categoryList = CatKeyer(categoryList, articleTitle);
            }

            // evaluate the regex once and reuse the match instead of IsMatch followed by Match
            Match catCommentMatch = CatCommentRegex.Match(articleText);

            if (catCommentMatch.Success)
            {
                string catComment = catCommentMatch.Value;
                articleText = articleText.Replace(catComment, "");

                // the category comment always leads the category list
                categoryList.Insert(0, catComment);
            }

            MatchCollection mc = WikiRegexes.Defaultsort.Matches(articleText);

            if (mc.Count > 1)
            {
                throw new ArgumentException("Page contains multiple {{DEFAULTSORTS}} tags. Metadata sorting cancelled");
            }

            string defaultSort = "";

            if (mc.Count > 0)
            {
                defaultSort = mc[0].Value;
            }

            if (!string.IsNullOrEmpty(defaultSort))
            {
                articleText = articleText.Replace(defaultSort, "");

                // keyword test is non-linguistic, so compare ordinally rather than via culture-sensitive ToUpper()
                if (defaultSort.IndexOf("DEFAULTSORT", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    defaultSort = TalkPageHeaders.FormatDefaultSort(defaultSort);
                }
            }

            // re-check: FormatDefaultSort may have changed the value
            if (!string.IsNullOrEmpty(defaultSort))
            {
                defaultSort += "\r\n";
            }

            // on en-wiki find any {{Lifetime}} template and move directly after categories
            string lifetime = "";

            if (Variables.LangCode == LangCodeEnum.en)
            {
                lifetime = WikiRegexes.Lifetime.Match(articleText).Value;

                if (!string.IsNullOrEmpty(lifetime))
                {
                    articleText = articleText.Replace(lifetime, "");

                    // http://en.wikipedia.org/wiki/Wikipedia_talk:AutoWikiBrowser/Bugs#Blank_lines_after_Lifetime
                    lifetime += "\r\n";
                }
            }

            return defaultSort + ListToString(categoryList) + lifetime;
        }
예제 #2
0
        /// <summary>
        /// Pulls the category links, the category comment and any DEFAULTSORT out of the article text and
        /// returns them as one string: DEFAULTSORT first, then the categories.
        /// </summary>
        /// <remarks>
        /// All category matches are handed to Tools.RemoveMatches, whose result replaces <paramref name="articleText"/>.
        /// If UnformattedTextNotChanged then reports a difference against the original text, the original text is
        /// restored and an empty string is returned.
        /// Matches containing a "[[Category:(Pages|Categories|Articles) for deletion]]" link are not added to the returned list.
        /// NOTE(review): duplicate removal and whitespace/underscore cleaning are not performed in this method itself;
        /// presumably ListToString and/or CatKeyer take care of that - confirm.
        /// </remarks>
        /// <param name="articleText">The wiki text of the article; updated in place with the extracted items removed.</param>
        /// <param name="articleTitle">Title of the article; only used as an argument to CatKeyer when AddCatKey is set.</param>
        /// <returns>The DEFAULTSORT (if any) and the page categories in a single string, or "" when the removal was rolled back.</returns>
        /// <exception cref="ArgumentException">The article text contains more than one DEFAULTSORT match.</exception>
        public string RemoveCats(ref string articleText, string articleTitle)
        {
            List<string> categoryList = new List<string>();
            string originalArticleText = articleText;

            // allow comments between categories, and keep them in the same place, but don't grab any comment just after the last category
            Regex r = new Regex(@"<!-- [^<>]*?\[\[\s*" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                                + @".*?(\]\]|\|.*?\]\]).*?-->|\[\["
                                + Variables.NamespacesCaseInsensitive[Namespace.Category]
                                + @".*?(\]\]|\|.*?\]\])(\s*⌊⌊⌊⌊\d{1,4}⌋⌋⌋⌋|\s*<!--.*?-->(?=\r\n\[\[\s*" + Variables.NamespacesCaseInsensitive[Namespace.Category]
                                + @"))?", RegexOptions.Singleline);

            MatchCollection matches = r.Matches(articleText);

            foreach (Match m in matches)
            {
                // "... for deletion" categories are deliberately left out of the list that gets written back
                if (!Regex.IsMatch(m.Value, @"\[\[Category:(Pages|Categories|Articles) for deletion\]\]"))
                {
                    categoryList.Add(m.Value);
                }
            }

            articleText = Tools.RemoveMatches(articleText, matches);

            // if category tidying has changed comments/nowikis return with no changes – we've pulled a cat from a comment
            if (!UnformattedTextNotChanged(originalArticleText, articleText))
            {
                articleText = originalArticleText;
                return "";
            }

            if (AddCatKey)
            {
                categoryList = CatKeyer(categoryList, articleTitle);
            }

            // evaluate the regex once and reuse the match instead of IsMatch followed by Match
            Match catCommentMatch = CatCommentRegex.Match(articleText);

            if (catCommentMatch.Success)
            {
                string catComment = catCommentMatch.Value;
                articleText = articleText.Replace(catComment, "");

                // the category comment always leads the category list
                categoryList.Insert(0, catComment);
            }

            MatchCollection mc = WikiRegexes.Defaultsort.Matches(articleText);

            if (mc.Count > 1)
            {
                throw new ArgumentException("Page contains multiple {{DEFAULTSORTS}} tags. Metadata sorting cancelled");
            }

            string defaultSort = "";

            // ignore commented out DEFAULTSORT – http://en.wikipedia.org/wiki/Wikipedia_talk:AutoWikiBrowser/Bugs#Moving_DEFAULTSORT_in_HTML_comments
            // (the match count must be the same once WikiRegexes.Comments matches are stripped from the text)
            if (mc.Count > 0 && WikiRegexes.Defaultsort.Matches(WikiRegexes.Comments.Replace(articleText, "")).Count == mc.Count)
            {
                defaultSort = mc[0].Value;
            }

            if (!string.IsNullOrEmpty(defaultSort))
            {
                articleText = articleText.Replace(defaultSort, "");

                // keyword test is non-linguistic, so compare ordinally rather than via culture-sensitive ToUpper()
                if (defaultSort.IndexOf("DEFAULTSORT", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    defaultSort = TalkPageHeaders.FormatDefaultSort(defaultSort);
                }
            }

            // re-check: FormatDefaultSort may have changed the value
            if (!string.IsNullOrEmpty(defaultSort))
            {
                defaultSort += "\r\n";
            }

            return defaultSort + ListToString(categoryList);
        }
        /// <summary>
        /// Pulls the category links, the category comment and any DEFAULTSORT out of the article text and
        /// returns them as one string: DEFAULTSORT first, then the categories.
        /// </summary>
        /// <remarks>
        /// All category matches are handed to Tools.RemoveMatches, whose result replaces <paramref name="ArticleText"/>.
        /// Underscores in each kept match are replaced with spaces. Matches containing a
        /// "[[Category:(Pages|Categories|Articles) for deletion]]" link are not added to the returned list.
        /// </remarks>
        /// <param name="ArticleText">The wiki text of the article; updated in place with the extracted items removed.</param>
        /// <param name="ArticleTitle">Title of the article; only used as an argument to catKeyer when parser.addCatKey is set.</param>
        /// <returns>The DEFAULTSORT (if any) and the page categories in a single string.</returns>
        /// <exception cref="ArgumentException">The article text contains more than one DEFAULTSORT match.</exception>
        public string removeCats(ref string ArticleText, string ArticleTitle)
        {
            List<string> categoryList = new List<string>();

            // NOTE(review): index 14 is presumably the Category namespace id - confirm against Variables
            Regex r = new Regex("<!-- ? ?\\[\\[" + Variables.NamespacesCaseInsensitive[14] + ".*?(\\]\\]|\\|.*?\\]\\]).*?-->|\\[\\[" + Variables.NamespacesCaseInsensitive[14] + ".*?(\\]\\]|\\|.*?\\]\\])( {0,4}⌊⌊⌊⌊[0-9]{1,4}⌋⌋⌋⌋)?");

            MatchCollection matches = r.Matches(ArticleText);

            foreach (Match m in matches)
            {
                string category = m.Value;

                // add to the list with underscores replaced by spaces; "... for deletion" categories are left out
                if (!Regex.IsMatch(category, "\\[\\[Category:(Pages|Categories|Articles) for deletion\\]\\]"))
                {
                    categoryList.Add(category.Replace("_", " "));
                }
            }

            ArticleText = Tools.RemoveMatches(ArticleText, matches);

            if (parser.addCatKey)
            {
                categoryList = catKeyer(categoryList, ArticleTitle);
            }

            // evaluate the regex once and reuse the match instead of IsMatch followed by Match
            Match catCommentMatch = CatCommentRegex.Match(ArticleText);

            if (catCommentMatch.Success)
            {
                string catComment = catCommentMatch.Value;
                ArticleText = ArticleText.Replace(catComment, "");

                // the category comment always leads the category list
                categoryList.Insert(0, catComment);
            }

            MatchCollection mc = WikiRegexes.Defaultsort.Matches(ArticleText);

            if (mc.Count > 1)
            {
                throw new ArgumentException("Page contains multiple {{DEFAULTSORTS}} tags. Metadata sorting cancelled");
            }

            string defaultSort = "";

            if (mc.Count > 0)
            {
                defaultSort = mc[0].Value;
            }

            if (!string.IsNullOrEmpty(defaultSort))
            {
                ArticleText = ArticleText.Replace(defaultSort, "");
            }

            // keyword test is non-linguistic, so compare ordinally rather than via culture-sensitive ToUpper()
            if (defaultSort.IndexOf("DEFAULTSORT", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                defaultSort = TalkPageHeaders.FormatDefaultSort(defaultSort);
            }

            // re-check: FormatDefaultSort may have changed the value
            if (!string.IsNullOrEmpty(defaultSort))
            {
                defaultSort += "\r\n";
            }

            return defaultSort + ListToString(categoryList);
        }