Beispiel #1
0
        /// <summary>
        /// Adds [[Category:XXXX births]], [[Category:XXXX deaths]] to articles about people where available, for en-wiki only
        /// When page is not mainspace, adds [[:Category rather than [[Category
        /// Removes Date of birth missing/Date of birth missing (living people) category if full DOB in {{birth date and age}}
        /// </summary>
        /// <param name="articleText">The wiki text of the article.</param>
        /// <param name="articleTitle">Title of the article</param>
        /// <param name="parseTalkPage"></param>
        /// <returns></returns>
        public static string FixPeopleCategories(string articleText, string articleTitle, bool parseTalkPage)
        {
            if (!Variables.LangCode.Equals("en"))
            {
                return(articleText);
            }

            // Performance: apply births/deaths category check on categories string, not whole article
            string cats = GetCats(articleText);

            bool dolmatch = WikiRegexes.DeathsOrLivingCategory.IsMatch(cats),
                 bimatch  = WikiRegexes.BirthsCategory.IsMatch(cats);

            // no work to do if already has a birth and a death/living cat
            if (dolmatch && bimatch)
            {
                return(YearOfBirthDeathMissingCategory(articleText, cats));
            }

            // over 20 references or long and not DOB/DOD categorised at all yet: implausible
            if ((articleText.Length > 15000 && !bimatch && !dolmatch) || (!dolmatch && WikiRegexes.Refs.Matches(articleText).Count > 20))
            {
                return(YearOfBirthDeathMissingCategory(articleText, cats));
            }

            string articleTextBefore = articleText;
            int    catCount          = WikiRegexes.Category.Matches(articleText).Count;

            // get the zeroth section (text upto first heading)
            string zerothSection = Tools.GetZerothSection(articleText);

            // remove references and long wikilinks (but allow an ISO date) that may contain false positives of birth/death date
            zerothSection = WikiRegexes.Refs.Replace(zerothSection, " ");
            while (LongWikilink.IsMatch(zerothSection))
            {
                zerothSection = LongWikilink.Replace(zerothSection, " ");
            }

            // ignore dates containing years from dated maintenance tags etc.
            zerothSection = WikiRegexes.NestedTemplates.Replace(zerothSection, m2 => ThreeOrFourDigitNumber.IsMatch(Tools.GetTemplateParameterValue(m2.Value, "date")) ? "" : m2.Value);
            zerothSection = WikiRegexes.TemplateMultiline.Replace(zerothSection, m2 => ThreeOrFourDigitNumber.IsMatch(Tools.GetTemplateParameterValue(m2.Value, "date")) ? "" : m2.Value);

            string StartCategory = Tools.Newline(@"[[" + (Namespace.IsMainSpace(articleTitle) ? "" : ":") + @"Category:");
            string yearstring, yearFromInfoBox = "", sort = GetCategorySort(articleText);

            bool alreadyUncertain = false;

            // scrape any infobox for birth year, ignore {{Birth date based on age at death}}
            string fromInfoBox = GetInfoBoxFieldValue(BirthDateBasedOnAgeAtDeath.Replace(zerothSection, ""), WikiRegexes.InfoBoxDOBFields);

            // ignore as of dates
            if (AsOfText.IsMatch(fromInfoBox))
            {
                fromInfoBox = fromInfoBox.Substring(0, AsOfText.Match(fromInfoBox).Index);
            }

            if (fromInfoBox.Length > 0 && !UncertainWordings.IsMatch(fromInfoBox))
            {
                yearFromInfoBox = YearPossiblyWithBC.Match(fromInfoBox).Value;
            }

            // convert [[:Category to [[Category for non-mainspace Category checking
            string checkText = Namespace.IsMainSpace(articleTitle) ? articleText : articleText.Replace("[[:", "[[");

            // birth
            if (!WikiRegexes.BirthsCategory.IsMatch(checkText) && (PersonYearOfBirth.Matches(zerothSection).Count == 1 ||
                                                                   WikiRegexes.DateBirthAndAge.IsMatch(zerothSection) ||
                                                                   WikiRegexes.DeathDateAndAge.IsMatch(zerothSection) ||
                                                                   ThreeOrFourDigitNumber.IsMatch(yearFromInfoBox)))
            {
                // look for '{{birth date...' template first
                yearstring = WikiRegexes.DateBirthAndAge.Match(articleText).Groups[1].Value;

                // look for '{{death date and age' template second
                if (String.IsNullOrEmpty(yearstring))
                {
                    yearstring = WikiRegexes.DeathDateAndAge.Match(articleText).Groups[2].Value;
                }

                // thirdly use yearFromInfoBox
                if (ThreeOrFourDigitNumber.IsMatch(yearFromInfoBox))
                {
                    yearstring = yearFromInfoBox;
                }

                // look for '(born xxxx)'
                if (String.IsNullOrEmpty(yearstring))
                {
                    Match m = PersonYearOfBirth.Match(zerothSection);

                    // remove part beyond dash or died
                    string birthpart = DiedOrBaptised.Replace(m.Value, "$1");

                    if (WikiRegexes.CircaTemplate.IsMatch(birthpart))
                    {
                        alreadyUncertain = true;
                    }

                    birthpart = WikiRegexes.TemplateMultiline.Replace(birthpart, " ");

                    // check born info before any untemplated died info
                    if (!(m.Index > PersonYearOfDeath.Match(zerothSection).Index) || !PersonYearOfDeath.IsMatch(zerothSection))
                    {
                        // when there's only an approximate birth year, add the appropriate cat rather than the xxxx birth one
                        if (UncertainWordings.IsMatch(birthpart) || alreadyUncertain || FloruitTemplate.IsMatch(birthpart))
                        {
                            if (!CategoryMatch(articleText, YearOfBirthMissingLivingPeople) && !CategoryMatch(articleText, YearOfBirthUncertain))
                            {
                                articleText += StartCategory + YearOfBirthUncertain + CatEnd(sort);
                            }
                        }
                        else // after removing dashes, birthpart must still contain year
                        if (!birthpart.Contains(@"?") && Regex.IsMatch(birthpart, @"\d{3,4}"))
                        {
                            yearstring = m.Groups[1].Value;
                        }
                    }
                }

                // per [[:Category:Living people]], don't apply birth category if born > 121 years ago
                // validate a YYYY date is not in the future
                if (!string.IsNullOrEmpty(yearstring) && yearstring.Length > 2 &&
                    (!YearOnly.IsMatch(yearstring) || Convert.ToInt32(yearstring) <= DateTime.Now.Year) &&
                    !(articleText.Contains(CategoryLivingPeople) && Convert.ToInt32(yearstring) < (DateTime.Now.Year - 121)))
                {
                    articleText += StartCategory + yearstring + " births" + CatEnd(sort);
                }
            }

            // scrape any infobox
            yearFromInfoBox = "";
            fromInfoBox     = GetInfoBoxFieldValue(articleText, WikiRegexes.InfoBoxDODFields);

            if (fromInfoBox.Length > 0 && !UncertainWordings.IsMatch(fromInfoBox))
            {
                yearFromInfoBox = YearPossiblyWithBC.Match(fromInfoBox).Value;
            }

            checkText = Namespace.IsMainSpace(articleTitle) ? articleText : articleText.Replace("[[:", "[[");
            if (!WikiRegexes.DeathsOrLivingCategory.IsMatch(RemoveCategory(YearofDeathMissing, checkText)) && (PersonYearOfDeath.IsMatch(zerothSection) || WikiRegexes.DeathDate.IsMatch(zerothSection) ||
                                                                                                               ThreeOrFourDigitNumber.IsMatch(yearFromInfoBox)))
            {
                // look for '{{death date...' template first
                yearstring = WikiRegexes.DeathDate.Match(articleText).Groups[1].Value;

                // secondly use yearFromInfoBox
                if (ThreeOrFourDigitNumber.IsMatch(yearFromInfoBox))
                {
                    yearstring = yearFromInfoBox;
                }

                // look for '(died xxxx)'
                if (string.IsNullOrEmpty(yearstring))
                {
                    Match m = PersonYearOfDeath.Match(zerothSection);

                    // check died info after any untemplated born info
                    if (m.Index >= PersonYearOfBirth.Match(zerothSection).Index || !PersonYearOfBirth.IsMatch(zerothSection))
                    {
                        if (!UncertainWordings.IsMatch(m.Value) && !m.Value.Contains(@"?"))
                        {
                            yearstring = m.Groups[1].Value;
                        }
                    }
                }

                // validate a YYYY date is not in the future
                if (!string.IsNullOrEmpty(yearstring) && yearstring.Length > 2 &&
                    (!YearOnly.IsMatch(yearstring) || Convert.ToInt32(yearstring) <= DateTime.Now.Year))
                {
                    articleText += StartCategory + yearstring + " deaths" + CatEnd(sort);
                }
            }

            zerothSection = NotCircaTemplate.Replace(zerothSection, " ");
            // birth and death combined
            // if not fully categorised, check it
            if (PersonYearOfBirthAndDeath.IsMatch(zerothSection) && (!WikiRegexes.BirthsCategory.IsMatch(articleText) || !WikiRegexes.DeathsOrLivingCategory.IsMatch(articleText)))
            {
                Match m = PersonYearOfBirthAndDeath.Match(zerothSection);

                string birthyear    = m.Groups[1].Value;
                int    birthyearint = int.Parse(birthyear);

                string deathyear    = m.Groups[3].Value;
                int    deathyearint = int.Parse(deathyear);

                // logical valdiation of dates
                if (birthyearint <= deathyearint && (deathyearint - birthyearint) <= 125)
                {
                    string birthpart = zerothSection.Substring(m.Index, m.Groups[2].Index - m.Index),
                           deathpart = zerothSection.Substring(m.Groups[2].Index, (m.Value.Length + m.Index) - m.Groups[2].Index);

                    if (!WikiRegexes.BirthsCategory.IsMatch(articleText))
                    {
                        if (!UncertainWordings.IsMatch(birthpart) && !ReignedRuledUnsure.IsMatch(m.Value) && !Regex.IsMatch(birthpart, @"(?:[Dd](?:ied|\.)|baptised)") &&
                            !FloruitTemplate.IsMatch(birthpart))
                        {
                            articleText += StartCategory + birthyear + @" births" + CatEnd(sort);
                        }
                        else
                        if (UncertainWordings.IsMatch(birthpart) && !CategoryMatch(articleText, YearOfBirthMissingLivingPeople) && !CategoryMatch(articleText, YearOfBirthUncertain))
                        {
                            articleText += StartCategory + YearOfBirthUncertain + CatEnd(sort);
                        }
                    }

                    if (!UncertainWordings.IsMatch(deathpart) && !ReignedRuledUnsure.IsMatch(m.Value) && !Regex.IsMatch(deathpart, @"[Bb](?:orn|\.)") && !Regex.IsMatch(birthpart, @"[Dd](?:ied|\.)") &&
                        (!WikiRegexes.DeathsOrLivingCategory.IsMatch(articleText) || CategoryMatch(articleText, YearofDeathMissing)))
                    {
                        articleText += StartCategory + deathyear + @" deaths" + CatEnd(sort);
                    }
                }
            }

            // do this check last as IsArticleAboutAPerson can be relatively slow
            if (!articleText.Equals(articleTextBefore) && !IsArticleAboutAPerson(articleTextBefore, articleTitle, parseTalkPage))
            {
                return(YearOfBirthDeathMissingCategory(articleTextBefore, cats));
            }

            // {{uncat}} --> {{Improve categories}} if we've added cats
            if (WikiRegexes.Category.Matches(articleText).Count > catCount && WikiRegexes.Uncat.IsMatch(articleText) &&
                !WikiRegexes.CatImprove.IsMatch(articleText))
            {
                articleText = Tools.RenameTemplate(articleText, WikiRegexes.Uncat.Match(articleText).Groups[1].Value, "Improve categories");
            }

            return(YearOfBirthDeathMissingCategory(articleText, GetCats(articleText)));
        }
Beispiel #2
0
        /// <summary>
        /// Sets persondata date of birth/death fields based on unformatted info in zeroth section of article, provided dates match existing birth/death categories
        /// </summary>
        /// <param name="personData">Persondata template call</param>
        /// <param name="articletext">The article text</param>
        /// <returns>The updated persondata template call</returns>
        private static string CompletePersonDataDate(string personData, string articletext)
        {
            // get the existing values
            string existingBirthYear = Tools.GetTemplateParameterValue(personData, "DATE OF BIRTH", true);
            string existingDeathYear = Tools.GetTemplateParameterValue(personData, "DATE OF DEATH", true);

            if (existingBirthYear.Length == 4 || existingDeathYear.Length == 4)
            {
                Parsers p = new Parsers();
                string  birthDateFound = "", deathDateFound = "";
                string  zerothSection = Tools.GetZerothSection(articletext);

                // remove references, wikilinks, templates
                zerothSection = WikiRegexes.Refs.Replace(zerothSection, " ");
                zerothSection = WikiRegexes.SimpleWikiLink.Replace(zerothSection, " ");

                if (WikiRegexes.CircaTemplate.IsMatch(zerothSection))
                {
                    zerothSection = zerothSection.Substring(0, WikiRegexes.CircaTemplate.Match(zerothSection).Index);
                }

                zerothSection = Tools.NestedTemplateRegex("ndash").Replace(zerothSection, " &ndash;");
                zerothSection = WikiRegexes.NestedTemplates.Replace(zerothSection, " ");
                // clean up any format errors in birth/death dates we may want to use
                zerothSection = p.FixDatesAInternal(zerothSection);

                // look for date in bracketed text, check date matches existing value (from categories)
                foreach (Match m in BracketedBirthDeathDate.Matches(zerothSection))
                {
                    string bValue = m.Value;

                    if (!UncertainWordings.IsMatch(bValue) && !ReignedRuledUnsure.IsMatch(bValue) && !FloruitTemplate.IsMatch(bValue))
                    {
                        string bBorn, bDied = "";
                        // split on died/spaced dash
                        if (FreeFormatDied.IsMatch(bValue))
                        {
                            bBorn = bValue.Substring(0, FreeFormatDied.Match(bValue).Index);
                            bDied = bValue.Substring(FreeFormatDied.Match(bValue).Index);
                        }
                        else
                        {
                            bBorn = bValue;
                        }

                        // born
                        if (existingBirthYear.Length == 4)
                        {
                            if (WikiRegexes.AmericanDates.Matches(bBorn).Count == 1 && WikiRegexes.AmericanDates.Match(bBorn).Value.Contains(existingBirthYear))
                            {
                                birthDateFound = WikiRegexes.AmericanDates.Match(bBorn).Value;
                            }
                            else if (WikiRegexes.InternationalDates.Matches(bBorn).Count == 1 && WikiRegexes.InternationalDates.Match(bBorn).Value.Contains(existingBirthYear))
                            {
                                birthDateFound = WikiRegexes.InternationalDates.Match(bBorn).Value;
                            }
                        }

                        // died
                        if (existingDeathYear.Length == 4)
                        {
                            if (WikiRegexes.AmericanDates.Matches(bDied).Count == 1 && WikiRegexes.AmericanDates.Match(bDied).Value.Contains(existingDeathYear))
                            {
                                deathDateFound = WikiRegexes.AmericanDates.Match(bDied).Value;
                            }
                            else if (WikiRegexes.InternationalDates.Matches(bDied).Count == 1 && WikiRegexes.InternationalDates.Match(bDied).Value.Contains(existingDeathYear))
                            {
                                deathDateFound = WikiRegexes.InternationalDates.Match(bDied).Value;
                            }
                        }

                        if (birthDateFound.Length > 0 || deathDateFound.Length > 0)
                        {
                            break;
                        }
                    }
                }

                if (birthDateFound.Length > 4)
                {
                    personData = Tools.SetTemplateParameterValue(personData, "DATE OF BIRTH", Tools.ConvertDate(birthDateFound, DeterminePredominantDateLocale(articletext, true)), false);
                }

                if (deathDateFound.Length > 4)
                {
                    personData = Tools.SetTemplateParameterValue(personData, "DATE OF DEATH", Tools.ConvertDate(deathDateFound, DeterminePredominantDateLocale(articletext, true)), false);
                }
            }

            return(personData);
        }