Exemple #1
0
        /// <summary>
        /// Try to recover a sequence framed by quotes or brackets. Nesting is supported, as is a
        /// missing closing element, etc.
        /// </summary>
        /// <param name="t">start token (must be able to open a sequence)</param>
        /// <param name="attrs">parsing options</param>
        /// <param name="maxTokens">maximum number of tokens to scan (in case the closing quote was forgotten)</param>
        /// <returns>a BracketSequenceToken metatoken, or null when no sequence could be built</returns>
        public static BracketSequenceToken TryParse(Pullenti.Ner.Token t, BracketParseAttr attrs = BracketParseAttr.No, int maxTokens = 100)
        {
            Pullenti.Ner.Token t0 = t;

            if (!CanBeStartOfSequence(t0, false, false))
            {
                return null;
            }
            // Every bracket/quote token met during the scan; element 0 is always the opening token.
            List<Bracket> brList = new List<Bracket>();
            brList.Add(new Bracket(t0));

            int cou = 0;   // non-bracket tokens visited so far (bounded by maxTokens)
            int crlf = 0;  // line breaks crossed so far
            Pullenti.Ner.Token last = null;
            int lev = 1;   // nesting depth, tracked only for asymmetric opening characters
            bool isAssim = brList[0].Char != '«' && m_AssymOPenChars.IndexOf(brList[0].Char) >= 0;
            bool genCase = false;

            // Phase 1: scan forward, collecting bracket tokens until a stop condition fires.
            for (t = t0.Next; t != null; t = t.Next)
            {
                if (t.IsTableControlChar)
                {
                    break;
                }
                last = t;
                if (t.IsCharOf(m_OpenChars) || t.IsCharOf(m_CloseChars))
                {
                    if (t.IsNewlineBefore && ((attrs & BracketParseAttr.CanBeManyLines)) == BracketParseAttr.No)
                    {
                        if (t.WhitespacesBeforeCount > 10 || CanBeStartOfSequence(t, false, false))
                        {
                            // A '(' opening a new line is tolerated unless the sequence itself started with '('.
                            if (!(t.IsChar('(') && !t0.IsChar('(')))
                            {
                                last = t.Previous;
                                break;
                            }
                        }
                    }
                    Bracket bb = new Bracket(t);
                    brList.Add(bb);
                    if (brList.Count > 20)
                    {
                        break;
                    }
                    if ((brList.Count == 3 && brList[1].CanBeOpen && bb.CanBeClose) && MustBeCloseChar(bb.Char, brList[1].Char) && MustBeCloseChar(bb.Char, brList[0].Char))
                    {
                        // The third bracket closes both the inner and the outer opener. Look ahead on the
                        // same line to decide whether scanning should continue past it.
                        bool ok = false;
                        for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
                        {
                            if (tt.IsNewlineBefore)
                            {
                                break;
                            }
                            if (tt.IsChar(','))
                            {
                                break;
                            }
                            if (tt.IsChar('.'))
                            {
                                for (tt = tt.Next; tt != null; tt = tt.Next)
                                {
                                    if (tt.IsNewlineBefore)
                                    {
                                        break;
                                    }
                                    else if (tt.IsCharOf(m_OpenChars) || tt.IsCharOf(m_CloseChars))
                                    {
                                        Bracket bb2 = new Bracket(tt);
                                        if (BracketHelper.CanBeEndOfSequence(tt, false, null, false) && CanBeCloseChar(bb2.Char, brList[0].Char))
                                        {
                                            ok = true;
                                        }
                                        break;
                                    }
                                }
                                break;
                            }
                            // Fixed: the original tested the loop-invariant 't' here, which is always a
                            // bracket at this point, so the look-ahead never actually searched for one.
                            if (tt.IsCharOf(m_OpenChars) || tt.IsCharOf(m_CloseChars))
                            {
                                ok = true;
                                break;
                            }
                        }
                        if (!ok)
                        {
                            break;
                        }
                    }
                    if (isAssim)
                    {
                        if (bb.CanBeOpen && !bb.CanBeClose && bb.Char == brList[0].Char)
                        {
                            lev++;
                        }
                        else if (bb.CanBeClose && !bb.CanBeOpen && m_OpenChars.IndexOf(brList[0].Char) == m_CloseChars.IndexOf(bb.Char))
                        {
                            lev--;
                            if (lev == 0)
                            {
                                break;
                            }
                        }
                    }
                }
                else
                {
                    if ((++cou) > maxTokens)
                    {
                        break;
                    }
                    if (((attrs & BracketParseAttr.CanContainsVerbs)) == BracketParseAttr.No)
                    {
                        if (t.Morph.Language.IsCyrillic)
                        {
                            if (t.GetMorphClassInDictionary() == Pullenti.Morph.MorphClass.Verb)
                            {
                                if (!t.Morph.Class.IsAdjective && !t.Morph.ContainsAttr("страд.з.", null))
                                {
                                    if (t.Chars.IsAllLower)
                                    {
                                        string norm = t.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
                                        if (!Pullenti.Morph.LanguageHelper.EndsWith(norm, "СЯ"))
                                        {
                                            // A lower-case verb stops the scan unless this is still the
                                            // first bracket and that bracket is '('.
                                            if (brList.Count > 1)
                                            {
                                                break;
                                            }
                                            if (brList[0].Char != '(')
                                            {
                                                break;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        else if (t.Morph.Language.IsEn)
                        {
                            if (t.Morph.Class == Pullenti.Morph.MorphClass.Verb && t.Chars.IsAllLower)
                            {
                                break;
                            }
                        }
                        Pullenti.Ner.Referent r = t.GetReferent();
                        if (r != null && r.TypeName == "ADDRESS")
                        {
                            if (!t0.IsChar('('))
                            {
                                break;
                            }
                        }
                    }
                }
                if (((attrs & BracketParseAttr.CanBeManyLines)) != BracketParseAttr.No)
                {
                    // Multi-line mode: only a blank line (more than one newline) stops the scan.
                    if (t.IsNewlineBefore)
                    {
                        if (t.NewlinesBeforeCount > 1)
                        {
                            break;
                        }
                        crlf++;
                    }
                    continue;
                }
                if (t.IsNewlineBefore)
                {
                    if (t.WhitespacesBeforeCount > 15)
                    {
                        last = t.Previous;
                        break;
                    }
                    crlf++;
                    if (!t.Chars.IsAllLower)
                    {
                        if (MiscHelper.CanBeStartOfSentence(t))
                        {
                            // A new sentence on a new line is accepted only when a closing character
                            // with no whitespace before it follows on that same line.
                            bool has = false;
                            for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
                            {
                                if (tt.IsNewlineBefore)
                                {
                                    break;
                                }
                                else if (tt.LengthChar == 1 && tt.IsCharOf(m_OpenChars) && tt.IsWhitespaceBefore)
                                {
                                    break;
                                }
                                else if (tt.LengthChar == 1 && tt.IsCharOf(m_CloseChars) && !tt.IsWhitespaceBefore)
                                {
                                    has = true;
                                    break;
                                }
                            }
                            if (!has)
                            {
                                last = t.Previous;
                                break;
                            }
                        }
                    }
                    if ((t.Previous is Pullenti.Ner.MetaToken) && CanBeEndOfSequence((t.Previous as Pullenti.Ner.MetaToken).EndToken, false, null, false))
                    {
                        last = t.Previous;
                        break;
                    }
                }
                if (crlf > 1)
                {
                    if (brList.Count > 1)
                    {
                        break;
                    }
                    if (crlf > 10)
                    {
                        break;
                    }
                }
                if (t.IsChar(';') && t.IsNewlineAfter)
                {
                    break;
                }
                // Skip over a whole noun phrase in one step.
                NounPhraseToken npt = NounPhraseHelper.TryParse(t, NounPhraseParseAttr.No, 0, null);
                if (npt != null)
                {
                    if (t.IsNewlineBefore)
                    {
                        genCase = npt.Morph.Case.IsGenitive;
                    }
                    last = (t = npt.EndToken);
                }
            }

            // Phase 2: only the opening bracket was found - try to end the sequence at the last token.
            if ((brList.Count == 1 && brList[0].CanBeOpen && (last is Pullenti.Ner.MetaToken)) && last.IsNewlineAfter)
            {
                if (BracketHelper.CanBeEndOfSequence((last as Pullenti.Ner.MetaToken).EndToken, false, null, false))
                {
                    return new BracketSequenceToken(t0, last);
                }
            }
            // genCase can only be true after 'last' has been assigned, so 'last' is not null here.
            if ((brList.Count == 1 && brList[0].CanBeOpen && genCase) && last.IsNewlineAfter && crlf <= 2)
            {
                return new BracketSequenceToken(t0, last);
            }

            // Phase 3: normalise the collected bracket list.
            // An adjacent '<' '>' pair is treated as an open/close pair.
            for (int i = 1; i < (brList.Count - 1); i++)
            {
                if (brList[i].Char == '<' && brList[i + 1].Char == '>')
                {
                    brList[i].CanBeOpen = true;
                    brList[i + 1].CanBeClose = true;
                }
            }
            List<BracketSequenceToken> internals = null;

            // Drop trailing open/close pairs that match each other but cannot close the first bracket.
            while (brList.Count > 3)
            {
                int i = brList.Count - 1;
                if ((brList[i].CanBeClose && brList[i - 1].CanBeOpen && !CanBeCloseChar(brList[i].Char, brList[0].Char)) && CanBeCloseChar(brList[i].Char, brList[i - 1].Char))
                {
                    brList.RemoveRange(brList.Count - 2, 2);
                    continue;
                }
                break;
            }
            // Repeatedly fold unambiguous inner open/close pairs into 'internals'.
            while (brList.Count >= 4)
            {
                bool changed = false;
                for (int i = 1; i < (brList.Count - 2); i++)
                {
                    if ((brList[i].CanBeOpen && !brList[i].CanBeClose && brList[i + 1].CanBeClose) && !brList[i + 1].CanBeOpen)
                    {
                        bool ok = false;
                        if (MustBeCloseChar(brList[i + 1].Char, brList[i].Char) || brList[i].Char != brList[0].Char)
                        {
                            ok = true;
                            if ((i == 1 && ((i + 2) < brList.Count) && brList[i + 2].Char == ')') && brList[i + 1].Char != ')' && CanBeCloseChar(brList[i + 1].Char, brList[i - 1].Char))
                            {
                                brList[i + 2] = brList[i + 1];
                            }
                        }
                        else if (i > 1 && ((i + 2) < brList.Count) && MustBeCloseChar(brList[i + 2].Char, brList[i - 1].Char))
                        {
                            ok = true;
                        }
                        if (ok)
                        {
                            if (internals == null)
                            {
                                internals = new List<BracketSequenceToken>();
                            }
                            internals.Add(new BracketSequenceToken(brList[i].Source, brList[i + 1].Source));
                            brList.RemoveRange(i, 2);
                            changed = true;
                            break;
                        }
                    }
                }
                if (!changed)
                {
                    break;
                }
            }

            // Phase 4: build the result from the remaining brackets.
            BracketSequenceToken res = null;
            // Set when 'internals' has already been attached to 'res', to avoid adding them twice below.
            bool internalsAttached = false;

            if ((brList.Count >= 4 && brList[1].CanBeOpen && brList[2].CanBeClose) && brList[3].CanBeClose && !brList[3].CanBeOpen)
            {
                if (CanBeCloseChar(brList[3].Char, brList[0].Char))
                {
                    res = new BracketSequenceToken(brList[0].Source, brList[3].Source);
                    if (brList[0].Source.Next != brList[1].Source || brList[2].Source.Next != brList[3].Source)
                    {
                        res.Internal.Add(new BracketSequenceToken(brList[1].Source, brList[2].Source));
                    }
                    if (internals != null)
                    {
                        res.Internal.AddRange(internals);
                        internalsAttached = true;
                    }
                }
            }
            if ((res == null && brList.Count >= 3 && brList[2].CanBeClose) && !brList[2].CanBeOpen)
            {
                if (((attrs & BracketParseAttr.NearCloseBracket)) != BracketParseAttr.No)
                {
                    if (CanBeCloseChar(brList[1].Char, brList[0].Char))
                    {
                        return new BracketSequenceToken(brList[0].Source, brList[1].Source);
                    }
                }
                bool ok = true;
                if (CanBeCloseChar(brList[2].Char, brList[0].Char) && CanBeCloseChar(brList[1].Char, brList[0].Char) && brList[1].CanBeClose)
                {
                    // Both the second and the third bracket could close the first one: inspect the text
                    // between them to decide which one really does.
                    for (t = brList[1].Source; t != brList[2].Source && t != null; t = t.Next)
                    {
                        if (t.IsNewlineBefore)
                        {
                            ok = false;
                            break;
                        }
                        if (t.Chars.IsLetter && t.Chars.IsAllLower)
                        {
                            ok = false;
                            break;
                        }
                        NounPhraseToken npt = NounPhraseHelper.TryParse(t, NounPhraseParseAttr.No, 0, null);
                        if (npt != null)
                        {
                            t = npt.EndToken;
                        }
                    }
                    if (ok)
                    {
                        for (t = brList[0].Source.Next; t != brList[1].Source && t != null; t = t.Next)
                        {
                            if (t.IsNewlineBefore)
                            {
                                return new BracketSequenceToken(brList[0].Source, t.Previous);
                            }
                        }
                    }
                    // Walk backwards on the current line counting single-character bracket tokens.
                    int lev1 = 0;
                    for (Pullenti.Ner.Token tt = brList[0].Source.Previous; tt != null; tt = tt.Previous)
                    {
                        if (tt.IsNewlineAfter || tt.IsTableControlChar)
                        {
                            break;
                        }
                        if (!(tt is Pullenti.Ner.TextToken))
                        {
                            continue;
                        }
                        if (tt.Chars.IsLetter || tt.LengthChar > 1)
                        {
                            continue;
                        }
                        char ch = (tt as Pullenti.Ner.TextToken).Term[0];
                        if (CanBeCloseChar(ch, brList[0].Char))
                        {
                            lev1++;
                        }
                        else if (CanBeCloseChar(brList[1].Char, ch))
                        {
                            lev1--;
                            if (lev1 < 0)
                            {
                                return new BracketSequenceToken(brList[0].Source, brList[1].Source);
                            }
                        }
                    }
                }
                if (ok && CanBeCloseChar(brList[2].Char, brList[0].Char))
                {
                    BracketSequenceToken intern = new BracketSequenceToken(brList[1].Source, brList[2].Source);
                    res = new BracketSequenceToken(brList[0].Source, brList[2].Source);
                    res.Internal.Add(intern);
                }
                else if (ok && CanBeCloseChar(brList[2].Char, brList[1].Char) && brList[0].CanBeOpen)
                {
                    if (CanBeCloseChar(brList[2].Char, brList[0].Char))
                    {
                        BracketSequenceToken intern = new BracketSequenceToken(brList[1].Source, brList[2].Source);
                        res = new BracketSequenceToken(brList[0].Source, brList[2].Source);
                        res.Internal.Add(intern);
                    }
                    else if (brList.Count == 3)
                    {
                        return null;
                    }
                }
            }
            if (res == null && brList.Count > 1 && brList[1].CanBeClose)
            {
                res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
            }
            if (res == null && brList.Count > 1 && CanBeCloseChar(brList[1].Char, brList[0].Char))
            {
                res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
            }
            if (res == null && brList.Count == 2 && brList[0].Char == brList[1].Char)
            {
                res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
            }
            if (res != null && internals != null && !internalsAttached)
            {
                // Attach only the folded pairs that begin before the end of the result.
                foreach (BracketSequenceToken inner in internals)
                {
                    if (inner.BeginChar < res.EndChar)
                    {
                        res.Internal.Add(inner);
                    }
                }
            }

            // Phase 5: fallback - look for a metatoken whose last token is a suitable closing bracket.
            if (res == null)
            {
                cou = 0;
                for (Pullenti.Ner.Token tt = t0.Next; tt != null; tt = tt.Next, cou++)
                {
                    if (tt.IsTableControlChar)
                    {
                        break;
                    }
                    if (MiscHelper.CanBeStartOfSentence(tt))
                    {
                        break;
                    }
                    if (maxTokens > 0 && cou > maxTokens)
                    {
                        break;
                    }
                    Pullenti.Ner.MetaToken mt = tt as Pullenti.Ner.MetaToken;
                    if (mt == null)
                    {
                        continue;
                    }
                    if (mt.EndToken is Pullenti.Ner.TextToken)
                    {
                        if ((mt.EndToken as Pullenti.Ner.TextToken).IsCharOf(m_CloseChars))
                        {
                            Bracket bb = new Bracket(mt.EndToken as Pullenti.Ner.TextToken);
                            if (bb.CanBeClose && CanBeCloseChar(bb.Char, brList[0].Char))
                            {
                                return new BracketSequenceToken(t0, tt);
                            }
                        }
                    }
                }
            }
            return res;
        }
Exemple #2
0
        /// <summary>
        /// Builds a name string from the tokens between <paramref name="begin"/> and
        /// <paramref name="end"/> (inclusive). Text tokens may be replaced by the normal case of a
        /// word form that matches the requested class, case and gender; number tokens and
        /// metatokens are rendered as well (metatokens recursively).
        /// </summary>
        /// <param name="begin">first token of the range</param>
        /// <param name="end">last token of the range</param>
        /// <param name="cla">morphological class filter for word forms (a zero Value means no filter)</param>
        /// <param name="mc">case filter for word forms (undefined means no filter)</param>
        /// <param name="gender">gender filter for word forms; if no form matches it, the search is repeated without it</param>
        /// <param name="ignoreBracketsAndHiphens">skip bracket tokens and whitespace-separated hyphens; a parsed (, &lt; or [ sequence is rendered from its inner tokens</param>
        /// <param name="ignoreGeoReferent">skip metatokens (other than the first token) whose referent type is "GEO"</param>
        /// <returns>the assembled text, or null when an argument is null, the range is inverted, or nothing was collected</returns>
        public static string GetNameEx(Pullenti.Ner.Token begin, Pullenti.Ner.Token end, Pullenti.Morph.MorphClass cla, Pullenti.Morph.MorphCase mc, Pullenti.Morph.MorphGender gender = Pullenti.Morph.MorphGender.Undefined, bool ignoreBracketsAndHiphens = false, bool ignoreGeoReferent = false)
        {
            if (begin == null || end == null)
            {
                return null;
            }
            if (begin != end && begin.EndChar > end.BeginChar)
            {
                return null;
            }

            StringBuilder text = new StringBuilder();
            // Accumulates the left parts of hyphenated words until the final part is reached.
            string pendingPrefix = null;

            for (Pullenti.Ner.Token cur = begin; cur != null && cur.EndChar <= end.EndChar; cur = cur.Next)
            {
                // Stop collecting once the buffer has grown past 1000 characters.
                if (text.Length > 1000)
                {
                    break;
                }
                if (cur.IsTableControlChar)
                {
                    continue;
                }

                if (ignoreBracketsAndHiphens)
                {
                    if (BracketHelper.IsBracket(cur, false))
                    {
                        if (cur == end)
                        {
                            break;
                        }
                        if (cur.IsCharOf("(<["))
                        {
                            BracketSequenceToken seq = BracketHelper.TryParse(cur, BracketParseAttr.No, 100);
                            if (seq != null && seq.EndChar <= end.EndChar)
                            {
                                string inner = GetNameEx(seq.BeginToken.Next, seq.EndToken.Previous, Pullenti.Morph.MorphClass.Undefined, Pullenti.Morph.MorphCase.Undefined, Pullenti.Morph.MorphGender.Undefined, ignoreBracketsAndHiphens, false);
                                if (inner != null)
                                {
                                    // A trailing bracket pair holding a single non-letter,
                                    // non-referent token is dropped from the output.
                                    bool dropSequence = seq.EndChar == end.EndChar
                                        && seq.BeginToken.Next == seq.EndToken.Previous
                                        && !seq.BeginToken.Next.Chars.IsLetter
                                        && !(seq.BeginToken.Next is Pullenti.Ner.ReferentToken);
                                    if (!dropSequence)
                                    {
                                        text.AppendFormat(" {0}{1}{2}", cur.GetSourceText(), inner, seq.EndToken.GetSourceText());
                                    }
                                }
                                // Resume after the closing bracket.
                                cur = seq.EndToken;
                            }
                        }
                        continue;
                    }
                    if (cur.IsHiphen)
                    {
                        if (cur == end)
                        {
                            break;
                        }
                        if (cur.IsWhitespaceBefore || cur.IsWhitespaceAfter)
                        {
                            continue;
                        }
                    }
                }

                Pullenti.Ner.TextToken word = cur as Pullenti.Ner.TextToken;
                if (word != null)
                {
                    // "word-word" (hyphen followed by another text token): remember the left part
                    // and jump to the hyphen; the loop increment then moves on to the right part.
                    bool isHyphenatedHead = !ignoreBracketsAndHiphens
                        && word.Next != null
                        && word.Next.IsHiphen
                        && (word.Next.Next is Pullenti.Ner.TextToken)
                        && word != end
                        && word.Next != end;
                    if (isHyphenatedHead)
                    {
                        pendingPrefix = pendingPrefix == null
                            ? word.Term
                            : string.Format("{0}-{1}", pendingPrefix, word.Term);
                        cur = word.Next;
                        if (cur == end)
                        {
                            break;
                        }
                        continue;
                    }

                    string piece = null;
                    if (cla.Value != 0 || !mc.IsUndefined || gender != Pullenti.Morph.MorphGender.Undefined)
                    {
                        // Pass 0 applies every requested filter. Pass 1 runs only when a gender
                        // was requested and pass 0 found nothing; it retries without the gender filter.
                        for (int pass = 0; pass < 2 && piece == null; pass++)
                        {
                            bool useGender = pass == 0;
                            if (!useGender && gender == Pullenti.Morph.MorphGender.Undefined)
                            {
                                break;
                            }
                            foreach (Pullenti.Morph.MorphBaseInfo item in word.Morph.Items)
                            {
                                Pullenti.Morph.MorphWordForm form = item as Pullenti.Morph.MorphWordForm;
                                if (form == null)
                                {
                                    continue;
                                }
                                if (cla.Value != 0 && (form.Class.Value & cla.Value) == 0)
                                {
                                    continue;
                                }
                                if (!mc.IsUndefined && (form.Case & mc).IsUndefined)
                                {
                                    continue;
                                }
                                if (useGender && gender != Pullenti.Morph.MorphGender.Undefined && (form.Gender & gender) == Pullenti.Morph.MorphGender.Undefined)
                                {
                                    continue;
                                }
                                // First match wins, unless a later form's normal case equals the term.
                                if (piece == null || form.NormalCase == word.Term)
                                {
                                    piece = form.NormalCase;
                                }
                            }
                        }
                    }

                    if (piece == null)
                    {
                        piece = word.Term;
                        if (word.Chars.IsLastLower && word.LengthChar > 2)
                        {
                            // Use the source text, cut right after its last upper-case character.
                            piece = word.GetSourceText();
                            int cut = piece.Length - 1;
                            while (cut >= 0 && !char.IsUpper(piece[cut]))
                            {
                                cut--;
                            }
                            if (cut >= 0)
                            {
                                piece = piece.Substring(0, cut + 1);
                            }
                        }
                    }

                    if (pendingPrefix != null)
                    {
                        string delim = ignoreBracketsAndHiphens ? " " : "-";
                        piece = string.Format("{0}{1}{2}", pendingPrefix, delim, piece);
                    }
                    pendingPrefix = null;

                    if (text.Length > 0 && piece.Length > 0)
                    {
                        bool needSpace;
                        if (char.IsLetterOrDigit(piece[0]))
                        {
                            // No separator right after a hyphen already in the buffer.
                            needSpace = text[text.Length - 1] != '-';
                        }
                        else
                        {
                            needSpace = !ignoreBracketsAndHiphens && BracketHelper.CanBeStartOfSequence(word, false, false);
                        }
                        if (needSpace)
                        {
                            text.Append(' ');
                        }
                    }
                    text.Append(piece);
                }
                else if (cur is Pullenti.Ner.NumberToken)
                {
                    if (text.Length > 0 && (cur.IsWhitespaceBefore || text[text.Length - 1] != '-'))
                    {
                        text.Append(' ');
                    }
                    Pullenti.Ner.NumberToken num = cur as Pullenti.Ner.NumberToken;
                    bool isSingleWordAdjective = cur.Morph.Class.IsAdjective
                        && num.Typ == Pullenti.Ner.NumberSpellingType.Words
                        && num.BeginToken == num.EndToken
                        && (num.BeginToken is Pullenti.Ner.TextToken);
                    if (isSingleWordAdjective)
                    {
                        text.Append((num.BeginToken as Pullenti.Ner.TextToken).Term);
                    }
                    else
                    {
                        text.Append(num.Value);
                    }
                }
                else if (cur is Pullenti.Ner.MetaToken)
                {
                    if (ignoreGeoReferent && cur != begin && cur.GetReferent() != null && cur.GetReferent().TypeName == "GEO")
                    {
                        continue;
                    }
                    Pullenti.Ner.MetaToken meta = cur as Pullenti.Ner.MetaToken;
                    string nested = GetNameEx(meta.BeginToken, meta.EndToken, cla, mc, gender, ignoreBracketsAndHiphens, ignoreGeoReferent);
                    if (!string.IsNullOrEmpty(nested))
                    {
                        if (text.Length > 0 && (cur.IsWhitespaceBefore || text[text.Length - 1] != '-'))
                        {
                            text.Append(' ');
                        }
                        text.Append(nested);
                    }
                }

                if (cur == end)
                {
                    break;
                }
            }

            return text.Length == 0 ? null : text.ToString();
        }