/// <summary>
/// Tries to recover a sequence framed by quotes or brackets. Nesting is supported,
/// as is a missing closing element, etc.
/// </summary>
/// <param name="t">start token; null is returned unless it passes <c>CanBeStartOfSequence</c></param>
/// <param name="attrs">parsing options</param>
/// <param name="maxTokens">maximum number of non-bracket tokens to scan (in case the closing quote was forgotten)</param>
/// <returns>a BracketSequenceToken metatoken, or null when no sequence could be recovered</returns>
public static BracketSequenceToken TryParse(Pullenti.Ner.Token t, BracketParseAttr attrs = BracketParseAttr.No, int maxTokens = 100)
{
    Pullenti.Ner.Token t0 = t;
    if (!CanBeStartOfSequence(t0, false, false))
    {
        return null;
    }

    // brList collects every bracket-like token met during the scan; element 0 is always the opener t0.
    List<Bracket> brList = new List<Bracket>();
    brList.Add(new Bracket(t0));
    int cou = 0;   // number of non-bracket tokens seen so far
    int crlf = 0;  // number of tokens seen that start a new line
    Pullenti.Ner.Token last = null;
    int lev = 1;   // nesting level, tracked only when isAssim is true
    bool isAssim = brList[0].Char != '«' && m_AssymOPenChars.IndexOf(brList[0].Char) >= 0;
    bool genCase = false;

    // Phase 1: scan forward, collecting brackets and stopping on any of the heuristics below.
    for (t = t0.Next; t != null; t = t.Next)
    {
        if (t.IsTableControlChar)
        {
            break;
        }
        last = t;
        if (t.IsCharOf(m_OpenChars) || t.IsCharOf(m_CloseChars))
        {
            if (t.IsNewlineBefore && ((attrs & BracketParseAttr.CanBeManyLines)) == BracketParseAttr.No)
            {
                if (t.WhitespacesBeforeCount > 10 || CanBeStartOfSequence(t, false, false))
                {
                    // A '(' on a new line is tolerated only when the sequence itself does not start with '('.
                    if (!t.IsChar('(') || t0.IsChar('('))
                    {
                        last = t.Previous;
                        break;
                    }
                }
            }
            Bracket bb = new Bracket(t);
            brList.Add(bb);
            if (brList.Count > 20)
            {
                break;
            }
            if ((brList.Count == 3 && brList[1].CanBeOpen && bb.CanBeClose) && MustBeCloseChar(bb.Char, brList[1].Char) && MustBeCloseChar(bb.Char, brList[0].Char))
            {
                // Third bracket can close both earlier ones: look ahead on the same line
                // to decide whether scanning should continue.
                bool ok = false;
                for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
                {
                    if (tt.IsNewlineBefore)
                    {
                        break;
                    }
                    if (tt.IsChar(','))
                    {
                        break;
                    }
                    if (tt.IsChar('.'))
                    {
                        for (tt = tt.Next; tt != null; tt = tt.Next)
                        {
                            if (tt.IsNewlineBefore)
                            {
                                break;
                            }
                            else if (tt.IsCharOf(m_OpenChars) || tt.IsCharOf(m_CloseChars))
                            {
                                Bracket bb2 = new Bracket(tt);
                                if (BracketHelper.CanBeEndOfSequence(tt, false, null, false) && CanBeCloseChar(bb2.Char, brList[0].Char))
                                {
                                    ok = true;
                                }
                                break;
                            }
                        }
                        break;
                    }
                    // NOTE(review): this tests `t`, not `tt`. `t` is already known to be a bracket here,
                    // so the condition is always true and the loop accepts on the first token that is
                    // not a newline start, ',' or '.'. Possibly `tt` was intended — confirm before changing,
                    // since existing results depend on the current behaviour.
                    if (t.IsCharOf(m_OpenChars) || t.IsCharOf(m_CloseChars))
                    {
                        ok = true;
                        break;
                    }
                }
                if (!ok)
                {
                    break;
                }
            }
            if (isAssim)
            {
                if (bb.CanBeOpen && !bb.CanBeClose && bb.Char == brList[0].Char)
                {
                    lev++;
                }
                else if (bb.CanBeClose && !bb.CanBeOpen && m_OpenChars.IndexOf(brList[0].Char) == m_CloseChars.IndexOf(bb.Char))
                {
                    lev--;
                    if (lev == 0)
                    {
                        break;
                    }
                }
            }
        }
        else
        {
            if ((++cou) > maxTokens)
            {
                break;
            }
            if (((attrs & BracketParseAttr.CanContainsVerbs)) == BracketParseAttr.No)
            {
                if (t.Morph.Language.IsCyrillic)
                {
                    // Lower-case dictionary verb that is neither an adjective nor carries the "страд.з."
                    // attribute (presumably the passive-voice marker — TODO confirm).
                    if (t.GetMorphClassInDictionary() == Pullenti.Morph.MorphClass.Verb
                        && !t.Morph.Class.IsAdjective
                        && !t.Morph.ContainsAttr("страд.з.", null)
                        && t.Chars.IsAllLower)
                    {
                        string norm = t.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
                        if (!Pullenti.Morph.LanguageHelper.EndsWith(norm, "СЯ"))
                        {
                            // Such a verb is tolerated only directly inside a lone opening '('.
                            if (brList.Count > 1 || brList[0].Char != '(')
                            {
                                break;
                            }
                        }
                    }
                }
                else if (t.Morph.Language.IsEn)
                {
                    if (t.Morph.Class == Pullenti.Morph.MorphClass.Verb && t.Chars.IsAllLower)
                    {
                        break;
                    }
                }
                Pullenti.Ner.Referent r = t.GetReferent();
                if (r != null && r.TypeName == "ADDRESS")
                {
                    if (!t0.IsChar('('))
                    {
                        break;
                    }
                }
            }
        }

        if (((attrs & BracketParseAttr.CanBeManyLines)) != BracketParseAttr.No)
        {
            // Multi-line mode: only a blank line (more than one newline) stops the scan;
            // none of the single-line heuristics below are applied.
            if (t.IsNewlineBefore)
            {
                if (t.NewlinesBeforeCount > 1)
                {
                    break;
                }
                crlf++;
            }
            continue;
        }

        if (t.IsNewlineBefore)
        {
            if (t.WhitespacesBeforeCount > 15)
            {
                last = t.Previous;
                break;
            }
            crlf++;
            if (!t.Chars.IsAllLower)
            {
                if (MiscHelper.CanBeStartOfSentence(t))
                {
                    // A possible sentence start on a new line is accepted only if a closing
                    // bracket glued to the preceding text follows on the same line.
                    bool has = false;
                    for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
                    {
                        if (tt.IsNewlineBefore)
                        {
                            break;
                        }
                        else if (tt.LengthChar == 1 && tt.IsCharOf(m_OpenChars) && tt.IsWhitespaceBefore)
                        {
                            break;
                        }
                        else if (tt.LengthChar == 1 && tt.IsCharOf(m_CloseChars) && !tt.IsWhitespaceBefore)
                        {
                            has = true;
                            break;
                        }
                    }
                    if (!has)
                    {
                        last = t.Previous;
                        break;
                    }
                }
            }
            if ((t.Previous is Pullenti.Ner.MetaToken) && CanBeEndOfSequence((t.Previous as Pullenti.Ner.MetaToken).EndToken, false, null, false))
            {
                last = t.Previous;
                break;
            }
        }
        if (crlf > 1)
        {
            if (brList.Count > 1)
            {
                break;
            }
            if (crlf > 10)
            {
                break;
            }
        }
        if (t.IsChar(';') && t.IsNewlineAfter)
        {
            break;
        }
        NounPhraseToken npt = NounPhraseHelper.TryParse(t, NounPhraseParseAttr.No, 0, null);
        if (npt != null)
        {
            if (t.IsNewlineBefore)
            {
                genCase = npt.Morph.Case.IsGenitive;
            }
            // Skip the whole noun phrase in one step.
            t = npt.EndToken;
            last = t;
        }
    }

    // Phase 2: only the opener was found — accept two "missing closer" situations at a line end.
    if ((brList.Count == 1 && brList[0].CanBeOpen && (last is Pullenti.Ner.MetaToken)) && last.IsNewlineAfter)
    {
        if (BracketHelper.CanBeEndOfSequence((last as Pullenti.Ner.MetaToken).EndToken, false, null, false))
        {
            return new BracketSequenceToken(t0, last);
        }
    }
    if ((brList.Count == 1 && brList[0].CanBeOpen && genCase) && last.IsNewlineAfter && crlf <= 2)
    {
        return new BracketSequenceToken(t0, last);
    }

    // Treat an adjacent '<' '>' pair (not involving the opener) as open/close.
    for (int i = 1; i < (brList.Count - 1); i++)
    {
        if (brList[i].Char == '<' && brList[i + 1].Char == '>')
        {
            brList[i].CanBeOpen = true;
            brList[i + 1].CanBeClose = true;
        }
    }

    List<BracketSequenceToken> internals = null;

    // Drop trailing open/close pairs whose closer cannot close the opener.
    while (brList.Count > 3)
    {
        int i = brList.Count - 1;
        if ((brList[i].CanBeClose && brList[i - 1].CanBeOpen && !CanBeCloseChar(brList[i].Char, brList[0].Char)) && CanBeCloseChar(brList[i].Char, brList[i - 1].Char))
        {
            brList.RemoveRange(brList.Count - 2, 2);
            continue;
        }
        break;
    }

    // Repeatedly fold unambiguous inner open/close pairs into `internals` and remove them from brList.
    while (brList.Count >= 4)
    {
        bool changed = false;
        for (int i = 1; i < (brList.Count - 2); i++)
        {
            if ((brList[i].CanBeOpen && !brList[i].CanBeClose && brList[i + 1].CanBeClose) && !brList[i + 1].CanBeOpen)
            {
                bool ok = false;
                if (MustBeCloseChar(brList[i + 1].Char, brList[i].Char) || brList[i].Char != brList[0].Char)
                {
                    ok = true;
                    if ((i == 1 && ((i + 2) < brList.Count) && brList[i + 2].Char == ')') && brList[i + 1].Char != ')' && CanBeCloseChar(brList[i + 1].Char, brList[i - 1].Char))
                    {
                        brList[i + 2] = brList[i + 1];
                    }
                }
                else if (i > 1 && ((i + 2) < brList.Count) && MustBeCloseChar(brList[i + 2].Char, brList[i - 1].Char))
                {
                    ok = true;
                }
                if (ok)
                {
                    if (internals == null)
                    {
                        internals = new List<BracketSequenceToken>();
                    }
                    internals.Add(new BracketSequenceToken(brList[i].Source, brList[i + 1].Source));
                    brList.RemoveRange(i, 2);
                    changed = true;
                    break;
                }
            }
        }
        if (!changed)
        {
            break;
        }
    }

    // Phase 3: build the result from what is left in brList.
    BracketSequenceToken res = null;
    // Set once `internals` has been copied into res.Internal, so the final pass does not add them twice.
    bool internalsAttached = false;
    if ((brList.Count >= 4 && brList[1].CanBeOpen && brList[2].CanBeClose) && brList[3].CanBeClose && !brList[3].CanBeOpen)
    {
        if (CanBeCloseChar(brList[3].Char, brList[0].Char))
        {
            res = new BracketSequenceToken(brList[0].Source, brList[3].Source);
            if (brList[0].Source.Next != brList[1].Source || brList[2].Source.Next != brList[3].Source)
            {
                res.Internal.Add(new BracketSequenceToken(brList[1].Source, brList[2].Source));
            }
            if (internals != null)
            {
                res.Internal.AddRange(internals);
                internalsAttached = true;
            }
        }
    }
    if ((res == null && brList.Count >= 3 && brList[2].CanBeClose) && !brList[2].CanBeOpen)
    {
        if (((attrs & BracketParseAttr.NearCloseBracket)) != BracketParseAttr.No)
        {
            if (CanBeCloseChar(brList[1].Char, brList[0].Char))
            {
                return new BracketSequenceToken(brList[0].Source, brList[1].Source);
            }
        }
        bool ok = true;
        if (CanBeCloseChar(brList[2].Char, brList[0].Char) && CanBeCloseChar(brList[1].Char, brList[0].Char) && brList[1].CanBeClose)
        {
            // Both the second and third bracket could close the opener: inspect the text between them.
            for (t = brList[1].Source; t != brList[2].Source && t != null; t = t.Next)
            {
                if (t.IsNewlineBefore)
                {
                    ok = false;
                    break;
                }
                if (t.Chars.IsLetter && t.Chars.IsAllLower)
                {
                    ok = false;
                    break;
                }
                NounPhraseToken npt = NounPhraseHelper.TryParse(t, NounPhraseParseAttr.No, 0, null);
                if (npt != null)
                {
                    t = npt.EndToken;
                }
            }
            if (ok)
            {
                for (t = brList[0].Source.Next; t != brList[1].Source && t != null; t = t.Next)
                {
                    if (t.IsNewlineBefore)
                    {
                        return new BracketSequenceToken(brList[0].Source, t.Previous);
                    }
                }
            }
            // Walk backwards over single-character, non-letter text tokens on the same line,
            // balancing brackets that precede the opener.
            int lev1 = 0;
            for (Pullenti.Ner.Token tt = brList[0].Source.Previous; tt != null; tt = tt.Previous)
            {
                if (tt.IsNewlineAfter || tt.IsTableControlChar)
                {
                    break;
                }
                if (!(tt is Pullenti.Ner.TextToken))
                {
                    continue;
                }
                if (tt.Chars.IsLetter || tt.LengthChar > 1)
                {
                    continue;
                }
                char ch = (tt as Pullenti.Ner.TextToken).Term[0];
                if (CanBeCloseChar(ch, brList[0].Char))
                {
                    lev1++;
                }
                else if (CanBeCloseChar(brList[1].Char, ch))
                {
                    lev1--;
                    if (lev1 < 0)
                    {
                        return new BracketSequenceToken(brList[0].Source, brList[1].Source);
                    }
                }
            }
        }
        if (ok && CanBeCloseChar(brList[2].Char, brList[0].Char))
        {
            BracketSequenceToken intern = new BracketSequenceToken(brList[1].Source, brList[2].Source);
            res = new BracketSequenceToken(brList[0].Source, brList[2].Source);
            res.Internal.Add(intern);
        }
        else if (ok && CanBeCloseChar(brList[2].Char, brList[1].Char) && brList[0].CanBeOpen)
        {
            if (CanBeCloseChar(brList[2].Char, brList[0].Char))
            {
                BracketSequenceToken intern = new BracketSequenceToken(brList[1].Source, brList[2].Source);
                res = new BracketSequenceToken(brList[0].Source, brList[2].Source);
                res.Internal.Add(intern);
            }
            else if (brList.Count == 3)
            {
                return null;
            }
        }
    }
    if (res == null && brList.Count > 1 && brList[1].CanBeClose)
    {
        res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
    }
    if (res == null && brList.Count > 1 && CanBeCloseChar(brList[1].Char, brList[0].Char))
    {
        res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
    }
    if (res == null && brList.Count == 2 && brList[0].Char == brList[1].Char)
    {
        res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
    }
    if (res != null && internals != null && !internalsAttached)
    {
        // Attach only the folded inner pairs that begin before the end of the result.
        foreach (BracketSequenceToken i in internals)
        {
            if (i.BeginChar < res.EndChar)
            {
                res.Internal.Add(i);
            }
        }
    }

    // Phase 4: nothing matched — look for a metatoken whose last inner token is a fitting closer.
    if (res == null)
    {
        cou = 0;
        for (Pullenti.Ner.Token tt = t0.Next; tt != null; tt = tt.Next, cou++)
        {
            if (tt.IsTableControlChar)
            {
                break;
            }
            if (MiscHelper.CanBeStartOfSentence(tt))
            {
                break;
            }
            if (maxTokens > 0 && cou > maxTokens)
            {
                break;
            }
            Pullenti.Ner.MetaToken mt = tt as Pullenti.Ner.MetaToken;
            if (mt == null)
            {
                continue;
            }
            if (mt.EndToken is Pullenti.Ner.TextToken)
            {
                if ((mt.EndToken as Pullenti.Ner.TextToken).IsCharOf(m_CloseChars))
                {
                    Bracket bb = new Bracket(mt.EndToken as Pullenti.Ner.TextToken);
                    if (bb.CanBeClose && CanBeCloseChar(bb.Char, brList[0].Char))
                    {
                        return new BracketSequenceToken(t0, tt);
                    }
                }
            }
        }
    }
    return res;
}
/// <summary>
/// Builds a text name from the tokens between <paramref name="begin"/> and <paramref name="end"/> (inclusive).
/// Text tokens may be replaced by a normal form chosen by morphological class, case and gender;
/// number tokens and metatokens are rendered as well (metatokens recursively).
/// </summary>
/// <param name="begin">first token of the range</param>
/// <param name="end">last token of the range</param>
/// <param name="cla">morphological class filter (no filter when its Value is 0)</param>
/// <param name="mc">morphological case filter (no filter when undefined)</param>
/// <param name="gender">gender filter (no filter when Undefined)</param>
/// <param name="ignoreBracketsAndHiphens">when true, brackets are processed separately and hyphens surrounded by whitespace are skipped</param>
/// <param name="ignoreGeoReferent">when true, metatokens (other than the first token) whose referent type is "GEO" are skipped</param>
/// <returns>the assembled string, or null when the arguments are unusable or nothing was produced</returns>
public static string GetNameEx(Pullenti.Ner.Token begin, Pullenti.Ner.Token end, Pullenti.Morph.MorphClass cla, Pullenti.Morph.MorphCase mc, Pullenti.Morph.MorphGender gender = Pullenti.Morph.MorphGender.Undefined, bool ignoreBracketsAndHiphens = false, bool ignoreGeoReferent = false)
{
    if (begin == null || end == null)
    {
        return null;
    }
    if (begin != end && begin.EndChar > end.BeginChar)
    {
        return null;
    }

    StringBuilder sb = new StringBuilder();
    string pendingPrefix = null; // left parts of a hyphenated word waiting for the right part

    for (Pullenti.Ner.Token cur = begin; cur != null && cur.EndChar <= end.EndChar; cur = cur.Next)
    {
        // Hard cap on the produced text.
        if (sb.Length > 1000)
        {
            break;
        }
        if (cur.IsTableControlChar)
        {
            continue;
        }

        if (ignoreBracketsAndHiphens)
        {
            if (BracketHelper.IsBracket(cur, false))
            {
                if (cur == end)
                {
                    break;
                }
                if (cur.IsCharOf("(<["))
                {
                    BracketSequenceToken seq = BracketHelper.TryParse(cur, BracketParseAttr.No, 100);
                    if (seq != null && seq.EndChar <= end.EndChar)
                    {
                        string inner = GetNameEx(seq.BeginToken.Next, seq.EndToken.Previous, Pullenti.Morph.MorphClass.Undefined, Pullenti.Morph.MorphCase.Undefined, Pullenti.Morph.MorphGender.Undefined, ignoreBracketsAndHiphens, false);
                        if (inner != null)
                        {
                            // A trailing bracket group holding a single non-letter, non-referent token is dropped.
                            bool dropGroup = seq.EndChar == end.EndChar
                                && seq.BeginToken.Next == seq.EndToken.Previous
                                && !seq.BeginToken.Next.Chars.IsLetter
                                && !(seq.BeginToken.Next is Pullenti.Ner.ReferentToken);
                            if (!dropGroup)
                            {
                                sb.AppendFormat(" {0}{1}{2}", cur.GetSourceText(), inner, seq.EndToken.GetSourceText());
                            }
                        }
                        cur = seq.EndToken;
                    }
                }
                continue;
            }
            if (cur.IsHiphen)
            {
                if (cur == end)
                {
                    break;
                }
                if (cur.IsWhitespaceBefore || cur.IsWhitespaceAfter)
                {
                    continue;
                }
            }
        }

        Pullenti.Ner.TextToken text = cur as Pullenti.Ner.TextToken;
        if (text != null)
        {
            if (!ignoreBracketsAndHiphens)
            {
                bool leftOfHyphen = text.Next != null
                    && text.Next.IsHiphen
                    && (text.Next.Next is Pullenti.Ner.TextToken)
                    && text != end
                    && text.Next != end;
                if (leftOfHyphen)
                {
                    // Remember this part and jump onto the hyphen; the loop step moves past it.
                    pendingPrefix = pendingPrefix == null
                        ? text.Term
                        : string.Format("{0}-{1}", pendingPrefix, text.Term);
                    cur = text.Next;
                    if (cur == end)
                    {
                        break;
                    }
                    continue;
                }
            }

            string word = null;
            bool hasFilter = cla.Value != 0 || !mc.IsUndefined || gender != Pullenti.Morph.MorphGender.Undefined;
            if (hasFilter)
            {
                word = SelectNormalCase(text, cla, mc, gender);
                if (word == null && gender != Pullenti.Morph.MorphGender.Undefined)
                {
                    // Retry without the gender constraint.
                    word = SelectNormalCase(text, cla, mc, Pullenti.Morph.MorphGender.Undefined);
                }
            }

            if (word == null)
            {
                word = text.Term;
                if (text.Chars.IsLastLower && text.LengthChar > 2)
                {
                    // Use the source text cut right after its last upper-case character, if it has one.
                    word = text.GetSourceText();
                    int cut = word.Length - 1;
                    while (cut >= 0 && !char.IsUpper(word[cut]))
                    {
                        cut--;
                    }
                    if (cut >= 0)
                    {
                        word = word.Substring(0, cut + 1);
                    }
                }
            }

            if (pendingPrefix != null)
            {
                string delim = ignoreBracketsAndHiphens ? " " : "-";
                word = string.Format("{0}{1}{2}", pendingPrefix, delim, word);
            }
            pendingPrefix = null;

            if (sb.Length > 0 && word.Length > 0)
            {
                if (char.IsLetterOrDigit(word[0]))
                {
                    if (sb[sb.Length - 1] != '-')
                    {
                        sb.Append(' ');
                    }
                }
                else if (!ignoreBracketsAndHiphens && BracketHelper.CanBeStartOfSequence(text, false, false))
                {
                    sb.Append(' ');
                }
            }
            sb.Append(word);
        }
        else if (cur is Pullenti.Ner.NumberToken)
        {
            // No space only when the number is glued to a preceding '-'.
            if (sb.Length > 0 && (cur.IsWhitespaceBefore || sb[sb.Length - 1] != '-'))
            {
                sb.Append(' ');
            }
            Pullenti.Ner.NumberToken num = cur as Pullenti.Ner.NumberToken;
            bool singleWordAdjective = cur.Morph.Class.IsAdjective
                && num.Typ == Pullenti.Ner.NumberSpellingType.Words
                && num.BeginToken == num.EndToken
                && (num.BeginToken is Pullenti.Ner.TextToken);
            if (singleWordAdjective)
            {
                sb.Append((num.BeginToken as Pullenti.Ner.TextToken).Term);
            }
            else
            {
                sb.Append(num.Value);
            }
        }
        else if (cur is Pullenti.Ner.MetaToken)
        {
            if (ignoreGeoReferent && cur != begin && cur.GetReferent() != null && cur.GetReferent().TypeName == "GEO")
            {
                continue;
            }
            string nested = GetNameEx((cur as Pullenti.Ner.MetaToken).BeginToken, (cur as Pullenti.Ner.MetaToken).EndToken, cla, mc, gender, ignoreBracketsAndHiphens, ignoreGeoReferent);
            if (!string.IsNullOrEmpty(nested))
            {
                if (sb.Length > 0 && (cur.IsWhitespaceBefore || sb[sb.Length - 1] != '-'))
                {
                    sb.Append(' ');
                }
                sb.Append(nested);
            }
        }

        if (cur == end)
        {
            break;
        }
    }

    return sb.Length == 0 ? null : sb.ToString();
}

/// <summary>
/// Scans the word forms of a text token and returns the NormalCase of the first form that passes
/// the class / case / gender filters; a later matching form whose NormalCase equals the token's Term
/// takes precedence. Each filter is skipped when its argument is "undefined".
/// </summary>
private static string SelectNormalCase(Pullenti.Ner.TextToken text, Pullenti.Morph.MorphClass cla, Pullenti.Morph.MorphCase mc, Pullenti.Morph.MorphGender gender)
{
    string chosen = null;
    foreach (Pullenti.Morph.MorphBaseInfo item in text.Morph.Items)
    {
        Pullenti.Morph.MorphWordForm form = item as Pullenti.Morph.MorphWordForm;
        if (form == null)
        {
            continue;
        }
        if (cla.Value != 0 && ((form.Class.Value & cla.Value)) == 0)
        {
            continue;
        }
        if (!mc.IsUndefined && ((form.Case & mc)).IsUndefined)
        {
            continue;
        }
        if (gender != Pullenti.Morph.MorphGender.Undefined && ((form.Gender & gender)) == Pullenti.Morph.MorphGender.Undefined)
        {
            continue;
        }
        if (chosen == null || form.NormalCase == text.Term)
        {
            chosen = form.NormalCase;
        }
    }
    return chosen;
}