/// <summary>
/// Tries to extract a preposition starting at the given token.
/// </summary>
/// <param name="t">start token</param>
/// <returns>the preposition token, or null when no preposition starts at <paramref name="t"/></returns>
public static PrepositionToken TryParse(Pullenti.Ner.Token t)
{
    if (!(t is Pullenti.Ner.TextToken))
    {
        return null;
    }

    // The ontology is consulted first; a hit supplies both the canonical text
    // and the case (stored in the termin's Tag) that follows the preposition.
    TerminToken tok = m_Ontology.TryParse(t, TerminParseAttr.No);
    if (tok != null)
    {
        return new PrepositionToken(t, tok.EndToken)
        {
            Normal = tok.Termin.CanonicText,
            NextCase = (Pullenti.Morph.MorphCase)tok.Termin.Tag
        };
    }

    // Otherwise fall back to the morphological dictionary.
    Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary();
    if (!mc.IsPreposition)
    {
        return null;
    }

    PrepositionToken res = new PrepositionToken(t, t);
    res.Normal = t.GetNormalCaseText(Pullenti.Morph.MorphClass.Preposition, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
    res.NextCase = Pullenti.Morph.LanguageHelper.GetCaseAfterPreposition(res.Normal);

    // "preposition-preposition" written without whitespace before the hyphen:
    // extend the result so that it also covers the second preposition.
    if ((t.Next != null && t.Next.IsHiphen && !t.IsWhitespaceAfter)
        && (t.Next.Next is Pullenti.Ner.TextToken)
        && t.Next.Next.GetMorphClassInDictionary().IsPreposition)
    {
        res.EndToken = t.Next.Next;
    }
    return res;
}
/// <summary>
/// Creates a text variant of a noun-phrase item, optionally copying the morphology
/// from <paramref name="src"/> and deriving the normal value from it or from the token.
/// </summary>
/// <param name="src">source morphology (may be null)</param>
/// <param name="t">token used for the normal value when <paramref name="src"/> is not a word form (may be null)</param>
public NounPhraseItemTextVar(Pullenti.Morph.MorphBaseInfo src = null, Pullenti.Ner.Token t = null) : base()
{
    if (src != null)
    {
        this.CopyFrom(src);
    }

    Pullenti.Morph.MorphWordForm wordForm = src as Pullenti.Morph.MorphWordForm;
    if (wordForm == null)
    {
        // No word form available: take the normal text straight from the token.
        if (t != null)
        {
            NormalValue = t.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
        }
    }
    else
    {
        NormalValue = wordForm.NormalCase;
        bool pluralWithFullForm = wordForm.Number == Pullenti.Morph.MorphNumber.Plural && wordForm.NormalFull != null;
        if (pluralWithFullForm)
        {
            SingleNumberValue = wordForm.NormalFull;
        }
        UndefCoef = wordForm.UndefCoef;
    }

    // A source marked "неизм." with no case information is given every case.
    if (Case.IsUndefined && src != null && src.ContainsAttr("неизм.", null))
    {
        Case = Pullenti.Morph.MorphCase.AllCases;
    }
}
/// <summary>
/// Builds a display string: the unknown name, or the external-ontology item, or the unit;
/// followed by the power (when it is not 1), a doubt mark and the keyword, if any.
/// </summary>
public override string ToString()
{
    string text = UnknownName;
    if (text == null)
    {
        if (ExtOnto != null)
        {
            text = ExtOnto.ToString();
        }
        else
        {
            text = Unit.ToString();
        }
    }

    if (Pow != 1)
    {
        text = string.Format("{0}<{1}>", text, Pow);
    }
    if (IsDoubt)
    {
        text += "?";
    }
    if (Keyword == null)
    {
        return text;
    }

    string keywordText = Keyword.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
    return string.Format("{0} (<-{1})", text, keywordText);
}
/// <summary>
/// Checks whether a token is a keyword of the given measure kind. For a metatoken
/// the check is applied recursively to every token it spans.
/// </summary>
/// <param name="ki">measure kind to look for</param>
/// <param name="t">token to test</param>
/// <returns>true when the token (or an inner token) is a keyword of that kind</returns>
public static bool CheckKeyword(Pullenti.Ner.Measure.MeasureKind ki, Pullenti.Ner.Token t)
{
    if (t == null || ki == Pullenti.Ner.Measure.MeasureKind.Undefined)
    {
        return false;
    }

    Pullenti.Ner.MetaToken meta = t as Pullenti.Ner.MetaToken;
    if (meta != null)
    {
        Pullenti.Ner.Token inner = meta.BeginToken;
        while (inner != null && inner.EndChar <= t.EndChar)
        {
            if (CheckKeyword(ki, inner))
            {
                return true;
            }
            inner = inner.Next;
        }
        return false;
    }

    if (!(t is Pullenti.Ner.TextToken))
    {
        return false;
    }

    // Compare by the singular noun normal form of the word.
    string term = t.GetNormalCaseText(Pullenti.Morph.MorphClass.Noun, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);

    // Keywords attached to the units of this kind ...
    foreach (Unit u in Units)
    {
        if (u.Kind == ki && u.Keywords.Contains(term))
        {
            return true;
        }
    }

    // ... then keywords registered for the kind itself.
    return m_KindsKeywords.ContainsKey(ki) && m_KindsKeywords[ki].Contains(term);
}
/// <summary>
/// Tries to parse a named-entity item (a type word and/or a name) starting at the given token.
/// </summary>
/// <param name="t">start token</param>
/// <param name="locOnto">local ontology; only forwarded to the recursive calls made here</param>
/// <returns>the parsed item, or null when nothing suitable starts at <paramref name="t"/></returns>
public static NamedItemToken TryParse(Pullenti.Ner.Token t, Pullenti.Ner.Core.IntOntologyCollection locOnto)
{
    if (t == null)
    {
        return null;
    }

    // An already recognized entity: only persons, person properties, geo objects
    // and organizations are accepted as items.
    if (t is Pullenti.Ner.ReferentToken)
    {
        Pullenti.Ner.Referent r = t.GetReferent();
        if (r == null)
        {
            return null;
        }
        if ((r.TypeName == "PERSON" || r.TypeName == "PERSONPROPERTY" || (r is Pullenti.Ner.Geo.GeoReferent)) || r.TypeName == "ORGANIZATION")
        {
            return new NamedItemToken(t, t) { Ref = r, Morph = t.Morph };
        }
        return null;
    }

    Pullenti.Ner.Core.TerminToken typ = m_Types.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
    Pullenti.Ner.Core.TerminToken nam = m_Names.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);

    // A known type word (optionally coinciding with a well-known name of the same kind).
    if (typ != null)
    {
        if (!(t is Pullenti.Ner.TextToken))
        {
            return null;
        }
        NamedItemToken res = new NamedItemToken(typ.BeginToken, typ.EndToken) { Morph = typ.Morph, Chars = typ.Chars };
        res.Kind = (Pullenti.Ner.Named.NamedEntityKind)typ.Termin.Tag;
        res.TypeValue = typ.Termin.CanonicText;
        if ((nam != null && nam.EndToken == typ.EndToken && !t.Chars.IsAllLower) && ((Pullenti.Ner.Named.NamedEntityKind)nam.Termin.Tag) == res.Kind)
        {
            res.NameValue = nam.Termin.CanonicText;
            res.IsWellknown = true;
        }
        return res;
    }

    // A known name without a type word.
    if (nam != null)
    {
        if (nam.BeginToken.Chars.IsAllLower)
        {
            return null;
        }
        NamedItemToken res = new NamedItemToken(nam.BeginToken, nam.EndToken) { Morph = nam.Morph, Chars = nam.Chars };
        res.Kind = (Pullenti.Ner.Named.NamedEntityKind)nam.Termin.Tag;
        res.NameValue = nam.Termin.CanonicText;

        // The name counts as well-known only when it stands apart: whitespace (or text start)
        // before it, and whitespace or spaced punctuation after it.
        bool ok = true;
        if (!t.IsWhitespaceBefore && t.Previous != null)
        {
            ok = false;
        }
        else if (!t.IsWhitespaceAfter && t.Next != null)
        {
            if (!(t.Next.IsCharOf(",.;!?") && t.Next.IsWhitespaceAfter))
            {
                ok = false;
            }
        }
        if (ok)
        {
            res.IsWellknown = true;
            res.TypeValue = nam.Termin.Tag2 as string;
        }
        return res;
    }

    // A compass-direction prefix recognized by MiscLocationHelper.TryAttachNordWest.
    Pullenti.Ner.MetaToken adj = Pullenti.Ner.Geo.Internal.MiscLocationHelper.TryAttachNordWest(t);
    if (adj != null)
    {
        if (adj.Morph.Class.IsNoun)
        {
            // Noun form: only a multi-token phrase ending in "ВОСТОК" is accepted.
            if (adj.EndToken.IsValue("ВОСТОК", null))
            {
                if (adj.BeginToken == adj.EndToken)
                {
                    return null;
                }
                NamedItemToken re = new NamedItemToken(t, adj.EndToken) { Morph = adj.Morph };
                re.Kind = Pullenti.Ner.Named.NamedEntityKind.Location;
                re.NameValue = Pullenti.Ner.Core.MiscHelper.GetTextValue(t, adj.EndToken, Pullenti.Ner.Core.GetTextAttr.FirstNounGroupToNominative);
                re.IsWellknown = true;
                return re;
            }
            return null;
        }
        if (adj.WhitespacesAfterCount > 2)
        {
            return null;
        }

        // Direction adjective followed by an already recognized geo object.
        if ((adj.EndToken.Next is Pullenti.Ner.ReferentToken) && (adj.EndToken.Next.GetReferent() is Pullenti.Ner.Geo.GeoReferent))
        {
            NamedItemToken re = new NamedItemToken(t, adj.EndToken.Next) { Morph = adj.EndToken.Next.Morph };
            re.Kind = Pullenti.Ner.Named.NamedEntityKind.Location;
            re.NameValue = Pullenti.Ner.Core.MiscHelper.GetTextValue(t, adj.EndToken.Next, Pullenti.Ner.Core.GetTextAttr.FirstNounGroupToNominative);
            re.IsWellknown = true;
            re.Ref = adj.EndToken.Next.GetReferent();
            return re;
        }

        // Direction adjective followed by a location item: prepend the adjective to its name.
        NamedItemToken res = TryParse(adj.EndToken.Next, locOnto);
        if (res != null && res.Kind == Pullenti.Ner.Named.NamedEntityKind.Location)
        {
            string s = adj.GetNormalCaseText(Pullenti.Morph.MorphClass.Adjective, Pullenti.Morph.MorphNumber.Singular, res.Morph.Gender, false);
            if (s != null)
            {
                if (res.NameValue == null)
                {
                    res.NameValue = s.ToUpper();
                }
                else
                {
                    res.NameValue = string.Format("{0} {1}", s.ToUpper(), res.NameValue);
                    res.TypeValue = null;
                }
                res.BeginToken = t;
                res.Chars = t.Chars;
                res.IsWellknown = true;
                return res;
            }
        }
    }

    // Capitalized adjective(s) + type noun in the middle of a sentence:
    // the adjectives become the name of the typed item.
    if (t.Chars.IsCapitalUpper && !Pullenti.Ner.Core.MiscHelper.CanBeStartOfSentence(t))
    {
        Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        if (npt != null && npt.Adjectives.Count > 0)
        {
            NamedItemToken test = TryParse(npt.Noun.BeginToken, locOnto);
            if (test != null && test.EndToken == npt.EndToken && test.TypeValue != null)
            {
                test.BeginToken = t;
                StringBuilder tmp = new StringBuilder();
                foreach (Pullenti.Ner.MetaToken a in npt.Adjectives)
                {
                    string s = a.GetNormalCaseText(Pullenti.Morph.MorphClass.Adjective, Pullenti.Morph.MorphNumber.Singular, test.Morph.Gender, false);
                    if (tmp.Length > 0)
                    {
                        tmp.Append(' ');
                    }
                    tmp.Append(s);
                }
                test.NameValue = tmp.ToString();
                test.Chars = t.Chars;
                if (test.Kind == Pullenti.Ner.Named.NamedEntityKind.Location)
                {
                    test.IsWellknown = true;
                }
                return test;
            }
        }
    }

    // A bracketed/quoted sequence starting with a non-lowercase letter token.
    if ((Pullenti.Ner.Core.BracketHelper.IsBracket(t, true) && t.Next != null && t.Next.Chars.IsLetter) && !t.Next.Chars.IsAllLower)
    {
        Pullenti.Ner.Core.BracketSequenceToken br = Pullenti.Ner.Core.BracketHelper.TryParse(t, Pullenti.Ner.Core.BracketParseAttr.No, 100);
        if (br != null)
        {
            NamedItemToken res = new NamedItemToken(t, br.EndToken);
            res.IsInBracket = true;
            res.NameValue = Pullenti.Ner.Core.MiscHelper.GetTextValue(t, br.EndToken, Pullenti.Ner.Core.GetTextAttr.No);
            nam = m_Names.TryParse(t.Next, Pullenti.Ner.Core.TerminParseAttr.No);
            if (nam != null && nam.EndToken == br.EndToken.Previous)
            {
                // The whole bracket content is a known name.
                res.Kind = (Pullenti.Ner.Named.NamedEntityKind)nam.Termin.Tag;
                res.IsWellknown = true;
                res.NameValue = nam.Termin.CanonicText;
            }
            return res;
        }
    }

    // Fallback: any non-lowercase word longer than two characters.
    if (((t is Pullenti.Ner.TextToken) && t.Chars.IsLetter && !t.Chars.IsAllLower) && t.LengthChar > 2)
    {
        NamedItemToken res = new NamedItemToken(t, t) { Morph = t.Morph };
        string str = (t as Pullenti.Ner.TextToken).Term;
        // Words ending in О / И / Ы are kept as written instead of being normalized.
        if (str.EndsWith("О") || str.EndsWith("И") || str.EndsWith("Ы"))
        {
            res.NameValue = str;
        }
        else
        {
            res.NameValue = t.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
        }
        res.Chars = t.Chars;

        // Hyphenated second part written in the same script, with no whitespace around it.
        if (((!t.IsWhitespaceAfter && t.Next != null && t.Next.IsHiphen) && (t.Next.Next is Pullenti.Ner.TextToken) && !t.Next.Next.IsWhitespaceAfter)
            && t.Chars.IsCyrillicLetter == t.Next.Next.Chars.IsCyrillicLetter)
        {
            t = (res.EndToken = t.Next.Next);
            res.NameValue = string.Format("{0}-{1}", res.NameValue, t.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false));
        }
        return res;
    }
    return null;
}
/// <summary>
/// Creates a semantic object for the noun of a noun phrase, links its adjectives
/// and its internal noun phrase to it, and adds the object to the graph.
/// </summary>
/// <param name="gr">graph that receives the object and the links</param>
/// <param name="npt">noun phrase to convert</param>
/// <returns>the created object, or null when the noun is a referent token without a referent</returns>
public static Pullenti.Semantic.SemObject CreateNounGroup(Pullenti.Semantic.SemGraph gr, Pullenti.Ner.Core.NounPhraseToken npt)
{
    Pullenti.Ner.Token head = npt.Noun.BeginToken;
    Pullenti.Semantic.SemObject obj = new Pullenti.Semantic.SemObject(gr);
    obj.Tokens.Add(npt.Noun);

    // Object type: noun by default, refined for pronouns.
    obj.Typ = Pullenti.Semantic.SemObjectType.Noun;
    if (npt.Noun.Morph.Class.IsPersonalPronoun)
    {
        obj.Typ = Pullenti.Semantic.SemObjectType.PersonalPronoun;
    }
    else if (npt.Noun.Morph.Class.IsPronoun)
    {
        obj.Typ = Pullenti.Semantic.SemObjectType.Pronoun;
    }

    if (npt.Noun.BeginToken != npt.Noun.EndToken)
    {
        // Multi-token noun: morphology is taken from the phrase as a whole.
        obj.Morph.NormalCase = npt.Noun.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
        obj.Morph.NormalFull = npt.Noun.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
        obj.Morph.Class = Pullenti.Morph.MorphClass.Noun;
        obj.Morph.Number = npt.Morph.Number;
        obj.Morph.Gender = npt.Morph.Gender;
        obj.Morph.Case = npt.Morph.Case;
    }
    else if (head is Pullenti.Ner.TextToken)
    {
        // Single word: use the first word form that agrees with the phrase morphology.
        foreach (Pullenti.Morph.MorphBaseInfo candidate in head.Morph.Items)
        {
            if (!candidate.CheckAccord(npt.Morph, false, false))
            {
                continue;
            }
            Pullenti.Morph.MorphWordForm form = candidate as Pullenti.Morph.MorphWordForm;
            if (form == null)
            {
                continue;
            }
            _setMorph(obj, form);
            break;
        }
        if (obj.Morph.NormalCase == null)
        {
            obj.Morph.NormalCase = head.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
            obj.Morph.NormalFull = head.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
        }
        // The first derivate group found for the full normal form becomes the concept.
        List<Pullenti.Semantic.Utils.DerivateGroup> derivates = Pullenti.Semantic.Utils.DerivateService.FindDerivates(obj.Morph.NormalFull, true, null);
        if (derivates != null && derivates.Count > 0)
        {
            obj.Concept = derivates[0];
        }
    }
    else if (head is Pullenti.Ner.ReferentToken)
    {
        Pullenti.Ner.Referent referent = (head as Pullenti.Ner.ReferentToken).Referent;
        if (referent == null)
        {
            return null;
        }
        string referentText = referent.ToString();
        obj.Morph.NormalCase = referentText;
        obj.Morph.NormalFull = referentText;
        obj.Concept = referent;
    }
    else if (head is Pullenti.Ner.NumberToken)
    {
        Pullenti.Ner.NumberToken number = head as Pullenti.Ner.NumberToken;
        obj.Morph.Gender = head.Morph.Gender;
        obj.Morph.Number = head.Morph.Number;
        if (number.IntValue == null)
        {
            // No integer value: fall back to the upper-cased source text.
            string sourceText = head.GetSourceText().ToUpper();
            obj.Morph.NormalCase = sourceText;
            obj.Morph.NormalFull = sourceText;
        }
        else
        {
            obj.Morph.NormalCase = Pullenti.Ner.Core.NumberHelper.GetNumberAdjective(number.IntValue.Value, head.Morph.Gender, head.Morph.Number);
            obj.Morph.NormalFull = Pullenti.Ner.Core.NumberHelper.GetNumberAdjective(number.IntValue.Value, Pullenti.Morph.MorphGender.Masculine, Pullenti.Morph.MorphNumber.Singular);
        }
    }

    head.Tag = obj;

    // Adjectives become "Detail" links; with multiple nouns only the first adjective is used.
    if (npt.Adjectives.Count > 0)
    {
        foreach (Pullenti.Ner.MetaToken adjective in npt.Adjectives)
        {
            bool beyondFirst = npt.MultiNouns && adjective != npt.Adjectives[0];
            if (beyondFirst)
            {
                break;
            }
            Pullenti.Semantic.SemObject adjectiveObj = CreateNptAdj(gr, npt, adjective);
            if (adjectiveObj != null)
            {
                gr.AddLink(Pullenti.Semantic.SemLinkType.Detail, obj, adjectiveObj, "какой", false, null);
            }
        }
    }

    // A nested noun phrase is converted recursively and attached as a detail as well.
    if (npt.InternalNoun != null)
    {
        Pullenti.Semantic.SemObject innerObj = CreateNounGroup(gr, npt.InternalNoun);
        if (innerObj != null)
        {
            gr.AddLink(Pullenti.Semantic.SemLinkType.Detail, obj, innerObj, null, false, null);
        }
    }

    gr.Objects.Add(obj);
    return obj;
}
/// <summary>
/// Tries to build a verb phrase (verbs, participles and adverbs, each optionally
/// negated by "НЕ") from consecutive text tokens starting at <paramref name="t"/>.
/// </summary>
/// <param name="t">start token</param>
/// <param name="canBePartition">allows participle-like words; also enables a leading preposition</param>
/// <param name="canBeAdjPartition">allows adjectives (including short forms) to be taken as items</param>
/// <param name="forceParse">accepts words that are both verb and noun in the dictionary</param>
/// <returns>the phrase, or null when no verb-like item was found or the preposition case does not fit</returns>
static VerbPhraseToken TryParseRu(Pullenti.Ner.Token t, bool canBePartition, bool canBeAdjPartition, bool forceParse)
{
    VerbPhraseToken res = null;
    Pullenti.Ner.Token t0 = t;
    // Pending "НЕ" token; attached to the next accepted item.
    Pullenti.Ner.Token not = null;
    // Set once an item with ty 1 or 3 has been accepted.
    bool hasVerb = false;
    // True when the last accepted verb-like item was a form of "to be".
    bool verbBeBefore = false;
    PrepositionToken prep = null;
    for (; t != null; t = t.Next)
    {
        if (!(t is Pullenti.Ner.TextToken))
        {
            break;
        }
        Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
        bool isParticiple = false;
        if (tt.Term == "НЕ")
        {
            not = t;
            continue;
        }
        // Word classification: 0 - not part of the phrase, 1 - normalized as a verb,
        // 2 - adverb (see it.IsAdverb below), 3 - normalized as an adjective.
        int ty = 0;
        string norm = null;
        Pullenti.Morph.MorphClass mc = tt.GetMorphClassInDictionary();
        if (tt.Term == "НЕТ")
        {
            if (hasVerb)
            {
                break;
            }
            ty = 1;
        }
        else if (tt.Term == "ДОПУСТИМО")
        {
            ty = 3;
        }
        else if (mc.IsAdverb && !mc.IsVerb)
        {
            ty = 2;
        }
        else if (tt.IsPureVerb || tt.IsVerbBe)
        {
            ty = 1;
            // A second verb is accepted only as an infinitive or right after a "to be" form.
            if (hasVerb)
            {
                if (!tt.Morph.ContainsAttr("инф.", null))
                {
                    if (verbBeBefore)
                    { }
                    else
                    {
                        break;
                    }
                }
            }
        }
        else if (mc.IsVerb)
        {
            // The dictionary lists the word as a verb and also as something else.
            if (mc.IsPreposition || mc.IsMisc || mc.IsPronoun)
            { }
            else if (mc.IsNoun)
            {
                if (tt.Term == "СТАЛИ" || tt.Term == "СТЕКЛО" || tt.Term == "БЫЛИ")
                {
                    ty = 1;
                }
                else if (!tt.Chars.IsAllLower && !MiscHelper.CanBeStartOfSentence(tt))
                {
                    ty = 1;
                }
                else if (mc.IsAdjective && canBePartition)
                {
                    ty = 1;
                }
                else if (forceParse)
                {
                    ty = 1;
                }
            }
            else if (mc.IsProper)
            {
                if (tt.Chars.IsAllLower)
                {
                    ty = 1;
                }
            }
            else
            {
                ty = 1;
            }
            // Adjective class or a defined case marks the word as a participle.
            if (mc.IsAdjective)
            {
                isParticiple = true;
            }
            if (!tt.Morph.Case.IsUndefined)
            {
                isParticiple = true;
            }
            if (!canBePartition && isParticiple)
            {
                break;
            }
            // After a verb, a participle ends the phrase; infinitives and non-participles continue it.
            if (hasVerb)
            {
                if (tt.Morph.ContainsAttr("инф.", null))
                { }
                else if (!isParticiple)
                { }
                else
                {
                    break;
                }
            }
        }
        else if ((mc.IsAdjective && tt.Morph.ContainsAttr("к.ф.", null) && tt.Term.EndsWith("О")) && NounPhraseHelper.TryParse(tt, NounPhraseParseAttr.No, 0, null) == null)
        {
            // Short-form adjective ending in "О" that does not start a noun phrase: treated as an adverb.
            ty = 2;
        }
        else if (mc.IsAdjective && ((canBePartition || canBeAdjPartition)))
        {
            if (tt.Morph.ContainsAttr("к.ф.", null) && !canBeAdjPartition)
            {
                break;
            }
            norm = tt.GetNormalCaseText(Pullenti.Morph.MorphClass.Adjective, 
                Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Masculine, false);
            if (norm.EndsWith("ЙШИЙ"))
            { }
            else
            {
                // Look the adjective up among derivate groups: it qualifies (ty = 3) when a group
                // contains this spelling as adjective+verb together with a plain verb.
                List <Pullenti.Semantic.Utils.DerivateGroup> grs = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm, true, null);
                if (grs != null && grs.Count > 0)
                {
                    bool hVerb = false;
                    bool hPart = false;
                    foreach (Pullenti.Semantic.Utils.DerivateGroup gr in grs)
                    {
                        foreach (Pullenti.Semantic.Utils.DerivateWord w in gr.Words)
                        {
                            if (w.Class.IsAdjective && w.Class.IsVerb)
                            {
                                if (w.Spelling == norm)
                                {
                                    hPart = true;
                                }
                            }
                            else if (w.Class.IsVerb)
                            {
                                hVerb = true;
                            }
                        }
                    }
                    if (hPart && hVerb)
                    {
                        ty = 3;
                    }
                    else if (canBeAdjPartition)
                    {
                        ty = 3;
                    }
                    // Second attempt: strip the first group's prefix and repeat the lookup.
                    if (ty != 3 && !string.IsNullOrEmpty(grs[0].Prefix) && norm.StartsWith(grs[0].Prefix))
                    {
                        hVerb = false;
                        hPart = false;
                        string norm1 = norm.Substring(grs[0].Prefix.Length);
                        grs = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm1, true, null);
                        if (grs != null && grs.Count > 0)
                        {
                            foreach (Pullenti.Semantic.Utils.DerivateGroup gr in grs)
                            {
                                foreach (Pullenti.Semantic.Utils.DerivateWord w in gr.Words)
                                {
                                    if (w.Class.IsAdjective && w.Class.IsVerb)
                                    {
                                        if (w.Spelling == norm1)
                                        {
                                            hPart = true;
                                        }
                                    }
                                    else if (w.Class.IsVerb)
                                    {
                                        hVerb = true;
                                    }
                                }
                            }
                        }
                        if (hPart && hVerb)
                        {
                            ty = 3;
                        }
                    }
                }
            }
        }
        // An unclassified first token may be a preposition preceding a participle.
        if (ty == 0 && t == t0 && canBePartition)
        {
            prep = PrepositionHelper.TryParse(t);
            if (prep != null)
            {
                t = prep.EndToken;
                continue;
            }
        }
        if (ty == 0)
        {
            break;
        }
        if (res == null)
        {
            res = new VerbPhraseToken(t0, t);
        }
        res.EndToken = t;
        VerbPhraseItemToken it = new VerbPhraseItemToken(t, t) { Morph = new Pullenti.Ner.MorphCollection(t.Morph) };
        // A preceding "НЕ" is included in the item and marks it as negated.
        if (not != null)
        {
            it.BeginToken = not;
            it.Not = true;
            not = null;
        }
        it.IsAdverb = ty == 2;
        // The first item must agree in case with the leading preposition, otherwise parsing fails.
        if (prep != null && !t.Morph.Case.IsUndefined && res.Items.Count == 0)
        {
            if (((prep.NextCase & t.Morph.Case)).IsUndefined)
            {
                return(null);
            }
            it.Morph.RemoveItems(prep.NextCase);
            res.Preposition = prep;
        }
        if (norm == null)
        {
            norm = t.GetNormalCaseText((ty == 3 ? Pullenti.Morph.MorphClass.Adjective : (ty == 2 ? 
                Pullenti.Morph.MorphClass.Adverb : Pullenti.Morph.MorphClass.Verb)), Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Masculine, false);
            // Verb with a defined case: build the nominative singular masculine form instead.
            if (ty == 1 && !tt.Morph.Case.IsUndefined)
            {
                Pullenti.Morph.MorphWordForm mi = new Pullenti.Morph.MorphWordForm() { Case = Pullenti.Morph.MorphCase.Nominative, Number = Pullenti.Morph.MorphNumber.Singular, Gender = Pullenti.Morph.MorphGender.Masculine };
                foreach (Pullenti.Morph.MorphBaseInfo mit in tt.Morph.Items)
                {
                    if (mit is Pullenti.Morph.MorphWordForm)
                    {
                        mi.Misc = (mit as Pullenti.Morph.MorphWordForm).Misc;
                        break;
                    }
                }
                // The term is prefixed with two letters "КК" for the word-form request and the
                // prefix is cut off the result (presumably to avoid an exact dictionary hit - TODO confirm).
                string nnn = Pullenti.Morph.MorphologyService.GetWordform("КК" + (t as Pullenti.Ner.TextToken).Term, mi);
                if (nnn != null)
                {
                    norm = nnn.Substring(2);
                }
            }
        }
        it.Normal = norm;
        res.Items.Add(it);
        // The first verb-like item supplies the morphology of the whole phrase.
        if (!hasVerb && ((ty == 1 || ty == 3)))
        {
            res.Morph = it.Morph;
            hasVerb = true;
        }
        if (ty == 1 || ty == 3)
        {
            if (ty == 1 && tt.IsVerbBe)
            {
                verbBeBefore = true;
            }
            else
            {
                verbBeBefore = false;
            }
        }
    }
    if (!hasVerb)
    {
        return(null);
    }
    // Drop trailing adverbs (the first item is never removed) and pull the end token back.
    for (int i = res.Items.Count - 1; i > 0; i--)
    {
        if (res.Items[i].IsAdverb)
        {
            res.Items.RemoveAt(i);
            res.EndToken = res.Items[i - 1].EndToken;
        }
        else
        {
            break;
        }
    }
    return(res);
}
/// <summary>
/// Main keyword-extraction routine. Pass 1 walks the tokens and embeds keyword referents:
/// predicates for stand-alone verbs, objects for the words of each noun phrase, plus a
/// combined object when a phrase yields several words. Pass 2 unites adjacent object
/// keywords joined by "OF" or by a genitive. Optionally sorts the referents by rank and
/// adds an auto-annotation.
/// </summary>
/// <param name="kit">analysis kit holding the token chain being processed</param>
public override void Process(Pullenti.Ner.Core.AnalysisKit kit)
{
    Pullenti.Ner.Core.AnalyzerData ad = kit.GetAnalyzerData(this);

    // Run a DenominationAnalyzer over the kit when the processor has no enabled one.
    bool hasDenoms = false;
    foreach (Pullenti.Ner.Analyzer a in kit.Processor.Analyzers)
    {
        if ((a is Pullenti.Ner.Denomination.DenominationAnalyzer) && !a.IgnoreThisAnalyzer)
        {
            hasDenoms = true;
        }
    }
    if (!hasDenoms)
    {
        Pullenti.Ner.Denomination.DenominationAnalyzer a = new Pullenti.Ner.Denomination.DenominationAnalyzer();
        a.Process(kit);
    }

    List<KeywordReferent> li = new List<KeywordReferent>();
    StringBuilder tmp = new StringBuilder();
    List<string> tmp2 = new List<string>();

    // Total token count; passed to _setRank together with the current position.
    int max = 0;
    for (Pullenti.Ner.Token t = kit.FirstToken; t != null; t = t.Next)
    {
        max++;
    }

    // ---- Pass 1: single-word keywords and noun-phrase keywords ----
    int cur = 0;
    for (Pullenti.Ner.Token t = kit.FirstToken; t != null; t = t.Next, cur++)
    {
        Pullenti.Ner.Referent r = t.GetReferent();
        if (r != null)
        {
            t = this._addReferents(ad, t, cur, max);
            continue;
        }
        if (!(t is Pullenti.Ner.TextToken))
        {
            continue;
        }
        if (!t.Chars.IsLetter || (t.LengthChar < 3))
        {
            continue;
        }
        string term = (t as Pullenti.Ner.TextToken).Term;
        if (term == "ЕСТЬ")
        {
            // "ЕСТЬ" is considered only when the previous text token is a verb.
            if (!((t.Previous is Pullenti.Ner.TextToken) && t.Previous.Morph.Class.IsVerb))
            {
                continue;
            }
        }

        Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.AdjectiveCanBeLast | Pullenti.Ner.Core.NounPhraseParseAttr.ParsePreposition, 0, null);
        if (npt == null)
        {
            // Not a noun phrase: a verb (except "to be", "МОЧЬ", "WOULD") becomes a predicate keyword.
            Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary();
            if (mc.IsVerb && !mc.IsPreposition)
            {
                if ((t as Pullenti.Ner.TextToken).IsVerbBe)
                {
                    continue;
                }
                if (t.IsValue("МОЧЬ", null) || t.IsValue("WOULD", null))
                {
                    continue;
                }
                KeywordReferent kref = new KeywordReferent() { Typ = KeywordType.Predicate };
                string norm = t.GetNormalCaseText(Pullenti.Morph.MorphClass.Verb, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
                if (norm == null)
                {
                    norm = (t as Pullenti.Ner.TextToken).Lemma;
                }
                // Cut the last two characters of a form ending in "ЬСЯ".
                if (norm.EndsWith("ЬСЯ"))
                {
                    norm = norm.Substring(0, norm.Length - 2);
                }
                kref.AddSlot(KeywordReferent.ATTR_VALUE, norm, false, 0);
                List<Pullenti.Semantic.Utils.DerivateGroup> drv = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm, true, t.Morph.Language);
                _addNormals(kref, drv, norm);
                kref = ad.RegisterReferent(kref) as KeywordReferent;
                _setRank(kref, cur, max);
                Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(ad.RegisterReferent(kref), t, t) { Morph = t.Morph };
                kit.EmbedToken(rt1);
                t = rt1;
                continue;
            }
            continue;
        }

        if (npt.InternalNoun != null)
        {
            continue;
        }
        // Skip prepositional phrases ending in "ЦЕЛОМ" / "ЧАСТНОСТИ" and "С ... СТОРОНЫ".
        if (npt.EndToken.IsValue("ЦЕЛОМ", null) || npt.EndToken.IsValue("ЧАСТНОСТИ", null))
        {
            if (npt.Preposition != null)
            {
                t = npt.EndToken;
                continue;
            }
        }
        if (npt.EndToken.IsValue("СТОРОНЫ", null) && npt.Preposition != null && npt.Preposition.Normal == "С")
        {
            t = npt.EndToken;
            continue;
        }
        if (npt.BeginToken == npt.EndToken)
        {
            Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary();
            if (mc.IsPreposition)
            {
                continue;
            }
            else if (mc.IsAdverb)
            {
                if (t.IsValue("ПОТОМ", null))
                {
                    continue;
                }
            }
        }

        // One object keyword per significant word of the noun phrase.
        li.Clear();
        Pullenti.Ner.Token t0 = t;
        for (Pullenti.Ner.Token tt = t; tt != null && tt.EndChar <= npt.EndChar; tt = tt.Next)
        {
            if (!(tt is Pullenti.Ner.TextToken))
            {
                continue;
            }
            if ((tt.LengthChar < 3) || !tt.Chars.IsLetter)
            {
                continue;
            }
            Pullenti.Morph.MorphClass mc = tt.GetMorphClassInDictionary();
            if ((mc.IsPreposition || mc.IsPronoun || mc.IsPersonalPronoun) || mc.IsConjunction)
            {
                // Function words are skipped, with "ОТНОШЕНИЕ" as the only exception.
                if (!tt.IsValue("ОТНОШЕНИЕ", null))
                {
                    continue;
                }
            }
            if (mc.IsMisc)
            {
                if (Pullenti.Ner.Core.MiscHelper.IsEngArticle(tt))
                {
                    continue;
                }
            }
            KeywordReferent kref = new KeywordReferent() { Typ = KeywordType.Object };
            string norm = (tt as Pullenti.Ner.TextToken).Lemma;
            kref.AddSlot(KeywordReferent.ATTR_VALUE, norm, false, 0);
            if (norm != "ЕСТЬ")
            {
                List<Pullenti.Semantic.Utils.DerivateGroup> drv = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm, true, tt.Morph.Language);
                _addNormals(kref, drv, norm);
            }
            kref = ad.RegisterReferent(kref) as KeywordReferent;
            _setRank(kref, cur, max);
            Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(kref, tt, tt) { Morph = tt.Morph };
            kit.EmbedToken(rt1);
            // Remember the embedded replacement of the phrase's first token as the span start.
            if (tt == t && li.Count == 0)
            {
                t0 = rt1;
            }
            t = rt1;
            li.Add(kref);
        }

        // Several words: add a combined keyword that references each of them.
        if (li.Count > 1)
        {
            KeywordReferent kref = new KeywordReferent() { Typ = KeywordType.Object };
            tmp2.Clear();
            foreach (KeywordReferent kw in li)
            {
                string s = kw.GetStringValue(KeywordReferent.ATTR_VALUE);
                string n = kw.GetStringValue(KeywordReferent.ATTR_NORMAL);
                tmp2.Add(n != null ? n : s);
                kref.AddSlot(KeywordReferent.ATTR_REF, kw, false, 0);
            }
            string val = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
            kref.AddSlot(KeywordReferent.ATTR_VALUE, val, false, 0);

            // The normal form is the sorted, space-joined list of the parts' normal forms.
            tmp.Length = 0;
            tmp2.Sort();
            foreach (string s in tmp2)
            {
                if (tmp.Length > 0)
                {
                    tmp.Append(' ');
                }
                tmp.Append(s);
            }
            string norm = tmp.ToString();
            if (norm != val)
            {
                kref.AddSlot(KeywordReferent.ATTR_NORMAL, norm, false, 0);
            }
            kref = ad.RegisterReferent(kref) as KeywordReferent;
            _setRank(kref, cur, max);
            Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(kref, t0, t) { Morph = npt.Morph };
            kit.EmbedToken(rt1);
            t = rt1;
        }
    }

    // ---- Pass 2: unite "X of Y" / "X <genitive Y>" pairs of object keywords ----
    cur = 0;
    for (Pullenti.Ner.Token t = kit.FirstToken; t != null; t = t.Next, cur++)
    {
        KeywordReferent kw = t.GetReferent() as KeywordReferent;
        if (kw == null || kw.Typ != KeywordType.Object)
        {
            continue;
        }
        if (t.Next == null || kw.ChildWords > 2)
        {
            continue;
        }
        Pullenti.Ner.Token t1 = t.Next;
        if (t1.IsValue("OF", null) && (t1.WhitespacesAfterCount < 3) && t1.Next != null)
        {
            t1 = t1.Next;
            if ((t1 is Pullenti.Ner.TextToken) && Pullenti.Ner.Core.MiscHelper.IsEngArticle(t1) && t1.Next != null)
            {
                t1 = t1.Next;
            }
        }
        else if (!t1.Morph.Case.IsGenitive || t.WhitespacesAfterCount > 1)
        {
            continue;
        }
        KeywordReferent kw2 = t1.GetReferent() as KeywordReferent;
        if (kw2 == null)
        {
            continue;
        }
        if (kw == kw2)
        {
            continue;
        }
        if (kw2.Typ != KeywordType.Object || (kw.ChildWords + kw2.ChildWords) > 3)
        {
            continue;
        }
        KeywordReferent kwUn = new KeywordReferent();
        kwUn.Union(kw, kw2, Pullenti.Ner.Core.MiscHelper.GetTextValue(t1, t1, Pullenti.Ner.Core.GetTextAttr.No));
        kwUn = ad.RegisterReferent(kwUn) as KeywordReferent;
        _setRank(kwUn, cur, max);
        Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(kwUn, t, t1) { Morph = t.Morph };
        kit.EmbedToken(rt1);
        t = rt1;
    }

    if (SortKeywordsByRank)
    {
        List<Pullenti.Ner.Referent> all = new List<Pullenti.Ner.Referent>(ad.Referents);
        all.Sort(new CompByRank());
        ad.Referents = all;
    }
    if (AnnotationMaxSentences > 0)
    {
        KeywordReferent ano = Pullenti.Ner.Keyword.Internal.AutoannoSentToken.CreateAnnotation(kit, AnnotationMaxSentences);
        if (ano != null)
        {
            ad.RegisterReferent(ano);
        }
    }
}
/// <summary>
/// Tries to recover a sequence framed by quotes or brackets. Nesting, a possibly
/// missing closing element and similar cases are supported.
/// </summary>
/// <param name="t">start token</param>
/// <param name="attrs">parsing options</param>
/// <param name="maxTokens">maximum number of tokens to scan (in case the closing quote was forgotten)</param>
/// <returns>a BracketSequenceToken metatoken, or null when no sequence starts at <paramref name="t"/></returns>
public static BracketSequenceToken TryParse(Pullenti.Ner.Token t, BracketParseAttr attrs = BracketParseAttr.No, int maxTokens = 100)
{
    Pullenti.Ner.Token t0 = t;
    if (!CanBeStartOfSequence(t0, false, false))
    {
        return null;
    }

    // brList collects every bracket/quote met during the scan; brList[0] is the opening one.
    List<Bracket> brList = new List<Bracket>();
    brList.Add(new Bracket(t0));
    int cou = 0;
    int crlf = 0;
    Pullenti.Ner.Token last = null;
    int lev = 1;
    // Asymmetric opening char (other than '«'): nesting depth is tracked in lev.
    bool isAssim = brList[0].Char != '«' && m_AssymOPenChars.IndexOf(brList[0].Char) >= 0;
    bool genCase = false;

    // ---- Phase 1: scan forward, collecting brackets and deciding where to stop ----
    for (t = t0.Next; t != null; t = t.Next)
    {
        if (t.IsTableControlChar)
        {
            break;
        }
        last = t;
        if (t.IsCharOf(m_OpenChars) || t.IsCharOf(m_CloseChars))
        {
            if (t.IsNewlineBefore && ((attrs & BracketParseAttr.CanBeManyLines)) == BracketParseAttr.No)
            {
                if (t.WhitespacesBeforeCount > 10 || CanBeStartOfSequence(t, false, false))
                {
                    // A '(' opening a new line is tolerated unless the sequence itself started with '('.
                    if (!(t.IsChar('(') && !t0.IsChar('(')))
                    {
                        last = t.Previous;
                        break;
                    }
                }
            }
            Bracket bb = new Bracket(t);
            brList.Add(bb);
            if (brList.Count > 20)
            {
                break;
            }
            if ((brList.Count == 3 && brList[1].CanBeOpen && bb.CanBeClose) && MustBeCloseChar(bb.Char, brList[1].Char) && MustBeCloseChar(bb.Char, brList[0].Char))
            {
                // The third bracket closes an inner sequence and could close the outer one too:
                // keep scanning only if another bracket follows on the same line.
                bool ok = false;
                for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
                {
                    if (tt.IsNewlineBefore)
                    {
                        break;
                    }
                    if (tt.IsChar(','))
                    {
                        break;
                    }
                    if (tt.IsChar('.'))
                    {
                        for (tt = tt.Next; tt != null; tt = tt.Next)
                        {
                            if (tt.IsNewlineBefore)
                            {
                                break;
                            }
                            else if (tt.IsCharOf(m_OpenChars) || tt.IsCharOf(m_CloseChars))
                            {
                                Bracket bb2 = new Bracket(tt);
                                if (BracketHelper.CanBeEndOfSequence(tt, false, null, false) && CanBeCloseChar(bb2.Char, brList[0].Char))
                                {
                                    ok = true;
                                }
                                break;
                            }
                        }
                        break;
                    }
                    // Fixed: the look-ahead token tt is tested here. The previous code tested t,
                    // which is always a bracket inside this branch, so the check was always true.
                    if (tt.IsCharOf(m_OpenChars) || tt.IsCharOf(m_CloseChars))
                    {
                        ok = true;
                        break;
                    }
                }
                if (!ok)
                {
                    break;
                }
            }
            if (isAssim)
            {
                if (bb.CanBeOpen && !bb.CanBeClose && bb.Char == brList[0].Char)
                {
                    lev++;
                }
                else if (bb.CanBeClose && !bb.CanBeOpen && m_OpenChars.IndexOf(brList[0].Char) == m_CloseChars.IndexOf(bb.Char))
                {
                    lev--;
                    if (lev == 0)
                    {
                        break;
                    }
                }
            }
        }
        else
        {
            if ((++cou) > maxTokens)
            {
                break;
            }
            if (((attrs & BracketParseAttr.CanContainsVerbs)) == BracketParseAttr.No)
            {
                if (t.Morph.Language.IsCyrillic)
                {
                    // A lower-case pure verb (not ending in "СЯ") ends the scan,
                    // unless the only bracket so far is an opening '('.
                    if (t.GetMorphClassInDictionary() == Pullenti.Morph.MorphClass.Verb)
                    {
                        if (!t.Morph.Class.IsAdjective && !t.Morph.ContainsAttr("страд.з.", null))
                        {
                            if (t.Chars.IsAllLower)
                            {
                                string norm = t.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
                                if (!Pullenti.Morph.LanguageHelper.EndsWith(norm, "СЯ"))
                                {
                                    if (brList.Count > 1)
                                    {
                                        break;
                                    }
                                    if (brList[0].Char != '(')
                                    {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }
                else if (t.Morph.Language.IsEn)
                {
                    if (t.Morph.Class == Pullenti.Morph.MorphClass.Verb && t.Chars.IsAllLower)
                    {
                        break;
                    }
                }
                // An address referent ends the scan unless the sequence started with '('.
                Pullenti.Ner.Referent r = t.GetReferent();
                if (r != null && r.TypeName == "ADDRESS")
                {
                    if (!t0.IsChar('('))
                    {
                        break;
                    }
                }
            }
        }

        if (((attrs & BracketParseAttr.CanBeManyLines)) != BracketParseAttr.No)
        {
            // Multi-line mode: only an empty line (more than one newline) stops the scan.
            if (t.IsNewlineBefore)
            {
                if (t.NewlinesBeforeCount > 1)
                {
                    break;
                }
                crlf++;
            }
            continue;
        }

        if (t.IsNewlineBefore)
        {
            if (t.WhitespacesBeforeCount > 15)
            {
                last = t.Previous;
                break;
            }
            crlf++;
            if (!t.Chars.IsAllLower)
            {
                if (MiscHelper.CanBeStartOfSentence(t))
                {
                    // A new sentence on a new line is accepted only when a closing char
                    // (attached to the preceding text) follows on that line.
                    bool has = false;
                    for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
                    {
                        if (tt.IsNewlineBefore)
                        {
                            break;
                        }
                        else if (tt.LengthChar == 1 && tt.IsCharOf(m_OpenChars) && tt.IsWhitespaceBefore)
                        {
                            break;
                        }
                        else if (tt.LengthChar == 1 && tt.IsCharOf(m_CloseChars) && !tt.IsWhitespaceBefore)
                        {
                            has = true;
                            break;
                        }
                    }
                    if (!has)
                    {
                        last = t.Previous;
                        break;
                    }
                }
            }
            if ((t.Previous is Pullenti.Ner.MetaToken) && CanBeEndOfSequence((t.Previous as Pullenti.Ner.MetaToken).EndToken, false, null, false))
            {
                last = t.Previous;
                break;
            }
        }
        if (crlf > 1)
        {
            if (brList.Count > 1)
            {
                break;
            }
            if (crlf > 10)
            {
                break;
            }
        }
        if (t.IsChar(';') && t.IsNewlineAfter)
        {
            break;
        }
        // Jump over a whole noun phrase; remember whether a line started with a genitive one.
        NounPhraseToken npt = NounPhraseHelper.TryParse(t, NounPhraseParseAttr.No, 0, null);
        if (npt != null)
        {
            if (t.IsNewlineBefore)
            {
                genCase = npt.Morph.Case.IsGenitive;
            }
            last = (t = npt.EndToken);
        }
    }

    // ---- Phase 2: no closing bracket found - try to end the sequence at the end of a line ----
    if ((brList.Count == 1 && brList[0].CanBeOpen && (last is Pullenti.Ner.MetaToken)) && last.IsNewlineAfter)
    {
        if (BracketHelper.CanBeEndOfSequence((last as Pullenti.Ner.MetaToken).EndToken, false, null, false))
        {
            return new BracketSequenceToken(t0, last);
        }
    }
    if ((brList.Count == 1 && brList[0].CanBeOpen && genCase) && last.IsNewlineAfter && crlf <= 2)
    {
        return new BracketSequenceToken(t0, last);
    }
    if (brList.Count < 1)
    {
        return null;
    }

    // ---- Phase 3: reduce the collected bracket list ----
    // Adjacent '<' '>' always form an open/close pair.
    for (int i = 1; i < (brList.Count - 1); i++)
    {
        if (brList[i].Char == '<' && brList[i + 1].Char == '>')
        {
            brList[i].CanBeOpen = true;
            brList[i + 1].CanBeClose = true;
        }
    }
    List<BracketSequenceToken> internals = null;
    // Drop trailing pairs that close each other but cannot close the opening bracket.
    while (brList.Count > 3)
    {
        int i = brList.Count - 1;
        if ((brList[i].CanBeClose && brList[i - 1].CanBeOpen && !CanBeCloseChar(brList[i].Char, brList[0].Char)) && CanBeCloseChar(brList[i].Char, brList[i - 1].Char))
        {
            brList.RemoveRange(brList.Count - 2, 2);
            continue;
        }
        break;
    }
    // Fold unambiguous inner open/close pairs into "internals".
    while (brList.Count >= 4)
    {
        bool changed = false;
        for (int i = 1; i < (brList.Count - 2); i++)
        {
            if ((brList[i].CanBeOpen && !brList[i].CanBeClose && brList[i + 1].CanBeClose) && !brList[i + 1].CanBeOpen)
            {
                bool ok = false;
                if (MustBeCloseChar(brList[i + 1].Char, brList[i].Char) || brList[i].Char != brList[0].Char)
                {
                    ok = true;
                    if ((i == 1 && ((i + 2) < brList.Count) && brList[i + 2].Char == ')') && brList[i + 1].Char != ')' && CanBeCloseChar(brList[i + 1].Char, brList[i - 1].Char))
                    {
                        brList[i + 2] = brList[i + 1];
                    }
                }
                else if (i > 1 && ((i + 2) < brList.Count) && MustBeCloseChar(brList[i + 2].Char, brList[i - 1].Char))
                {
                    ok = true;
                }
                if (ok)
                {
                    if (internals == null)
                    {
                        internals = new List<BracketSequenceToken>();
                    }
                    internals.Add(new BracketSequenceToken(brList[i].Source, brList[i + 1].Source));
                    brList.RemoveRange(i, 2);
                    changed = true;
                    break;
                }
            }
        }
        if (!changed)
        {
            break;
        }
    }

    // ---- Phase 4: choose the closing bracket ----
    BracketSequenceToken res = null;
    if ((brList.Count >= 4 && brList[1].CanBeOpen && brList[2].CanBeClose) && brList[3].CanBeClose && !brList[3].CanBeOpen)
    {
        if (CanBeCloseChar(brList[3].Char, brList[0].Char))
        {
            res = new BracketSequenceToken(brList[0].Source, brList[3].Source);
            if (brList[0].Source.Next != brList[1].Source || brList[2].Source.Next != brList[3].Source)
            {
                res.Internal.Add(new BracketSequenceToken(brList[1].Source, brList[2].Source));
            }
            if (internals != null)
            {
                res.Internal.AddRange(internals);
            }
        }
    }
    if ((res == null && brList.Count >= 3 && brList[2].CanBeClose) && !brList[2].CanBeOpen)
    {
        if (((attrs & BracketParseAttr.NearCloseBracket)) != BracketParseAttr.No)
        {
            if (CanBeCloseChar(brList[1].Char, brList[0].Char))
            {
                return new BracketSequenceToken(brList[0].Source, brList[1].Source);
            }
        }
        bool ok = true;
        if (CanBeCloseChar(brList[2].Char, brList[0].Char) && CanBeCloseChar(brList[1].Char, brList[0].Char) && brList[1].CanBeClose)
        {
            // Both the 2nd and the 3rd bracket could close the sequence: inspect the text between them.
            for (t = brList[1].Source; t != brList[2].Source && t != null; t = t.Next)
            {
                if (t.IsNewlineBefore)
                {
                    ok = false;
                    break;
                }
                if (t.Chars.IsLetter && t.Chars.IsAllLower)
                {
                    ok = false;
                    break;
                }
                NounPhraseToken npt = NounPhraseHelper.TryParse(t, NounPhraseParseAttr.No, 0, null);
                if (npt != null)
                {
                    t = npt.EndToken;
                }
            }
            if (ok)
            {
                for (t = brList[0].Source.Next; t != brList[1].Source && t != null; t = t.Next)
                {
                    if (t.IsNewlineBefore)
                    {
                        return new BracketSequenceToken(brList[0].Source, t.Previous);
                    }
                }
            }
            // Look back along the current line for an unbalanced bracket of the same family.
            int lev1 = 0;
            for (Pullenti.Ner.Token tt = brList[0].Source.Previous; tt != null; tt = tt.Previous)
            {
                if (tt.IsNewlineAfter || tt.IsTableControlChar)
                {
                    break;
                }
                if (!(tt is Pullenti.Ner.TextToken))
                {
                    continue;
                }
                if (tt.Chars.IsLetter || tt.LengthChar > 1)
                {
                    continue;
                }
                char ch = (tt as Pullenti.Ner.TextToken).Term[0];
                if (CanBeCloseChar(ch, brList[0].Char))
                {
                    lev1++;
                }
                else if (CanBeCloseChar(brList[1].Char, ch))
                {
                    lev1--;
                    if (lev1 < 0)
                    {
                        return new BracketSequenceToken(brList[0].Source, brList[1].Source);
                    }
                }
            }
        }
        if (ok && CanBeCloseChar(brList[2].Char, brList[0].Char))
        {
            BracketSequenceToken intern = new BracketSequenceToken(brList[1].Source, brList[2].Source);
            res = new BracketSequenceToken(brList[0].Source, brList[2].Source);
            res.Internal.Add(intern);
        }
        else if (ok && CanBeCloseChar(brList[2].Char, brList[1].Char) && brList[0].CanBeOpen)
        {
            if (CanBeCloseChar(brList[2].Char, brList[0].Char))
            {
                BracketSequenceToken intern = new BracketSequenceToken(brList[1].Source, brList[2].Source);
                res = new BracketSequenceToken(brList[0].Source, brList[2].Source);
                res.Internal.Add(intern);
            }
            else if (brList.Count == 3)
            {
                return null;
            }
        }
    }
    if (res == null && brList.Count > 1 && brList[1].CanBeClose)
    {
        res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
    }
    if (res == null && brList.Count > 1 && CanBeCloseChar(brList[1].Char, brList[0].Char))
    {
        res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
    }
    if (res == null && brList.Count == 2 && brList[0].Char == brList[1].Char)
    {
        res = new BracketSequenceToken(brList[0].Source, brList[1].Source);
    }
    if (res != null && internals != null)
    {
        foreach (BracketSequenceToken i in internals)
        {
            // Skip items already attached above so that no internal sequence is listed twice.
            if (i.BeginChar < res.EndChar && !res.Internal.Contains(i))
            {
                res.Internal.Add(i);
            }
        }
    }

    // ---- Phase 5: the closing bracket may be hidden at the end of a following metatoken ----
    if (res == null)
    {
        cou = 0;
        for (Pullenti.Ner.Token tt = t0.Next; tt != null; tt = tt.Next, cou++)
        {
            if (tt.IsTableControlChar)
            {
                break;
            }
            if (MiscHelper.CanBeStartOfSentence(tt))
            {
                break;
            }
            if (maxTokens > 0 && cou > maxTokens)
            {
                break;
            }
            Pullenti.Ner.MetaToken mt = tt as Pullenti.Ner.MetaToken;
            if (mt == null)
            {
                continue;
            }
            if (mt.EndToken is Pullenti.Ner.TextToken)
            {
                if ((mt.EndToken as Pullenti.Ner.TextToken).IsCharOf(m_CloseChars))
                {
                    Bracket bb = new Bracket(mt.EndToken as Pullenti.Ner.TextToken);
                    if (bb.CanBeClose && CanBeCloseChar(bb.Char, brList[0].Char))
                    {
                        return new BracketSequenceToken(t0, tt);
                    }
                }
            }
        }
    }
    return res;
}
/// <summary>
/// Builds the textual name of the token span from <paramref name="begin"/> to <paramref name="end"/>.
/// When BracketHelper reports that the span starts and ends with a matching bracket pair,
/// the two outer tokens are dropped before the name is built.
/// </summary>
/// <param name="begin">first token of the span</param>
/// <param name="end">last token of the span</param>
/// <param name="normalizeFirstNounGroup">when true, a leading noun phrase (or, failing that, a leading
/// Cyrillic dictionary word) is rendered through GetNormalCaseText and the rest is appended via GetNameEx</param>
/// <param name="normalFirstGroupSingle">when true, the leading noun phrase is requested in singular number</param>
/// <param name="ignoreGeoReferent">forwarded to GetNameEx</param>
/// <returns>the name with trailing '*' and white-space characters removed; null when nothing else
/// remains; a null or empty result of GetNameEx is returned unchanged</returns>
static string GetNameWithoutBrackets(Pullenti.Ner.Token begin, Pullenti.Ner.Token end, bool normalizeFirstNounGroup = false, bool normalFirstGroupSingle = false, bool ignoreGeoReferent = false)
{
    // Step inside the span when it is wrapped in a bracket pair.
    if (BracketHelper.CanBeStartOfSequence(begin, false, false) && BracketHelper.CanBeEndOfSequence(end, false, begin, false))
    {
        begin = begin.Next;
        end = end.Previous;
    }

    string name = null;
    if (normalizeFirstNounGroup && !begin.Morph.Class.IsPreposition)
    {
        NounPhraseToken phrase = NounPhraseHelper.TryParse(begin, NounPhraseParseAttr.ReferentCanBeNoun, 0, null);

        // A phrase whose noun is not in the dictionary and that has no adjectives is not used.
        if (phrase != null && phrase.Noun.GetMorphClassInDictionary().IsUndefined && phrase.Adjectives.Count == 0)
        {
            phrase = null;
        }
        // Neither is a phrase that runs past the end of the span.
        if (phrase != null && phrase.EndToken.EndChar > end.EndChar)
        {
            phrase = null;
        }

        if (phrase != null)
        {
            Pullenti.Morph.MorphNumber wantedNumber = normalFirstGroupSingle ? Pullenti.Morph.MorphNumber.Singular : Pullenti.Morph.MorphNumber.Undefined;
            name = phrase.GetNormalCaseText(null, wantedNumber, Pullenti.Morph.MorphGender.Undefined, false);
            Pullenti.Ner.Token rest = phrase.EndToken.Next;

            // Look for ", <word>" right after the phrase where the word is both verb and adjective
            // by morphology and still lies inside the span.
            Pullenti.Ner.TextToken participle = null;
            if (rest != null && rest.Next != null && rest.IsComma)
            {
                Pullenti.Ner.TextToken candidate = rest.Next as Pullenti.Ner.TextToken;
                if (candidate != null && candidate.EndChar <= end.EndChar && candidate.Morph.Class.IsVerb && candidate.Morph.Class.IsAdjective)
                {
                    participle = candidate;
                }
            }

            if (participle != null)
            {
                foreach (Pullenti.Morph.MorphBaseInfo form in participle.Morph.Items)
                {
                    bool genderAgrees = form.Gender == phrase.Morph.Gender || (form.Gender & phrase.Morph.Gender) != Pullenti.Morph.MorphGender.Undefined;
                    if (!genderAgrees)
                    {
                        continue;
                    }
                    if ((form.Case & phrase.Morph.Case).IsUndefined)
                    {
                        continue;
                    }
                    bool numberAgrees = form.Number == phrase.Morph.Number || (form.Number & phrase.Morph.Number) != Pullenti.Morph.MorphNumber.Undefined;
                    if (!numberAgrees)
                    {
                        continue;
                    }

                    // Only the first agreeing variant is tried: the loop is left below
                    // whether or not a word form could be generated.
                    Pullenti.Morph.MorphWordForm wordForm = form as Pullenti.Morph.MorphWordForm;
                    string adjective = wordForm != null ? wordForm.NormalCase : participle.Term;

                    Pullenti.Morph.MorphBaseInfo target = new Pullenti.Morph.MorphBaseInfo();
                    target.Class = Pullenti.Morph.MorphClass.Adjective;
                    target.Gender = phrase.Morph.Gender;
                    target.Number = phrase.Morph.Number;
                    target.Language = phrase.Morph.Language;

                    adjective = Pullenti.Morph.MorphologyService.GetWordform(adjective, target);
                    if (adjective != null)
                    {
                        name = string.Format("{0}, {1}", name, adjective);
                        // Continue after the consumed word.
                        rest = participle.Next;
                    }
                    break;
                }
            }

            // Append whatever is left of the span after the normalized head.
            if (rest != null && rest.EndChar <= end.EndChar)
            {
                string tail = GetNameEx(rest, end, Pullenti.Morph.MorphClass.Undefined, Pullenti.Morph.MorphCase.Undefined, Pullenti.Morph.MorphGender.Undefined, true, ignoreGeoReferent);
                if (!string.IsNullOrEmpty(tail))
                {
                    // No separating space when the tail starts with a non-alphanumeric character.
                    string pattern = char.IsLetterOrDigit(tail[0]) ? "{0} {1}" : "{0}{1}";
                    name = string.Format(pattern, name, tail);
                }
            }
        }
        else if ((begin is Pullenti.Ner.TextToken) && begin.Chars.IsCyrillicLetter)
        {
            // No usable noun phrase: normalize just the first word if the dictionary knows it.
            Pullenti.Morph.MorphClass dictClass = begin.GetMorphClassInDictionary();
            if (!dictClass.IsUndefined)
            {
                name = begin.GetNormalCaseText(dictClass, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
                if (begin.EndChar < end.EndChar)
                {
                    // NOTE(review): the last argument is a literal false here, whereas the other
                    // GetNameEx calls forward ignoreGeoReferent - confirm this is intended.
                    string remainder = GetNameEx(begin.Next, end, Pullenti.Morph.MorphClass.Undefined, Pullenti.Morph.MorphCase.Undefined, Pullenti.Morph.MorphGender.Undefined, true, false);
                    name = string.Format("{0} {1}", name, remainder);
                }
            }
        }
    }

    // Nothing was normalized: take the whole span as is.
    if (name == null)
    {
        name = GetNameEx(begin, end, Pullenti.Morph.MorphClass.Undefined, Pullenti.Morph.MorphCase.Undefined, Pullenti.Morph.MorphGender.Undefined, true, ignoreGeoReferent);
    }

    if (!string.IsNullOrEmpty(name))
    {
        // Find how many leading characters survive once trailing '*' and white-space are cut.
        int keep = name.Length;
        while (keep > 0 && (name[keep - 1] == '*' || char.IsWhiteSpace(name[keep - 1])))
        {
            keep--;
        }
        if (keep == 0)
        {
            return null;
        }
        if (keep < name.Length)
        {
            name = name.Substring(0, keep);
        }
    }
    return name;
}