/// <summary>
/// Tries to recognise a weapon item starting at <paramref name="t"/>.
/// </summary>
/// <remarks>
/// The token is first handed to <c>_TryParse</c>. When that fails, a noun phrase is parsed
/// at <paramref name="t"/> and <c>_TryParse</c> is retried on the phrase's noun; the item is
/// accepted only when it has type <c>Typs.Noun</c> and the normalised phrase text ends with
/// the item's value. When the direct attempt succeeds with type <c>Typs.Name</c>, a following
/// parenthesised sequence is absorbed as the alternative value if
/// <c>MiscHelper.CanBeEqualCyrAndLatSS</c> accepts it.
/// </remarks>
/// <param name="t">Token to start from.</param>
/// <param name="prev">Previously parsed item; passed through to <c>_TryParse</c>.</param>
/// <param name="afterConj">Passed through to <c>_TryParse</c>.</param>
/// <param name="attachHigh">Passed through to <c>_TryParse</c>.</param>
/// <returns>The parsed item, or <c>null</c> when nothing could be attached.</returns>
public static WeaponItemToken TryParse(Pullenti.Ner.Token t, WeaponItemToken prev, bool afterConj, bool attachHigh = false)
{
    WeaponItemToken res = _TryParse(t, prev, afterConj, attachHigh);
    if (res != null)
    {
        if (res.Typ == Typs.Name)
        {
            // A name may be followed by "(...)" holding the same name in the other alphabet.
            Pullenti.Ner.Core.BracketSequenceToken br = Pullenti.Ner.Core.BracketHelper.TryParse(res.EndToken.Next, Pullenti.Ner.Core.BracketParseAttr.No, 100);
            if (br != null && br.IsChar('('))
            {
                string alt = Pullenti.Ner.Core.MiscHelper.GetTextValueOfMetaToken(br, Pullenti.Ner.Core.GetTextAttr.No);
                if (Pullenti.Ner.Core.MiscHelper.CanBeEqualCyrAndLatSS(res.Value, alt))
                {
                    res.AltValue = alt;
                    res.EndToken = br.EndToken;
                }
            }
        }
        return res;
    }

    // Fallback: the item may be the noun of a phrase that starts with modifiers.
    Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
    if (npt == null || npt.Noun.BeginChar <= npt.BeginChar)
    {
        return null;
    }
    res = _TryParse(npt.Noun.BeginToken, prev, afterConj, attachHigh);
    if (res == null || res.Typ != Typs.Noun)
    {
        return null;
    }

    string str = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
    if (str == "РУЧНОЙ ГРАНАТ")
    {
        // Hard-coded correction of this particular normalisation result.
        str = "РУЧНАЯ ГРАНАТА";
    }

    // Ordinal comparison: the prefix below is cut with plain length arithmetic, so the
    // suffix match must be character-exact; a null value can never match.
    string text = str ?? "";
    if (res.Value == null || !text.EndsWith(res.Value, System.StringComparison.Ordinal))
    {
        return null;
    }

    if (res.AltValue == null)
    {
        res.AltValue = str;
    }
    else
    {
        // Keep only the modifiers preceding the noun and prepend them to the existing alternative.
        string prefix = text.Substring(0, text.Length - res.Value.Length).Trim();
        res.AltValue = string.Format("{0} {1}", prefix, res.AltValue);
    }
    res.BeginToken = t;
    return res;
}
/// <summary>
/// Tries to recognise a title-page item (theme marker, translation marker, section header,
/// speciality, document type, or a term from <c>m_Termins</c>) starting at <paramref name="t"/>.
/// </summary>
/// <param name="t">Token to start from.</param>
/// <returns>The recognised item, or <c>null</c> when nothing matches.</returns>
public static TitleItemToken TryAttach(Pullenti.Ner.Token t)
{
    Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
    if (tt != null)
    {
        Pullenti.Ner.Token t1 = tt;

        // "ТЕМА [<type>] [:]"
        if (tt.Term == "ТЕМА")
        {
            TitleItemToken tit = TryAttach(tt.Next);
            if (tit != null && tit.Typ == Types.Typ)
            {
                t1 = tit.EndToken;
                if (t1.Next != null && t1.Next.IsChar(':'))
                {
                    t1 = t1.Next;
                }
                return new TitleItemToken(t, t1, Types.TypAndTheme) { Value = tit.Value };
            }
            if (tt.Next != null && tt.Next.IsChar(':'))
            {
                t1 = tt.Next;
            }
            return new TitleItemToken(tt, t1, Types.Theme);
        }

        // "ПО ТЕМЕ [:]" / "НА ТЕМУ [:]"
        if (tt.Term == "ПО" || tt.Term == "НА")
        {
            if (tt.Next != null && tt.Next.IsValue("ТЕМА", null))
            {
                t1 = tt.Next;
                if (t1.Next != null && t1.Next.IsChar(':'))
                {
                    t1 = t1.Next;
                }
                return new TitleItemToken(tt, t1, Types.Theme);
            }
        }

        // "ПЕРЕВОД С <word>" / "ПЕР. С <word>"
        if (tt.Term == "ПЕРЕВОД" || tt.Term == "ПЕР")
        {
            Pullenti.Ner.Token tt2 = tt.Next;
            if (tt2 != null && tt2.IsChar('.'))
            {
                tt2 = tt2.Next;
            }
            Pullenti.Ner.TextToken from = tt2 as Pullenti.Ner.TextToken;
            // Both literals are compared on purpose: the source lists the term twice
            // (presumably Latin "C" and Cyrillic "С").
            if (from != null && (from.Term == "C" || from.Term == "С"))
            {
                tt2 = tt2.Next;
                if (tt2 is Pullenti.Ner.TextToken)
                {
                    return new TitleItemToken(t, tt2, Types.Translate);
                }
            }
        }

        // Section header: bracketed name, or (after a colon) everything up to the end of the line.
        if (tt.Term == "СЕКЦИЯ" || tt.Term == "SECTION" || tt.Term == "СЕКЦІЯ")
        {
            t1 = tt.Next;
            if (t1 != null && t1.IsChar(':'))
            {
                t1 = t1.Next;
            }
            Pullenti.Ner.Core.BracketSequenceToken br = Pullenti.Ner.Core.BracketHelper.TryParse(t1, Pullenti.Ner.Core.BracketParseAttr.No, 100);
            if (br != null)
            {
                t1 = br.EndToken;
            }
            else if (t1 != tt.Next)
            {
                for (; t1 != null; t1 = t1.Next)
                {
                    if (t1.IsNewlineAfter)
                    {
                        break;
                    }
                }
                if (t1 == null)
                {
                    return null;
                }
            }
            if (t1 != tt.Next)
            {
                return new TitleItemToken(tt, t1, Types.Dust);
            }
        }

        // Speciality introduced by a keyword, by "<preposition> keyword", or by '/' at line start.
        t1 = null;
        if (tt.IsValue("СПЕЦИАЛЬНОСТЬ", "СПЕЦІАЛЬНІСТЬ"))
        {
            t1 = tt.Next;
        }
        else if (tt.Morph.Class.IsPreposition && tt.Next != null && tt.Next.IsValue("СПЕЦИАЛЬНОСТЬ", "СПЕЦІАЛЬНІСТЬ"))
        {
            t1 = tt.Next.Next;
        }
        else if (tt.IsChar('/') && tt.IsNewlineBefore)
        {
            t1 = tt.Next;
        }
        if (t1 != null)
        {
            if (t1.IsCharOf(":") || t1.IsHiphen)
            {
                t1 = t1.Next;
            }
            TitleItemToken spec = TryAttachSpeciality(t1, true);
            if (spec != null)
            {
                spec.BeginToken = t;
                return spec;
            }
        }
    }

    TitleItemToken sss = TryAttachSpeciality(t, false);
    if (sss != null)
    {
        return sss;
    }
    if (t is Pullenti.Ner.ReferentToken)
    {
        return null;
    }

    // Document type expressed as a noun phrase whose last token is a known term.
    Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
    if (npt != null)
    {
        string s = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
        Pullenti.Ner.Core.TerminToken tok = m_Termins.TryParse(npt.EndToken, Pullenti.Ner.Core.TerminParseAttr.No);
        if (tok != null)
        {
            Types ty = (Types)tok.Termin.Tag;
            if (ty == Types.Typ)
            {
                TitleItemToken tit = TryAttach(tok.EndToken.Next);
                if (tit != null && tit.Typ == Types.Theme)
                {
                    return new TitleItemToken(npt.BeginToken, tit.EndToken, Types.TypAndTheme) { Value = s };
                }
                if (s == "РАБОТА" || s == "РОБОТА" || s == "ПРОЕКТ")
                {
                    return null;
                }
                Pullenti.Ner.Token t1 = tok.EndToken;
                if (s == "ДИССЕРТАЦИЯ" || s == "ДИСЕРТАЦІЯ")
                {
                    // Look ahead for the degree (doctor / candidate / master) to refine the value.
                    // Only tokens that match none of the patterns below count against the limit.
                    int err = 0;
                    for (Pullenti.Ner.Token ttt = t1.Next; ttt != null; ttt = ttt.Next)
                    {
                        if (ttt.Morph.Class.IsPreposition)
                        {
                            continue;
                        }
                        if (ttt.IsValue("СОИСКАНИЕ", ""))
                        {
                            continue;
                        }
                        Pullenti.Ner.Core.NounPhraseToken npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(ttt, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                        if (npt1 != null && npt1.Noun.IsValue("СТЕПЕНЬ", "СТУПІНЬ"))
                        {
                            t1 = (ttt = npt1.EndToken);
                            continue;
                        }
                        Pullenti.Ner.ReferentToken rt = t1.Kit.ProcessReferent("PERSON", ttt);
                        if (rt != null && (rt.Referent is Pullenti.Ner.Person.PersonPropertyReferent))
                        {
                            Pullenti.Ner.Person.PersonPropertyReferent ppr = rt.Referent as Pullenti.Ner.Person.PersonPropertyReferent;
                            if (ppr.Name == "доктор наук")
                            {
                                t1 = rt.EndToken;
                                s = "ДОКТОРСКАЯ ДИССЕРТАЦИЯ";
                                break;
                            }
                            else if (ppr.Name == "кандидат наук")
                            {
                                t1 = rt.EndToken;
                                s = "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ";
                                break;
                            }
                            else if (ppr.Name == "магистр")
                            {
                                t1 = rt.EndToken;
                                s = "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ";
                                break;
                            }
                        }
                        if (ttt.IsValue("ДОКТОР", null) || ttt.IsValue("КАНДИДАТ", null) || ttt.IsValue("МАГИСТР", "МАГІСТР"))
                        {
                            t1 = ttt;
                            npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(ttt.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                            if (npt1 != null && npt1.EndToken.IsValue("НАУК", null))
                            {
                                t1 = npt1.EndToken;
                            }
                            s = (ttt.IsValue("МАГИСТР", "МАГІСТР") ? "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ" : (ttt.IsValue("ДОКТОР", null) ? "ДОКТОРСКАЯ ДИССЕРТАЦИЯ" : "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"));
                            break;
                        }
                        if ((++err) > 3)
                        {
                            break;
                        }
                    }
                }
                if (t1.Next != null && t1.Next.IsChar('.'))
                {
                    t1 = t1.Next;
                }
                // "ОТЧЕТ О <noun phrase in prepositional case>": absorb the phrase.
                // Ordinal suffix test; a null normalised text simply does not match.
                if (s != null && s.EndsWith("ОТЧЕТ", System.StringComparison.Ordinal) && t1.Next != null && t1.Next.IsValue("О", null))
                {
                    Pullenti.Ner.Core.NounPhraseToken npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t1.Next, Pullenti.Ner.Core.NounPhraseParseAttr.ParsePreposition, 0, null);
                    if (npt1 != null && npt1.Morph.Case.IsPrepositional)
                    {
                        t1 = npt1.EndToken;
                    }
                }
                return new TitleItemToken(npt.BeginToken, t1, ty) { Value = s };
            }
        }
    }

    // Plain term from the dictionary.
    Pullenti.Ner.Core.TerminToken tok1 = m_Termins.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
    if (tok1 != null)
    {
        Pullenti.Ner.Token t1 = tok1.EndToken;
        TitleItemToken re = new TitleItemToken(tok1.BeginToken, t1, (Types)tok1.Termin.Tag);
        return re;
    }

    // Term enclosed in brackets/quotes: the item spans from the term to the closing bracket.
    if (Pullenti.Ner.Core.BracketHelper.CanBeStartOfSequence(t, false, false))
    {
        tok1 = m_Termins.TryParse(t.Next, Pullenti.Ner.Core.TerminParseAttr.No);
        if (tok1 != null && Pullenti.Ner.Core.BracketHelper.CanBeEndOfSequence(tok1.EndToken.Next, false, null, false))
        {
            Pullenti.Ner.Token t1 = tok1.EndToken.Next;
            return new TitleItemToken(tok1.BeginToken, t1, (Types)tok1.Termin.Tag);
        }
    }
    return null;
}
/// <summary>
/// Demo entry point: initialises the SDK, lists the analyzers, then prints noun groups,
/// entities and keywords extracted from a fixed sample text.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    Stopwatch timer = Stopwatch.StartNew();
    // Initialisation has to be performed once, before any text is processed.
    Console.Write("Initializing SDK Pullenti ver {0} ({1}) ... ", Pullenti.Sdk.Version, Pullenti.Sdk.VersionDate);
    // Initialises the engine and every available analyzer.
    Pullenti.Sdk.InitializeAll();
    timer.Stop();
    Console.WriteLine("OK (by {0} ms), version {1}", (int)timer.ElapsedMilliseconds, Pullenti.Ner.ProcessorService.Version);

    // Show which analyzers are available.
    foreach (Pullenti.Ner.Analyzer analyzer in Pullenti.Ner.ProcessorService.Analyzers)
    {
        string kind = analyzer.IsSpecific ? "Specific analyzer" : "Common analyzer";
        Console.WriteLine(" {0} {1} \"{2}\"", kind, analyzer.Name, analyzer.Caption);
    }

    // The text to analyse.
    string txt = "Система разрабатывается с 2011 года российским программистом Михаилом Жуковым, проживающим в Москве на Красной площади в доме номер один на втором этаже. Конкурентов у него много: Abbyy, Yandex, ООО \"Russian Context Optimizer\" (RCO) и другие компании. Он планирует продать SDK за 1.120.000.001,99 (миллиард сто двадцать миллионов один рубль 99 копеек) рублей, без НДС.";
    Console.WriteLine("Text: {0}", txt);

    // Run the empty processor (no NER analyzers) and walk its tokens.
    Pullenti.Ner.AnalysisResult plainResult = Pullenti.Ner.ProcessorService.EmptyProcessor.Process(new Pullenti.Ner.SourceOfAnalysis(txt), null, null);
    Console.Write("Noun groups: ");
    for (Pullenti.Ner.Token tok = plainResult.FirstToken; tok != null; tok = tok.Next)
    {
        // Try to extract a noun group starting at the current token.
        Pullenti.Ner.Core.NounPhraseToken group = Pullenti.Ner.Core.NounPhraseHelper.TryParse(tok, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        if (group != null)
        {
            // Print source text and its normalised form, then jump to the group's last token.
            Console.Write("[{0}=>{1}] ", group.GetSourceText(), group.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false));
            tok = group.EndToken;
        }
    }

    using (Pullenti.Ner.Processor nerProcessor = Pullenti.Ner.ProcessorService.CreateProcessor())
    {
        // Analyse the text with the standard processor.
        Pullenti.Ner.AnalysisResult nerResult = nerProcessor.Process(new Pullenti.Ner.SourceOfAnalysis(txt), null, null);

        // Resulting entities together with their slots.
        Console.WriteLine("\r\n==========================================\r\nEntities: ");
        foreach (Pullenti.Ner.Referent entity in nerResult.Entities)
        {
            Console.WriteLine("{0}: {1}", entity.TypeName, entity.ToString());
            foreach (Pullenti.Ner.Slot slot in entity.Slots)
            {
                Console.WriteLine(" {0}: {1}", slot.TypeName, slot.Value);
            }
        }

        // Noun-group extraction over tokens that carry no entity.
        Console.WriteLine("\r\n==========================================\r\nNoun groups: ");
        for (Pullenti.Ner.Token tok = nerResult.FirstToken; tok != null; tok = tok.Next)
        {
            if (tok.GetReferent() == null)
            {
                Pullenti.Ner.Core.NounPhraseToken group = Pullenti.Ner.Core.NounPhraseHelper.TryParse(tok, Pullenti.Ner.Core.NounPhraseParseAttr.AdjectiveCanBeLast, 0, null);
                if (group != null)
                {
                    Console.WriteLine(group);
                    // Move the cursor to the last token of the group.
                    tok = group.EndToken;
                }
            }
        }
    }

    using (Pullenti.Ner.Processor keywordProcessor = Pullenti.Ner.ProcessorService.CreateSpecificProcessor(Pullenti.Ner.Keyword.KeywordAnalyzer.ANALYZER_NAME))
    {
        Pullenti.Ner.AnalysisResult keywordResult = keywordProcessor.Process(new Pullenti.Ner.SourceOfAnalysis(txt), null, null);

        // First listing: keyword entities as registered by the analyzer.
        Console.WriteLine("\r\n==========================================\r\nKeywords1: ");
        foreach (Pullenti.Ner.Referent entity in keywordResult.Entities)
        {
            if (entity is Pullenti.Ner.Keyword.KeywordReferent)
            {
                Console.WriteLine(entity);
            }
        }

        // Second listing: keyword tokens in text order, with their text value.
        Console.WriteLine("\r\n==========================================\r\nKeywords2: ");
        for (Pullenti.Ner.Token tok = keywordResult.FirstToken; tok != null; tok = tok.Next)
        {
            Pullenti.Ner.ReferentToken refTok = tok as Pullenti.Ner.ReferentToken;
            if (refTok == null)
            {
                continue;
            }
            Pullenti.Ner.Keyword.KeywordReferent keyword = tok.GetReferent() as Pullenti.Ner.Keyword.KeywordReferent;
            if (keyword != null)
            {
                string keywordText = Pullenti.Ner.Core.MiscHelper.GetTextValueOfMetaToken(refTok, Pullenti.Ner.Core.GetTextAttr.FirstNounGroupToNominativeSingle | Pullenti.Ner.Core.GetTextAttr.KeepRegister);
                Console.WriteLine("{0} = {1}", keywordText, keyword);
            }
        }
    }

    Console.WriteLine("Over!");
}
/// <summary>
/// Tries to read, from a sentence start at <paramref name="t"/>, a sentence that contains
/// a number immediately followed by a noun phrase.
/// </summary>
/// <param name="t">Token that must be able to start a sentence.</param>
/// <returns>
/// A token spanning from <paramref name="t"/> to the last token before the next sentence
/// start, with the number, its character range, the noun forms and the assembled text filled
/// in; <c>null</c> when <paramref name="t"/> cannot start a sentence, when no suitable
/// number/noun pair is found before the next sentence start, or when the number has no
/// integer value.
/// </returns>
public static DefinitionWithNumericToken TryParse(Pullenti.Ner.Token t)
{
    if (!Pullenti.Ner.Core.MiscHelper.CanBeStartOfSentence(t))
    {
        return null;
    }

    // Scan the sentence for the first number token that is followed by a noun phrase.
    Pullenti.Ner.NumberToken number = null;
    Pullenti.Ner.Core.NounPhraseToken phrase = null;
    Pullenti.Ner.Token cursor = t;
    while (cursor != null)
    {
        if (cursor != t && Pullenti.Ner.Core.MiscHelper.CanBeStartOfSentence(cursor))
        {
            // Reached the next sentence without finding anything.
            return null;
        }
        Pullenti.Ner.NumberToken candidate = cursor as Pullenti.Ner.NumberToken;
        // The first token itself, numbers followed by a wide gap, and adjective-like numbers are skipped.
        bool usable = candidate != null
            && !(cursor.WhitespacesAfterCount > 2 || cursor == t)
            && !cursor.Morph.Class.IsAdjective;
        if (usable)
        {
            Pullenti.Ner.Core.NounPhraseToken following = Pullenti.Ner.Core.NounPhraseHelper.TryParse(cursor.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
            if (following != null)
            {
                number = candidate;
                phrase = following;
                break;
            }
        }
        cursor = cursor.Next;
    }
    if (number == null || number.IntValue == null)
    {
        return null;
    }

    Pullenti.Ner.Core.GetTextAttr textAttrs = Pullenti.Ner.Core.GetTextAttr.KeepQuotes | Pullenti.Ner.Core.GetTextAttr.KeepRegister;

    DefinitionWithNumericToken result = new DefinitionWithNumericToken(t, phrase.EndToken);
    result.Number = number.IntValue.Value;
    result.NumberBeginChar = number.BeginChar;
    result.NumberEndChar = number.EndChar;
    result.Noun = phrase.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
    string genitive = phrase.GetMorphVariant(Pullenti.Morph.MorphCase.Genitive, true);
    result.NounsGenetive = genitive ?? result.Noun;

    // Text = part before the number + "number noun" + remainder of the sentence.
    string text = Pullenti.Ner.Core.MiscHelper.GetTextValue(t, number.Previous, textAttrs);
    if (number.IsWhitespaceBefore)
    {
        text += " ";
    }
    result.NumberSubstring = Pullenti.Ner.Core.MiscHelper.GetTextValue(number, phrase.EndToken, textAttrs);
    text += result.NumberSubstring;

    // Extend the end of the result up to (not including) the next sentence start.
    Pullenti.Ner.Token tail = phrase.EndToken;
    while (tail != null && !Pullenti.Ner.Core.MiscHelper.CanBeStartOfSentence(tail))
    {
        result.EndToken = tail;
        tail = tail.Next;
    }
    if (result.EndToken != phrase.EndToken)
    {
        if (phrase.IsWhitespaceAfter)
        {
            text += " ";
        }
        text += Pullenti.Ner.Core.MiscHelper.GetTextValue(phrase.EndToken.Next, result.EndToken, textAttrs);
    }
    result.Text = text;
    return result;
}
/// <summary>
/// Main keyword-extraction routine: walks the token chain of <paramref name="kit"/>, creates
/// <c>KeywordReferent</c> objects for verbs (predicates) and for the words of noun phrases
/// (objects), embeds them into the chain as referent tokens, then unites adjacent object
/// keywords ("X of Y" / "X" followed by a genitive "Y").
/// </summary>
/// <remarks>
/// If no enabled <c>DenominationAnalyzer</c> is among the processor's analyzers, one is run
/// first. At the end the referents are optionally sorted by rank
/// (<c>SortKeywordsByRank</c>) and an annotation is optionally registered
/// (<c>AnnotationMaxSentences</c>).
/// </remarks>
/// <param name="kit">Analysis container holding the token chain to process.</param>
public override void Process(Pullenti.Ner.Core.AnalysisKit kit)
{
    Pullenti.Ner.Core.AnalyzerData ad = kit.GetAnalyzerData(this);

    // Make sure denominations have been processed.
    bool hasDenoms = false;
    foreach (Pullenti.Ner.Analyzer a in kit.Processor.Analyzers)
    {
        if ((a is Pullenti.Ner.Denomination.DenominationAnalyzer) && !a.IgnoreThisAnalyzer)
        {
            hasDenoms = true;
        }
    }
    if (!hasDenoms)
    {
        Pullenti.Ner.Denomination.DenominationAnalyzer a = new Pullenti.Ner.Denomination.DenominationAnalyzer();
        a.Process(kit);
    }

    List<KeywordReferent> li = new List<KeywordReferent>();
    StringBuilder tmp = new StringBuilder();
    List<string> tmp2 = new List<string>();

    // Total number of tokens; passed to _setRank together with the current position.
    int max = 0;
    for (Pullenti.Ner.Token t = kit.FirstToken; t != null; t = t.Next)
    {
        max++;
    }

    // Pass 1: single-word keywords and noun-phrase keywords.
    int cur = 0;
    for (Pullenti.Ner.Token t = kit.FirstToken; t != null; t = t.Next, cur++)
    {
        Pullenti.Ner.Referent r = t.GetReferent();
        if (r != null)
        {
            t = this._addReferents(ad, t, cur, max);
            continue;
        }
        if (!(t is Pullenti.Ner.TextToken))
        {
            continue;
        }
        if (!t.Chars.IsLetter || (t.LengthChar < 3))
        {
            continue;
        }
        string term = (t as Pullenti.Ner.TextToken).Term;
        if (term == "ЕСТЬ")
        {
            // "ЕСТЬ" is only kept when it directly follows a verb text token.
            bool afterVerb = (t.Previous is Pullenti.Ner.TextToken) && t.Previous.Morph.Class.IsVerb;
            if (!afterVerb)
            {
                continue;
            }
        }

        Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.AdjectiveCanBeLast | Pullenti.Ner.Core.NounPhraseParseAttr.ParsePreposition, 0, null);
        if (npt == null)
        {
            // Not a noun phrase: a verb becomes a predicate keyword, anything else is skipped.
            Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary();
            if (mc.IsVerb && !mc.IsPreposition)
            {
                if ((t as Pullenti.Ner.TextToken).IsVerbBe)
                {
                    continue;
                }
                if (t.IsValue("МОЧЬ", null) || t.IsValue("WOULD", null))
                {
                    continue;
                }
                KeywordReferent kref = new KeywordReferent() { Typ = KeywordType.Predicate };
                string norm = t.GetNormalCaseText(Pullenti.Morph.MorphClass.Verb, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
                if (norm == null)
                {
                    norm = (t as Pullenti.Ner.TextToken).Lemma;
                }
                // Drop the trailing "СЯ" of verbs ending in "ЬСЯ"; ordinal match, since the
                // cut below is done by character count.
                if (norm.EndsWith("ЬСЯ", System.StringComparison.Ordinal))
                {
                    norm = norm.Substring(0, norm.Length - 2);
                }
                kref.AddSlot(KeywordReferent.ATTR_VALUE, norm, false, 0);
                List<Pullenti.Semantic.Utils.DerivateGroup> drv = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm, true, t.Morph.Language);
                _addNormals(kref, drv, norm);
                kref = ad.RegisterReferent(kref) as KeywordReferent;
                _setRank(kref, cur, max);
                // NOTE(review): the referent is registered a second time here, unlike the other
                // call sites in this method; kept as is because RegisterReferent's behaviour on
                // an already registered referent is not visible from this block.
                Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(ad.RegisterReferent(kref), t, t) { Morph = t.Morph };
                kit.EmbedToken(rt1);
                t = rt1;
                continue;
            }
            continue;
        }

        if (npt.InternalNoun != null)
        {
            continue;
        }
        // Skip set phrases: "<prep> ЦЕЛОМ", "<prep> ЧАСТНОСТИ", "С ... СТОРОНЫ".
        if (npt.EndToken.IsValue("ЦЕЛОМ", null) || npt.EndToken.IsValue("ЧАСТНОСТИ", null))
        {
            if (npt.Preposition != null)
            {
                t = npt.EndToken;
                continue;
            }
        }
        if (npt.EndToken.IsValue("СТОРОНЫ", null) && npt.Preposition != null && npt.Preposition.Normal == "С")
        {
            t = npt.EndToken;
            continue;
        }
        if (npt.BeginToken == npt.EndToken)
        {
            Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary();
            if (mc.IsPreposition)
            {
                continue;
            }
            else if (mc.IsAdverb)
            {
                if (t.IsValue("ПОТОМ", null))
                {
                    continue;
                }
            }
        }

        // One object keyword per significant word of the noun phrase.
        li.Clear();
        Pullenti.Ner.Token t0 = t;
        for (Pullenti.Ner.Token tt = t; tt != null && tt.EndChar <= npt.EndChar; tt = tt.Next)
        {
            if (!(tt is Pullenti.Ner.TextToken))
            {
                continue;
            }
            if ((tt.LengthChar < 3) || !tt.Chars.IsLetter)
            {
                continue;
            }
            Pullenti.Morph.MorphClass mc = tt.GetMorphClassInDictionary();
            if ((mc.IsPreposition || mc.IsPronoun || mc.IsPersonalPronoun) || mc.IsConjunction)
            {
                // Function words are skipped, except the word "ОТНОШЕНИЕ".
                if (!tt.IsValue("ОТНОШЕНИЕ", null))
                {
                    continue;
                }
            }
            if (mc.IsMisc)
            {
                if (Pullenti.Ner.Core.MiscHelper.IsEngArticle(tt))
                {
                    continue;
                }
            }
            KeywordReferent kref = new KeywordReferent() { Typ = KeywordType.Object };
            string norm = (tt as Pullenti.Ner.TextToken).Lemma;
            kref.AddSlot(KeywordReferent.ATTR_VALUE, norm, false, 0);
            if (norm != "ЕСТЬ")
            {
                List<Pullenti.Semantic.Utils.DerivateGroup> drv = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm, true, tt.Morph.Language);
                _addNormals(kref, drv, norm);
            }
            kref = ad.RegisterReferent(kref) as KeywordReferent;
            _setRank(kref, cur, max);
            Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(kref, tt, tt) { Morph = tt.Morph };
            kit.EmbedToken(rt1);
            if (tt == t && li.Count == 0)
            {
                // The phrase's first token was replaced; the combined token must start at the replacement.
                t0 = rt1;
            }
            t = rt1;
            li.Add(kref);
        }

        // Several words: additionally create a keyword for the whole phrase.
        if (li.Count > 1)
        {
            KeywordReferent kref = new KeywordReferent() { Typ = KeywordType.Object };
            tmp2.Clear();
            foreach (KeywordReferent kw in li)
            {
                // Prefer the word's normal form, fall back to its value.
                string n = kw.GetStringValue(KeywordReferent.ATTR_NORMAL);
                tmp2.Add(n ?? kw.GetStringValue(KeywordReferent.ATTR_VALUE));
                kref.AddSlot(KeywordReferent.ATTR_REF, kw, false, 0);
            }
            string val = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
            kref.AddSlot(KeywordReferent.ATTR_VALUE, val, false, 0);

            // Normal form of the phrase: the sorted word forms joined by spaces.
            tmp.Length = 0;
            tmp2.Sort();
            foreach (string s in tmp2)
            {
                if (tmp.Length > 0)
                {
                    tmp.Append(' ');
                }
                tmp.Append(s);
            }
            string norm = tmp.ToString();
            if (norm != val)
            {
                kref.AddSlot(KeywordReferent.ATTR_NORMAL, norm, false, 0);
            }
            kref = ad.RegisterReferent(kref) as KeywordReferent;
            _setRank(kref, cur, max);
            Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(kref, t0, t) { Morph = npt.Morph };
            kit.EmbedToken(rt1);
            t = rt1;
        }
    }

    // Pass 2: unite an object keyword with the following one ("X OF [article] Y" or genitive Y).
    cur = 0;
    for (Pullenti.Ner.Token t = kit.FirstToken; t != null; t = t.Next, cur++)
    {
        KeywordReferent kw = t.GetReferent() as KeywordReferent;
        if (kw == null || kw.Typ != KeywordType.Object)
        {
            continue;
        }
        if (t.Next == null || kw.ChildWords > 2)
        {
            continue;
        }
        Pullenti.Ner.Token t1 = t.Next;
        if (t1.IsValue("OF", null) && (t1.WhitespacesAfterCount < 3) && t1.Next != null)
        {
            t1 = t1.Next;
            if ((t1 is Pullenti.Ner.TextToken) && Pullenti.Ner.Core.MiscHelper.IsEngArticle(t1) && t1.Next != null)
            {
                t1 = t1.Next;
            }
        }
        else if (!t1.Morph.Case.IsGenitive || t.WhitespacesAfterCount > 1)
        {
            continue;
        }
        KeywordReferent kw2 = t1.GetReferent() as KeywordReferent;
        if (kw2 == null)
        {
            continue;
        }
        if (kw == kw2)
        {
            continue;
        }
        if (kw2.Typ != KeywordType.Object || (kw.ChildWords + kw2.ChildWords) > 3)
        {
            continue;
        }
        KeywordReferent kwUn = new KeywordReferent();
        kwUn.Union(kw, kw2, Pullenti.Ner.Core.MiscHelper.GetTextValue(t1, t1, Pullenti.Ner.Core.GetTextAttr.No));
        kwUn = ad.RegisterReferent(kwUn) as KeywordReferent;
        _setRank(kwUn, cur, max);
        Pullenti.Ner.ReferentToken rt1 = new Pullenti.Ner.ReferentToken(kwUn, t, t1) { Morph = t.Morph };
        kit.EmbedToken(rt1);
        t = rt1;
    }

    if (SortKeywordsByRank)
    {
        List<Pullenti.Ner.Referent> all = new List<Pullenti.Ner.Referent>(ad.Referents);
        all.Sort(new CompByRank());
        ad.Referents = all;
    }
    if (AnnotationMaxSentences > 0)
    {
        KeywordReferent ano = Pullenti.Ner.Keyword.Internal.AutoannoSentToken.CreateAnnotation(kit, AnnotationMaxSentences);
        if (ano != null)
        {
            ad.RegisterReferent(ano);
        }
    }
}
/// <summary>
/// Tries to build one name item of an organization name starting at <paramref name="t"/>.
/// </summary>
/// <remarks>
/// Checks, in order: referent tokens (denominations, Latin geo names), standard tails and
/// names from the term dictionaries, English items, a ", X и Y" enumeration, an
/// "&amp;"/"AND"/"UND" conjunction, noun phrases, "и"-conjunctions, prepositional groups,
/// English "OF ..." groups, and finally a single word optionally glued with
/// hyphen/slash/dot-separated parts. The result is then flagged as "in dictionary" when any
/// of its words has a dictionary word form, and its value is adjusted for "ABC-123"-like
/// and "ABCdef"-like tokens.
/// </remarks>
/// <param name="t">Token to start from; <c>null</c> yields <c>null</c>.</param>
/// <param name="prev">Previously attached name item, or <c>null</c>.</param>
/// <param name="extOnto">When set, any noun phrase is accepted as a name item.</param>
/// <returns>The name item, or <c>null</c> when nothing can be attached.</returns>
static OrgItemNameToken _TryAttach(Pullenti.Ner.Token t, OrgItemNameToken prev, bool extOnto)
{
    if (t == null)
    {
        return null;
    }

    // Referent tokens: only denominations and Latin-letter geo names can take part in a name.
    Pullenti.Ner.Referent r = t.GetReferent();
    if (r != null)
    {
        if (r.TypeName == "DENOMINATION")
        {
            return new OrgItemNameToken(t, t) { Value = r.ToString(true, t.Kit.BaseLanguage, 0), IsDenomination = true };
        }
        if ((r is Pullenti.Ner.Geo.GeoReferent) && t.Chars.IsLatinLetter)
        {
            OrgItemNameToken res2 = _TryAttach(t.Next, prev, extOnto);
            if (res2 != null && res2.Chars.IsLatinLetter)
            {
                res2.BeginToken = t;
                res2.Value = string.Format("{0} {1}", Pullenti.Ner.Core.MiscHelper.GetTextValueOfMetaToken(t as Pullenti.Ner.MetaToken, Pullenti.Ner.Core.GetTextAttr.No), res2.Value);
                res2.IsInDictionary = false;
                return res2;
            }
        }
        return null;
    }

    Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
    if (tt == null)
    {
        return null;
    }
    OrgItemNameToken res = null;

    // Dictionary-driven items (a leading comma is tolerated for tails and English items).
    Pullenti.Ner.Core.TerminToken tok = m_StdTails.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
    if (tok == null && t.IsChar(','))
    {
        tok = m_StdTails.TryParse(t.Next, Pullenti.Ner.Core.TerminParseAttr.No);
    }
    if (tok != null)
    {
        return new OrgItemNameToken(t, tok.EndToken) { Value = tok.Termin.CanonicText, IsStdTail = tok.Termin.Tag == null, IsEmptyWord = tok.Termin.Tag != null, Morph = tok.Morph };
    }
    tok = m_StdNames.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
    if (tok != null)
    {
        return new OrgItemNameToken(t, tok.EndToken) { Value = tok.Termin.CanonicText, IsStdName = true };
    }
    OrgItemEngItem eng = OrgItemEngItem.TryAttach(t, false);
    if (eng == null && t.IsChar(','))
    {
        eng = OrgItemEngItem.TryAttach(t.Next, false);
    }
    if (eng != null)
    {
        return new OrgItemNameToken(t, eng.EndToken) { Value = eng.FullValue, IsStdTail = true };
    }

    // A lower-case word may only continue a lower-case or Capitalized previous item.
    if (tt.Chars.IsAllLower && prev != null)
    {
        if (!prev.Chars.IsAllLower && !prev.Chars.IsCapitalUpper)
        {
            return null;
        }
    }

    // ", <noun phrase> И <noun phrase>" enumeration agreeing with the previous item.
    if (tt.IsChar(',') && prev != null)
    {
        Pullenti.Ner.Core.NounPhraseToken npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        if (npt1 == null || npt1.Chars != prev.Chars || ((npt1.Morph.Case & prev.Morph.Case)).IsUndefined)
        {
            return null;
        }
        OrgItemTypeToken ty = OrgItemTypeToken.TryAttach(t.Next, false, null);
        if (ty != null)
        {
            return null;
        }
        if (npt1.EndToken.Next == null || !npt1.EndToken.Next.IsValue("И", null))
        {
            return null;
        }
        Pullenti.Ner.Token t1 = npt1.EndToken.Next;
        Pullenti.Ner.Core.NounPhraseToken npt2 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t1.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        if (npt2 == null || npt2.Chars != prev.Chars || ((npt2.Morph.Case & npt1.Morph.Case & prev.Morph.Case)).IsUndefined)
        {
            return null;
        }
        ty = OrgItemTypeToken.TryAttach(t1.Next, false, null);
        if (ty != null)
        {
            return null;
        }
        res = new OrgItemNameToken(npt1.BeginToken, npt1.EndToken) { Morph = npt1.Morph, Value = npt1.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false) };
        res.IsNounPhrase = true;
        res.IsAfterConjunction = true;
        if (prev.Preposition != null)
        {
            res.Preposition = prev.Preposition;
        }
        return res;
    }

    // "& X", "AND X", "UND X"
    if ((tt.IsChar('&') || tt.IsValue("AND", null) || tt.IsValue("UND", null)) && prev != null)
    {
        if ((tt.Next is Pullenti.Ner.TextToken) && tt.LengthChar == 1 && tt.Next.Chars.IsLatinLetter)
        {
            res = new OrgItemNameToken(tt, tt.Next) { Chars = tt.Next.Chars };
            res.IsAfterConjunction = true;
            res.Value = "& " + (tt.Next as Pullenti.Ner.TextToken).Term;
            return res;
        }
        res = OrgItemNameToken.TryAttach(tt.Next, null, extOnto, false);
        if (res == null || res.Chars != prev.Chars)
        {
            return null;
        }
        res.IsAfterConjunction = true;
        res.Value = "& " + res.Value;
        return res;
    }

    if (!tt.Chars.IsLetter)
    {
        return null;
    }

    // NOTE(review): the result of this lookup is not read anywhere in this method; the call is
    // kept because its side effects (if any) are not visible from here.
    List<Pullenti.Semantic.Utils.DerivateGroup> expinf = null;
    if (prev != null && prev.EndToken.GetMorphClassInDictionary().IsNoun)
    {
        string wo = prev.EndToken.GetNormalCaseText(Pullenti.Morph.MorphClass.Noun, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
        expinf = Pullenti.Semantic.Utils.DerivateService.FindDerivates(wo, true, prev.EndToken.Morph.Language);
    }

    Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
    if (npt != null && npt.InternalNoun != null)
    {
        npt = null;
    }

    // explOk: a semantic link exists between the previous item's last noun and this phrase.
    bool explOk = false;
    if (npt != null && prev != null && prev.EndToken.GetMorphClassInDictionary().IsNoun)
    {
        Pullenti.Ner.Core.NounPhraseToken npt0 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(prev.EndToken, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        if (npt0 != null)
        {
            List<Pullenti.Semantic.Core.SemanticLink> links = Pullenti.Semantic.Core.SemanticHelper.TryCreateLinks(npt0, npt, null);
            if (links.Count > 0)
            {
                explOk = true;
            }
        }
    }

    if (npt != null && (explOk || npt.Morph.Case.IsGenitive || (prev != null && !((prev.Morph.Case & npt.Morph.Case)).IsUndefined)))
    {
        // Noun phrase that is linked to, in genitive after, or case-compatible with the previous item.
        Pullenti.Morph.MorphClass mc = npt.BeginToken.GetMorphClassInDictionary();
        if (mc.IsVerb || mc.IsPronoun)
        {
            return null;
        }
        if (mc.IsAdverb)
        {
            // An adverb is tolerated only as the first part of a hyphenated word.
            bool hyphenFollows = npt.BeginToken.Next != null && npt.BeginToken.Next.IsHiphen;
            if (!hyphenFollows)
            {
                return null;
            }
        }
        if (mc.IsPreposition)
        {
            return null;
        }
        if (mc.IsNoun && npt.Chars.IsAllLower)
        {
            Pullenti.Morph.MorphCase ca = npt.Morph.Case;
            if ((!ca.IsDative && !ca.IsGenitive && !ca.IsInstrumental) && !ca.IsPrepositional)
            {
                return null;
            }
        }
        res = new OrgItemNameToken(npt.BeginToken, npt.EndToken) { Morph = npt.Morph, Value = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false) };
        res.IsNounPhrase = true;
        if ((npt.EndToken.WhitespacesAfterCount < 2) && (npt.EndToken.Next is Pullenti.Ner.TextToken))
        {
            Pullenti.Ner.Core.NounPhraseToken npt2 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(npt.EndToken.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
            if (npt2 != null && npt2.Morph.Case.IsGenitive && npt2.Chars.IsAllLower)
            {
                // Absorb a following lower-case genitive phrase unless it starts a type, an eponym
                // or a (non-plural) person property.
                OrgItemTypeToken typ = OrgItemTypeToken.TryAttach(npt.EndToken.Next, true, null);
                OrgItemEponymToken epo = OrgItemEponymToken.TryAttach(npt.EndToken.Next, false);
                Pullenti.Ner.ReferentToken rtt = t.Kit.ProcessReferent("PERSONPROPERTY", npt.EndToken.Next);
                if (typ == null && epo == null && (rtt == null || rtt.Morph.Number == Pullenti.Morph.MorphNumber.Plural))
                {
                    res.EndToken = npt2.EndToken;
                    res.Value = string.Format("{0} {1}", res.Value, Pullenti.Ner.Core.MiscHelper.GetTextValueOfMetaToken(npt2, Pullenti.Ner.Core.GetTextAttr.No));
                }
            }
            else if (npt.EndToken.Next.IsComma && (npt.EndToken.Next.Next is Pullenti.Ner.TextToken))
            {
                // ", <participle agreeing with the phrase> <dative/genitive phrase>"
                Pullenti.Ner.Token tt2 = npt.EndToken.Next.Next;
                Pullenti.Morph.MorphClass mv2 = tt2.GetMorphClassInDictionary();
                if (mv2.IsAdjective && mv2.IsVerb)
                {
                    Pullenti.Morph.MorphBaseInfo bi = new Pullenti.Morph.MorphBaseInfo() { Case = npt.Morph.Case, Gender = npt.Morph.Gender, Number = npt.Morph.Number };
                    if (tt2.Morph.CheckAccord(bi, false, false))
                    {
                        npt2 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(tt2.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                        if (npt2 != null && (npt2.Morph.Case.IsDative || npt2.Morph.Case.IsGenitive) && npt2.Chars.IsAllLower)
                        {
                            res.EndToken = npt2.EndToken;
                            res.Value = string.Format("{0} {1}", res.Value, Pullenti.Ner.Core.MiscHelper.GetTextValue(npt.EndToken.Next, res.EndToken, Pullenti.Ner.Core.GetTextAttr.No));
                        }
                    }
                }
            }
        }
        if (explOk)
        {
            res.IsAfterConjunction = true;
        }
    }
    else if (npt != null && ((prev != null && prev.IsNounPhrase && npt.Morph.Case.IsInstrumental) || extOnto))
    {
        res = new OrgItemNameToken(npt.BeginToken, npt.EndToken) { Morph = npt.Morph, Value = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false) };
        res.IsNounPhrase = true;
    }
    else if (tt.IsAnd)
    {
        // "и <noun phrase>" agreeing in case (and normally in number) with the previous item.
        res = TryAttach(tt.Next, prev, extOnto, false);
        if (res == null || !res.IsNounPhrase || prev == null)
        {
            return null;
        }
        if (((prev.Morph.Case & res.Morph.Case)).IsUndefined)
        {
            return null;
        }
        if (prev.Morph.Number != Pullenti.Morph.MorphNumber.Undefined && res.Morph.Number != Pullenti.Morph.MorphNumber.Undefined)
        {
            if (((prev.Morph.Number & res.Morph.Number)) == Pullenti.Morph.MorphNumber.Undefined)
            {
                if (prev.Chars != res.Chars)
                {
                    return null;
                }
                OrgItemTypeToken ty = OrgItemTypeToken.TryAttach(res.EndToken.Next, false, null);
                if (ty != null)
                {
                    return null;
                }
            }
        }
        // NOTE(review): reads Chars and writes the same value back; kept in case the
        // accessor pair has an effect that is not visible from this block.
        Pullenti.Morph.CharsInfo ci = res.Chars;
        res.Chars = ci;
        res.IsAfterConjunction = true;
        return res;
    }
    else if (((tt.Term == "ПО" || tt.Term == "ПРИ" || tt.Term == "ЗА") || tt.Term == "С" || tt.Term == "В") || tt.Term == "НА")
    {
        // Prepositional group: the noun phrase must be in a case the preposition allows.
        npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        if (npt != null)
        {
            if (m_VervotWords.TryParse(npt.EndToken, Pullenti.Ner.Core.TerminParseAttr.No) != null)
            {
                return null;
            }
            bool ok = false;
            if (tt.Term == "ПО")
            {
                ok = npt.Morph.Case.IsDative;
            }
            else if (tt.Term == "С")
            {
                ok = npt.Morph.Case.IsInstrumental;
            }
            else if (tt.Term == "ЗА")
            {
                ok = npt.Morph.Case.IsGenitive | npt.Morph.Case.IsInstrumental;
            }
            else if (tt.Term == "НА")
            {
                ok = npt.Morph.Case.IsPrepositional;
            }
            else if (tt.Term == "В")
            {
                ok = npt.Morph.Case.IsDative | npt.Morph.Case.IsPrepositional;
                if (ok)
                {
                    // Only "В СФЕРЕ ..." / "В ОБЛАСТИ ..." are accepted.
                    ok = false;
                    if (t.Next.IsValue("СФЕРА", null) || t.Next.IsValue("ОБЛАСТЬ", null))
                    {
                        ok = true;
                    }
                }
            }
            else if (tt.Term == "ПРИ")
            {
                ok = npt.Morph.Case.IsPrepositional;
                if (ok)
                {
                    // "ПРИ <organization type / organization>" is not part of the name.
                    if (OrgItemTypeToken.TryAttach(tt.Next, true, null) != null)
                    {
                        ok = false;
                    }
                    else
                    {
                        Pullenti.Ner.ReferentToken rt = tt.Kit.ProcessReferent(Pullenti.Ner.Org.OrganizationAnalyzer.ANALYZER_NAME, tt.Next);
                        if (rt != null)
                        {
                            ok = false;
                        }
                    }
                }
                string s = npt.Noun.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
                if (s == "ПОДДЕРЖКА" || s == "УЧАСТИЕ")
                {
                    ok = false;
                }
            }
            else
            {
                ok = npt.Morph.Case.IsPrepositional;
            }
            if (ok)
            {
                res = new OrgItemNameToken(t, npt.EndToken) { Morph = npt.Morph, Value = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false), Chars = npt.Chars };
                res.IsNounPhrase = true;
                res.Preposition = tt.Term;
                if ((res.Value == "ДЕЛО" || res.Value == "ВОПРОС") && !res.IsNewlineAfter)
                {
                    // "ПО ДЕЛАМ/ВОПРОСАМ <genitive item>[, <item>]* [и <item>]"
                    OrgItemNameToken res2 = _TryAttach(res.EndToken.Next, res, extOnto);
                    if (res2 != null && res2.Morph.Case.IsGenitive)
                    {
                        res.Value = string.Format("{0} {1}", res.Value, res2.Value);
                        res.EndToken = res2.EndToken;
                        for (Pullenti.Ner.Token ttt = res2.EndToken.Next; ttt != null; ttt = ttt.Next)
                        {
                            if (!ttt.IsCommaAnd)
                            {
                                break;
                            }
                            OrgItemNameToken res3 = _TryAttach(ttt.Next, res2, extOnto);
                            if (res3 == null)
                            {
                                break;
                            }
                            res.Value = string.Format("{0} {1}", res.Value, res3.Value);
                            res.EndToken = res3.EndToken;
                            if (ttt.IsAnd)
                            {
                                break;
                            }
                            ttt = res.EndToken;
                        }
                    }
                }
            }
        }
        if (res == null)
        {
            return null;
        }
    }
    else if (tt.Term == "OF")
    {
        // "OF [article] Word Word ..." in Latin letters.
        Pullenti.Ner.Token t1 = tt.Next;
        if (t1 != null && Pullenti.Ner.Core.MiscHelper.IsEngArticle(t1))
        {
            t1 = t1.Next;
        }
        if (t1 != null && t1.Chars.IsLatinLetter && !t1.Chars.IsAllLower)
        {
            res = new OrgItemNameToken(t, t1) { Chars = t1.Chars, Morph = t1.Morph };
            for (Pullenti.Ner.Token ttt = t1.Next; ttt != null; ttt = ttt.Next)
            {
                if (ttt.WhitespacesBeforeCount > 2)
                {
                    break;
                }
                if (Pullenti.Ner.Core.MiscHelper.IsEngAdjSuffix(ttt))
                {
                    // Skip the suffix pair; guard against the chain ending right after it,
                    // otherwise the loop increment would dereference null.
                    ttt = ttt.Next;
                    if (ttt == null)
                    {
                        break;
                    }
                    continue;
                }
                if (!ttt.Chars.IsLatinLetter)
                {
                    break;
                }
                if (ttt.Morph.Class.IsPreposition)
                {
                    break;
                }
                t1 = (res.EndToken = ttt);
            }
            res.Value = Pullenti.Ner.Core.MiscHelper.GetTextValue(t, t1, Pullenti.Ner.Core.GetTextAttr.IgnoreArticles);
            res.Preposition = tt.Term;
            return res;
        }
    }

    if (res == null)
    {
        // Last resort: a single word, possibly glued with "-", "/", "\" or "." separated parts.
        if (tt.Chars.IsLatinLetter && tt.LengthChar == 1)
        {
            // A single Latin letter is always allowed.
        }
        else if (tt.Chars.IsAllLower || (tt.LengthChar < 2))
        {
            if (!tt.Chars.IsLatinLetter || prev == null || !prev.Chars.IsLatinLetter)
            {
                return null;
            }
        }
        if (tt.Chars.IsCyrillicLetter)
        {
            Pullenti.Morph.MorphClass mc = tt.GetMorphClassInDictionary();
            if (mc.IsVerb || mc.IsAdverb)
            {
                return null;
            }
        }
        else if (tt.Chars.IsLatinLetter && !tt.IsWhitespaceAfter && (tt.LengthChar < 5) && (tt.Next is Pullenti.Ner.NumberToken))
        {
            // Short Latin word immediately followed by a number is rejected.
            return null;
        }
        res = new OrgItemNameToken(tt, tt) { Value = tt.Term, Morph = tt.Morph };
        for (t = tt.Next; t != null; t = t.Next)
        {
            if ((t.IsHiphen || t.IsCharOf("\\/")) && t.Next != null && (t.Next is Pullenti.Ner.TextToken) && !t.IsWhitespaceBefore && !t.IsWhitespaceAfter)
            {
                t = t.Next;
                res.EndToken = t;
                res.Value = string.Format("{0}{1}{2}", res.Value, (t.Previous.IsChar('.') ? '.' : '-'), (t as Pullenti.Ner.TextToken).Term);
            }
            else if (t.IsChar('.'))
            {
                if (!t.IsWhitespaceAfter && !t.IsWhitespaceBefore && (t.Next is Pullenti.Ner.TextToken))
                {
                    res.EndToken = t.Next;
                    t = t.Next;
                    res.Value = string.Format("{0}.{1}", res.Value, (t as Pullenti.Ner.TextToken).Term);
                }
                else if ((t.Next != null && !t.IsNewlineAfter && t.Next.Chars.IsLatinLetter) && tt.Chars.IsLatinLetter)
                {
                    res.EndToken = t;
                }
                else
                {
                    break;
                }
            }
            else
            {
                break;
            }
        }
    }

    // Mark the item as "in dictionary" if any of its words (other than conjunctions and
    // prepositions) has a dictionary word form.
    for (Pullenti.Ner.Token t0 = res.BeginToken; t0 != null; t0 = t0.Next)
    {
        tt = t0 as Pullenti.Ner.TextToken;
        if (tt != null && tt.IsLetters)
        {
            if (!tt.Morph.Class.IsConjunction && !tt.Morph.Class.IsPreposition)
            {
                foreach (Pullenti.Morph.MorphBaseInfo mf in tt.Morph.Items)
                {
                    // Items that are not word forms carry no dictionary flag and are skipped.
                    Pullenti.Morph.MorphWordForm wf = mf as Pullenti.Morph.MorphWordForm;
                    if (wf != null && wf.IsInDictionary)
                    {
                        res.IsInDictionary = true;
                    }
                }
            }
        }
        if (t0 == res.EndToken)
        {
            break;
        }
    }

    // "ABC123" / "ABC-123": append the number to an all-upper single-token name.
    if (res.BeginToken == res.EndToken && res.BeginToken.Chars.IsAllUpper)
    {
        if (res.EndToken.Next != null && !res.EndToken.IsWhitespaceAfter)
        {
            Pullenti.Ner.Token t1 = res.EndToken.Next;
            if (t1.Next != null && !t1.IsWhitespaceAfter && t1.IsHiphen)
            {
                t1 = t1.Next;
            }
            if (t1 is Pullenti.Ner.NumberToken)
            {
                res.Value += (t1 as Pullenti.Ner.NumberToken).Value;
                res.EndToken = t1;
            }
        }
    }

    // "ABCdef"-like token: keep the source text up to and including its last upper-case letter.
    if (res.BeginToken == res.EndToken && res.BeginToken.Chars.IsLastLower)
    {
        string src = res.BeginToken.GetSourceText();
        for (int i = src.Length - 1; i >= 0; i--)
        {
            if (char.IsUpper(src[i]))
            {
                res.Value = src.Substring(0, i + 1);
                break;
            }
        }
    }
    return res;
}
/// <summary>
/// Tries to assemble a territory (<c>GeoReferent</c>) from a sequence of already parsed
/// <c>TerrItemToken</c> items and to wrap it into a <c>ReferentToken</c>.
/// </summary>
/// <remarks>
/// The items are scanned left to right and classified into: an item backed by an ontology
/// entry (<c>exObj</c>), a type noun (<c>noun</c>), adjectives (<c>adjList</c>) and a free
/// name (<c>newName</c>). Depending on which of these were found, a number of contextual
/// checks decide whether the combination is accepted; any failed check returns <c>null</c>.
/// </remarks>
/// <param name="li">Territory items to analyse; <c>null</c> or empty yields <c>null</c>.</param>
/// <param name="ad">Analyzer data, forwarded to the helper attach routines.</param>
/// <param name="attachAlways">When <c>true</c>, several of the doubt-filtering checks are skipped.</param>
/// <param name="cits">Optional city items; only the first element's type (and the count) is inspected.</param>
/// <param name="exists">Optional known geo referents; a name match here can confirm a doubtful adjective name.</param>
/// <returns>The referent token covering the recognised territory, or <c>null</c> if nothing was attached.</returns>
public static Pullenti.Ner.ReferentToken TryAttachTerritory(List<TerrItemToken> li, Pullenti.Ner.Core.AnalyzerData ad, bool attachAlways = false, List<CityItemToken> cits = null, List<Pullenti.Ner.Geo.GeoReferent> exists = null)
{
    if (li == null || li.Count == 0)
    {
        return null;
    }
    TerrItemToken exObj = null;
    TerrItemToken newName = null;
    List<TerrItemToken> adjList = new List<TerrItemToken>();
    TerrItemToken noun = null;
    TerrItemToken addNoun = null;

    // Dedicated helpers get the first chance.
    Pullenti.Ner.ReferentToken rt = _tryAttachMoscowAO(li, ad);
    if (rt != null)
    {
        return rt;
    }
    if (li[0].TerminItem != null && li[0].TerminItem.CanonicText == "ТЕРРИТОРИЯ")
    {
        Pullenti.Ner.ReferentToken res2 = _tryAttachPureTerr(li, ad);
        return res2;
    }

    // Two-item case where one item carries Rzd and the other RzdDir (either order):
    // a fresh GeoReferent is named after RzdDir and references the Rzd referent.
    if (li.Count == 2)
    {
        if (li[0].Rzd != null && li[1].RzdDir != null)
        {
            Pullenti.Ner.Geo.GeoReferent rzd = new Pullenti.Ner.Geo.GeoReferent();
            rzd.AddName(li[1].RzdDir);
            rzd.AddTypTer(li[0].Kit.BaseLanguage);
            rzd.AddSlot(Pullenti.Ner.Geo.GeoReferent.ATTR_REF, li[0].Rzd.Referent, false, 0);
            rzd.AddExtReferent(li[0].Rzd);
            return new Pullenti.Ner.ReferentToken(rzd, li[0].BeginToken, li[1].EndToken);
        }
        if (li[1].Rzd != null && li[0].RzdDir != null)
        {
            Pullenti.Ner.Geo.GeoReferent rzd = new Pullenti.Ner.Geo.GeoReferent();
            rzd.AddName(li[0].RzdDir);
            rzd.AddTypTer(li[0].Kit.BaseLanguage);
            rzd.AddSlot(Pullenti.Ner.Geo.GeoReferent.ATTR_REF, li[1].Rzd.Referent, false, 0);
            rzd.AddExtReferent(li[1].Rzd);
            return new Pullenti.Ner.ReferentToken(rzd, li[0].BeginToken, li[1].EndToken);
        }
    }

    bool canBeCityBefore = false;
    bool adjTerrBefore = false;
    // Guard against an empty list: indexing cits[0] would otherwise throw.
    if (cits != null && cits.Count > 0)
    {
        if (cits[0].Typ == CityItemToken.ItemType.City)
        {
            canBeCityBefore = true;
        }
        else if (cits[0].Typ == CityItemToken.ItemType.Noun && cits.Count > 1)
        {
            canBeCityBefore = true;
        }
    }

    // Classification pass. After the loop, k is the number of items consumed.
    int k;
    for (k = 0; k < li.Count; k++)
    {
        if (li[k].OntoItem != null)
        {
            // Item is backed by an ontology entry.
            if (exObj != null || newName != null)
            {
                break;
            }
            if (noun != null)
            {
                if (k == 1)
                {
                    if (noun.TerminItem.CanonicText == "РАЙОН" || noun.TerminItem.CanonicText == "ОБЛАСТЬ" || noun.TerminItem.CanonicText == "СОЮЗ")
                    {
                        if (li[k].OntoItem.Referent is Pullenti.Ner.Geo.GeoReferent)
                        {
                            if ((li[k].OntoItem.Referent as Pullenti.Ner.Geo.GeoReferent).IsState)
                            {
                                break;
                            }
                        }
                        // Accept only with supporting context: end of text, a following ',' or '.',
                        // a geo object before, or a street following.
                        bool ok = false;
                        Pullenti.Ner.Token tt = li[k].EndToken.Next;
                        if (tt == null)
                        {
                            ok = true;
                        }
                        else if (tt.IsCharOf(",."))
                        {
                            ok = true;
                        }
                        if (!ok)
                        {
                            ok = MiscLocationHelper.CheckGeoObjectBefore(li[0].BeginToken);
                        }
                        if (!ok)
                        {
                            Pullenti.Ner.Address.Internal.AddressItemToken adr = Pullenti.Ner.Address.Internal.AddressItemToken.TryParse(tt, null, false, false, null);
                            if (adr != null)
                            {
                                if (adr.Typ == Pullenti.Ner.Address.Internal.AddressItemToken.ItemType.Street)
                                {
                                    ok = true;
                                }
                            }
                        }
                        if (!ok)
                        {
                            break;
                        }
                    }
                    if (li[k].OntoItem != null)
                    {
                        if (noun.BeginToken.IsValue("МО", null) || noun.BeginToken.IsValue("ЛО", null))
                        {
                            return null;
                        }
                    }
                }
            }
            exObj = li[k];
        }
        else if (li[k].TerminItem != null)
        {
            // Item is a termin: either an adjective-like modifier or the type noun.
            if (noun != null)
            {
                break;
            }
            if (li[k].TerminItem.IsAlwaysPrefix && k > 0)
            {
                break;
            }
            if (k > 0 && li[k].IsDoubt)
            {
                if (li[k].BeginToken == li[k].EndToken && li[k].BeginToken.IsValue("ЗАО", null))
                {
                    break;
                }
            }
            if (li[k].TerminItem.IsAdjective || li[k].IsGeoInDictionary)
            {
                adjList.Add(li[k]);
            }
            else
            {
                if (exObj != null)
                {
                    Pullenti.Ner.Geo.GeoReferent geo = exObj.OntoItem.Referent as Pullenti.Ner.Geo.GeoReferent;
                    if (geo == null)
                    {
                        break;
                    }
                    if (exObj.IsAdjective && ((li[k].TerminItem.CanonicText == "СОЮЗ" || li[k].TerminItem.CanonicText == "ФЕДЕРАЦИЯ")))
                    {
                        string str = exObj.OntoItem.ToString();
                        if (!str.Contains(li[k].TerminItem.CanonicText))
                        {
                            return null;
                        }
                    }
                    if (li[k].TerminItem.CanonicText == "РАЙОН" || li[k].TerminItem.CanonicText == "ОКРУГ" || li[k].TerminItem.CanonicText == "КРАЙ")
                    {
                        // If the known object does not have this type, reinterpret the first
                        // item as a new adjective name instead of the ontology object.
                        StringBuilder tmp = new StringBuilder();
                        foreach (Pullenti.Ner.Slot s in geo.Slots)
                        {
                            if (s.TypeName == Pullenti.Ner.Geo.GeoReferent.ATTR_TYPE)
                            {
                                tmp.AppendFormat("{0};", s.Value);
                            }
                        }
                        if (!tmp.ToString().ToUpper().Contains(li[k].TerminItem.CanonicText))
                        {
                            if (k != 1 || newName != null)
                            {
                                break;
                            }
                            newName = li[0];
                            newName.IsAdjective = true;
                            newName.OntoItem = null;
                            exObj = null;
                        }
                    }
                }
                noun = li[k];
                if (k == 0)
                {
                    TerrItemToken tt = TerrItemToken.TryParse(li[k].BeginToken.Previous, null, true, false, null);
                    if (tt != null && tt.Morph.Class.IsAdjective)
                    {
                        adjTerrBefore = true;
                    }
                }
            }
        }
        else
        {
            // Neither ontology item nor termin: candidate for a new name.
            if (exObj != null)
            {
                break;
            }
            if (newName != null)
            {
                break;
            }
            newName = li[k];
        }
    }

    string name = null;
    string altName = null;
    string fullName = null;
    // NOTE: 'morph' is assigned below but not read later in this method.
    Pullenti.Ner.MorphCollection morph = null;

    if (exObj != null)
    {
        // ---- Case 1: an ontology-backed object was found. ----
        if (exObj.IsAdjective && !exObj.Morph.Language.IsEn && noun == null)
        {
            if (attachAlways && exObj.EndToken.Next != null)
            {
                Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(exObj.BeginToken, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                if (exObj.EndToken.Next.IsCommaAnd)
                {
                }
                else if (npt == null)
                {
                }
                else
                {
                    // Reject when the adjective's noun phrase ends exactly on a street noun.
                    Pullenti.Ner.Address.Internal.StreetItemToken str = Pullenti.Ner.Address.Internal.StreetItemToken.TryParse(exObj.EndToken.Next, null, false, null, false);
                    if (str != null)
                    {
                        if (str.Typ == Pullenti.Ner.Address.Internal.StreetItemType.Noun && str.EndToken == npt.EndToken)
                        {
                            return null;
                        }
                    }
                }
            }
            else
            {
                CityItemToken cit = CityItemToken.TryParse(exObj.EndToken.Next, null, false, null);
                if (cit != null && ((cit.Typ == CityItemToken.ItemType.Noun || cit.Typ == CityItemToken.ItemType.City)))
                {
                    Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(exObj.BeginToken, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                    if (npt != null && npt.EndToken == cit.EndToken)
                    {
                    }
                    else
                    {
                        return null;
                    }
                }
                else if (exObj.BeginToken.IsValue("ПОДНЕБЕСНЫЙ", null))
                {
                }
                else
                {
                    return null;
                }
            }
        }
        if (noun == null && exObj.CanBeCity)
        {
            CityItemToken cit0 = CityItemToken.TryParseBack(exObj.BeginToken.Previous);
            if (cit0 != null && cit0.Typ != CityItemToken.ItemType.ProperName)
            {
                return null;
            }
        }
        if (exObj.IsDoubt && noun == null)
        {
            // A doubtful object without a type noun needs confirming context.
            bool ok2 = false;
            if (_canBeGeoAfter(exObj.EndToken.Next))
            {
                ok2 = true;
            }
            else if (!exObj.CanBeSurname && !exObj.CanBeCity)
            {
                if ((exObj.EndToken.Next != null && exObj.EndToken.Next.IsChar(')') && exObj.BeginToken.Previous != null) && exObj.BeginToken.Previous.IsChar('('))
                {
                    ok2 = true;
                }
                else if (exObj.Chars.IsLatinLetter && exObj.BeginToken.Previous != null)
                {
                    if (exObj.BeginToken.Previous.IsValue("IN", null))
                    {
                        ok2 = true;
                    }
                    else if (exObj.BeginToken.Previous.IsValue("THE", null) && exObj.BeginToken.Previous.Previous != null && exObj.BeginToken.Previous.Previous.IsValue("IN", null))
                    {
                        ok2 = true;
                    }
                }
            }
            if (!ok2)
            {
                CityItemToken cit0 = CityItemToken.TryParseBack(exObj.BeginToken.Previous);
                if (cit0 != null && cit0.Typ != CityItemToken.ItemType.ProperName)
                {
                }
                else if (MiscLocationHelper.CheckGeoObjectBefore(exObj.BeginToken.Previous))
                {
                }
                else
                {
                    return null;
                }
            }
        }
        name = exObj.OntoItem.CanonicText;
        morph = exObj.Morph;
    }
    else if (newName != null)
    {
        // ---- Case 2: a new (non-ontology) name; a type noun is mandatory. ----
        if (noun == null)
        {
            return null;
        }
        // A line break inside the sequence is tolerated only before a bracket-like start.
        for (int j = 1; j < k; j++)
        {
            if (li[j].IsNewlineBefore && !li[0].IsNewlineBefore)
            {
                if (Pullenti.Ner.Core.BracketHelper.CanBeStartOfSequence(li[j].BeginToken, false, false))
                {
                }
                else
                {
                    return null;
                }
            }
        }
        morph = noun.Morph;
        if (newName.IsAdjective)
        {
            if (noun.TerminItem.Acronym == "АО")
            {
                if (noun.BeginToken != noun.EndToken)
                {
                    return null;
                }
                if (newName.Morph.Gender != Pullenti.Morph.MorphGender.Feminie)
                {
                    return null;
                }
            }
            // Geo referent immediately before the sequence (optionally across a comma/"and").
            Pullenti.Ner.Geo.GeoReferent geoBefore = null;
            Pullenti.Ner.Token tt0 = li[0].BeginToken.Previous;
            if (tt0 != null && tt0.IsCommaAnd)
            {
                tt0 = tt0.Previous;
            }
            if (!li[0].IsNewlineBefore && tt0 != null)
            {
                geoBefore = tt0.GetReferent() as Pullenti.Ner.Geo.GeoReferent;
            }
            if (li.IndexOf(noun) < li.IndexOf(newName))
            {
                // Noun precedes the adjective name.
                if (noun.TerminItem.IsState)
                {
                    return null;
                }
                if (newName.CanBeSurname && geoBefore == null)
                {
                    if (((noun.Morph.Case & newName.Morph.Case)).IsUndefined)
                    {
                        return null;
                    }
                }
                if (Pullenti.Ner.Core.MiscHelper.IsExistsInDictionary(newName.BeginToken, newName.EndToken, Pullenti.Morph.MorphClass.Adjective | Pullenti.Morph.MorphClass.Pronoun | Pullenti.Morph.MorphClass.Verb))
                {
                    if (noun.BeginToken != newName.BeginToken)
                    {
                        if (geoBefore == null)
                        {
                            if (li.Count == 2 && _canBeGeoAfter(li[1].EndToken.Next))
                            {
                            }
                            else if (li.Count == 3 && li[2].TerminItem != null && _canBeGeoAfter(li[2].EndToken.Next))
                            {
                            }
                            else if (newName.IsGeoInDictionary)
                            {
                            }
                            else if (newName.EndToken.IsNewlineAfter)
                            {
                            }
                            else
                            {
                                return null;
                            }
                        }
                    }
                }
                // The name must not continue into a longer noun phrase, unless that phrase
                // ends on a third termin item, which then becomes the additional noun.
                Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(newName.EndToken, Pullenti.Ner.Core.NounPhraseParseAttr.ParsePronouns, 0, null);
                if (npt != null && npt.EndToken != newName.EndToken)
                {
                    if (li.Count >= 3 && li[2].TerminItem != null && npt.EndToken == li[2].EndToken)
                    {
                        addNoun = li[2];
                    }
                    else
                    {
                        return null;
                    }
                }
                Pullenti.Ner.ReferentToken rtp = newName.Kit.ProcessReferent("PERSON", newName.BeginToken);
                if (rtp != null)
                {
                    return null;
                }
                name = Pullenti.Ner.Core.ProperNameHelper.GetNameEx(newName.BeginToken, newName.EndToken, Pullenti.Morph.MorphClass.Adjective, Pullenti.Morph.MorphCase.Undefined, noun.TerminItem.Gender, false, false);
            }
            else
            {
                // Adjective name precedes (or coincides with) the noun: collect evidence in 'ok'.
                bool ok = false;
                if (((k + 1) < li.Count) && li[k].TerminItem == null && li[k + 1].TerminItem != null)
                {
                    ok = true;
                }
                else if ((k < li.Count) && li[k].OntoItem != null)
                {
                    ok = true;
                }
                else if (k == li.Count && !newName.IsAdjInDictionary)
                {
                    ok = true;
                }
                else if (MiscLocationHelper.CheckGeoObjectBefore(li[0].BeginToken) || canBeCityBefore)
                {
                    ok = true;
                }
                else if (MiscLocationHelper.CheckGeoObjectAfter(li[k - 1].EndToken, false))
                {
                    ok = true;
                }
                else if (li.Count == 3 && k == 2)
                {
                    CityItemToken cit = CityItemToken.TryParse(li[2].BeginToken, null, false, null);
                    if (cit != null)
                    {
                        if (cit.Typ == CityItemToken.ItemType.City || cit.Typ == CityItemToken.ItemType.Noun)
                        {
                            ok = true;
                        }
                    }
                }
                else if (li.Count == 2)
                {
                    ok = _canBeGeoAfter(li[li.Count - 1].EndToken.Next);
                }
                if (!ok && !li[0].IsNewlineBefore && !li[0].Chars.IsAllLower)
                {
                    Pullenti.Ner.ReferentToken rt00 = li[0].Kit.ProcessReferent("PERSONPROPERTY", li[0].BeginToken.Previous);
                    if (rt00 != null)
                    {
                        ok = true;
                    }
                }
                if (noun.TerminItem != null && noun.TerminItem.IsStrong && newName.IsAdjective)
                {
                    ok = true;
                }
                if (noun.IsDoubt && adjList.Count == 0 && geoBefore == null)
                {
                    return null;
                }
                name = Pullenti.Ner.Core.ProperNameHelper.GetNameEx(newName.BeginToken, newName.EndToken, Pullenti.Morph.MorphClass.Adjective, Pullenti.Morph.MorphCase.Undefined, noun.TerminItem.Gender, false, false);
                if (!ok && !attachAlways)
                {
                    if (Pullenti.Ner.Core.MiscHelper.IsExistsInDictionary(newName.BeginToken, newName.EndToken, Pullenti.Morph.MorphClass.Adjective | Pullenti.Morph.MorphClass.Pronoun | Pullenti.Morph.MorphClass.Verb))
                    {
                        // A dictionary word is accepted only if an already known referent has this name.
                        if (exists != null)
                        {
                            foreach (Pullenti.Ner.Geo.GeoReferent e in exists)
                            {
                                if (e.FindSlot(Pullenti.Ner.Geo.GeoReferent.ATTR_NAME, name, true) != null)
                                {
                                    ok = true;
                                    break;
                                }
                            }
                        }
                        if (!ok)
                        {
                            return null;
                        }
                    }
                }
                fullName = string.Format("{0} {1}", Pullenti.Ner.Core.ProperNameHelper.GetNameEx(li[0].BeginToken, noun.BeginToken.Previous, Pullenti.Morph.MorphClass.Adjective, Pullenti.Morph.MorphCase.Undefined, noun.TerminItem.Gender, false, false), noun.TerminItem.CanonicText);
            }
        }
        else
        {
            // The new name is not an adjective.
            if (!attachAlways || ((noun.TerminItem != null && noun.TerminItem.CanonicText == "ФЕДЕРАЦИЯ")))
            {
                bool isLatin = noun.Chars.IsLatinLetter && newName.Chars.IsLatinLetter;
                if (li.IndexOf(noun) > li.IndexOf(newName))
                {
                    if (!isLatin)
                    {
                        return null;
                    }
                }
                if (!newName.IsDistrictName && !Pullenti.Ner.Core.BracketHelper.CanBeStartOfSequence(newName.BeginToken, false, false))
                {
                    if (adjList.Count == 0 && Pullenti.Ner.Core.MiscHelper.IsExistsInDictionary(newName.BeginToken, newName.EndToken, Pullenti.Morph.MorphClass.Noun | Pullenti.Morph.MorphClass.Pronoun))
                    {
                        if (li.Count == 2 && noun.IsCityRegion && (noun.WhitespacesAfterCount < 2))
                        {
                        }
                        else
                        {
                            return null;
                        }
                    }
                    if (!isLatin)
                    {
                        if ((noun.TerminItem.IsRegion && !attachAlways && ((!adjTerrBefore || newName.IsDoubt))) && !noun.IsCityRegion && !noun.TerminItem.IsSpecificPrefix)
                        {
                            if (!MiscLocationHelper.CheckGeoObjectBefore(noun.BeginToken))
                            {
                                if (!noun.IsDoubt && noun.BeginToken != noun.EndToken)
                                {
                                }
                                else if ((noun.TerminItem.IsAlwaysPrefix && li.Count == 2 && li[0] == noun) && li[1] == newName)
                                {
                                }
                                else
                                {
                                    return null;
                                }
                            }
                        }
                        if (noun.IsDoubt && adjList.Count == 0)
                        {
                            if (noun.TerminItem.Acronym == "МО" || noun.TerminItem.Acronym == "ЛО")
                            {
                                if (k == (li.Count - 1) && li[k].TerminItem != null)
                                {
                                    // Absorb the trailing termin as an additional noun.
                                    addNoun = li[k];
                                    k++;
                                }
                                else if (li.Count == 2 && noun == li[0] && newName.ToString().EndsWith("совет"))
                                {
                                }
                                else
                                {
                                    return null;
                                }
                            }
                            else
                            {
                                return null;
                            }
                        }
                        Pullenti.Ner.ReferentToken pers = newName.Kit.ProcessReferent("PERSON", newName.BeginToken);
                        if (pers != null)
                        {
                            return null;
                        }
                    }
                }
            }
            name = Pullenti.Ner.Core.MiscHelper.GetTextValue(newName.BeginToken, newName.EndToken, Pullenti.Ner.Core.GetTextAttr.No);
            if (newName.BeginToken != newName.EndToken)
            {
                // Cut the name before an embedded type word that overlaps with the noun's type.
                for (Pullenti.Ner.Token ttt = newName.BeginToken.Next; ttt != null && ttt.EndChar <= newName.EndChar; ttt = ttt.Next)
                {
                    if (ttt.Chars.IsLetter)
                    {
                        TerrItemToken ty = TerrItemToken.TryParse(ttt, null, false, false, null);
                        if ((ty != null && ty.TerminItem != null && noun != null) && ((ty.TerminItem.CanonicText.Contains(noun.TerminItem.CanonicText) || noun.TerminItem.CanonicText.Contains(ty.TerminItem.CanonicText))))
                        {
                            name = Pullenti.Ner.Core.MiscHelper.GetTextValue(newName.BeginToken, ttt.Previous, Pullenti.Ner.Core.GetTextAttr.No);
                            break;
                        }
                    }
                }
            }
            if (adjList.Count > 0)
            {
                Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(adjList[0].BeginToken, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                if (npt != null && npt.EndToken == noun.EndToken)
                {
                    altName = string.Format("{0} {1}", npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false), name);
                }
            }
        }
    }
    else
    {
        // ---- Case 3: neither a known object nor a new name; only a type noun may be present. ----
        if ((li.Count == 1 && noun != null && noun.EndToken.Next != null) && (noun.EndToken.Next.GetReferent() is Pullenti.Ner.Geo.GeoReferent))
        {
            // The noun is directly followed by a geo referent of the same type: extend that referent.
            Pullenti.Ner.Geo.GeoReferent g = noun.EndToken.Next.GetReferent() as Pullenti.Ner.Geo.GeoReferent;
            if (noun.TerminItem != null)
            {
                string tyy = noun.TerminItem.CanonicText.ToLower();
                bool ooo = false;
                if (g.FindSlot(Pullenti.Ner.Geo.GeoReferent.ATTR_TYPE, tyy, true) != null)
                {
                    ooo = true;
                }
                else if (tyy.EndsWith("район") && g.FindSlot(Pullenti.Ner.Geo.GeoReferent.ATTR_TYPE, "район", true) != null)
                {
                    ooo = true;
                }
                if (ooo)
                {
                    return new Pullenti.Ner.ReferentToken(g, noun.BeginToken, noun.EndToken.Next) { Morph = noun.BeginToken.Morph };
                }
            }
        }
        if ((li.Count == 1 && noun == li[0] && li[0].TerminItem != null) && TerrItemToken.TryParse(li[0].EndToken.Next, null, true, false, null) == null && TerrItemToken.TryParse(li[0].BeginToken.Previous, null, true, false, null) == null)
        {
            if (li[0].Morph.Number == Pullenti.Morph.MorphNumber.Plural)
            {
                return null;
            }
            // Walk backwards (budget: 500 units, a line break costs 10) to the nearest geo referent.
            int cou = 0;
            string str = li[0].TerminItem.CanonicText.ToLower();
            for (Pullenti.Ner.Token tt = li[0].BeginToken.Previous; tt != null; tt = tt.Previous)
            {
                if (tt.IsNewlineAfter)
                {
                    cou += 10;
                }
                else
                {
                    cou++;
                }
                if (cou > 500)
                {
                    break;
                }
                Pullenti.Ner.Geo.GeoReferent g = tt.GetReferent() as Pullenti.Ner.Geo.GeoReferent;
                if (g == null)
                {
                    continue;
                }
                // Found one. Now look forward with the same budget: any territory item after
                // the noun cancels the back-reference. The loop variable is deliberately reused
                // because the outer loop always exits right after this block.
                bool ok = true;
                cou = 0;
                for (tt = li[0].EndToken.Next; tt != null; tt = tt.Next)
                {
                    if (tt.IsNewlineBefore)
                    {
                        cou += 10;
                    }
                    else
                    {
                        cou++;
                    }
                    if (cou > 500)
                    {
                        break;
                    }
                    TerrItemToken tee = TerrItemToken.TryParse(tt, null, true, false, null);
                    if (tee == null)
                    {
                        continue;
                    }
                    ok = false;
                    break;
                }
                if (ok)
                {
                    // Check the referent and up to two of its Higher ancestors for a matching type.
                    for (int ii = 0; g != null && (ii < 3); g = g.Higher, ii++)
                    {
                        if (g.FindSlot(Pullenti.Ner.Geo.GeoReferent.ATTR_TYPE, str, true) != null)
                        {
                            return new Pullenti.Ner.ReferentToken(g, li[0].BeginToken, li[0].EndToken) { Morph = noun.BeginToken.Morph };
                        }
                    }
                }
                break;
            }
        }
        return null;
    }

    // ---- Build (or reuse) the resulting GeoReferent. ----
    Pullenti.Ner.Geo.GeoReferent ter = null;
    if (exObj != null && (exObj.Tag is Pullenti.Ner.Geo.GeoReferent))
    {
        ter = exObj.Tag as Pullenti.Ner.Geo.GeoReferent;
    }
    else
    {
        ter = new Pullenti.Ner.Geo.GeoReferent();
        if (exObj != null)
        {
            Pullenti.Ner.Geo.GeoReferent geo = exObj.OntoItem.Referent as Pullenti.Ner.Geo.GeoReferent;
            if (geo != null && !geo.IsCity)
            {
                ter.MergeSlots2(geo, li[0].Kit.BaseLanguage);
            }
            else
            {
                ter.AddName(name);
            }
            if (noun == null && exObj.CanBeCity)
            {
                ter.AddTypCity(li[0].Kit.BaseLanguage);
            }
            else
            {
            }
        }
        else if (newName != null)
        {
            ter.AddName(name);
            if (altName != null)
            {
                ter.AddName(altName);
            }
        }
        if (noun != null)
        {
            if (noun.TerminItem.CanonicText == "АО")
            {
                ter.AddTyp((li[0].Kit.BaseLanguage.IsUa ? "АВТОНОМНИЙ ОКРУГ" : "АВТОНОМНЫЙ ОКРУГ"));
            }
            else if (noun.TerminItem.CanonicText == "МУНИЦИПАЛЬНОЕ СОБРАНИЕ" || noun.TerminItem.CanonicText == "МУНІЦИПАЛЬНЕ ЗБОРИ")
            {
                ter.AddTyp((li[0].Kit.BaseLanguage.IsUa ? "МУНІЦИПАЛЬНЕ УТВОРЕННЯ" : "МУНИЦИПАЛЬНОЕ ОБРАЗОВАНИЕ"));
            }
            else if (noun.TerminItem.Acronym == "МО" && addNoun != null)
            {
                ter.AddTyp(addNoun.TerminItem.CanonicText);
            }
            else
            {
                if (noun.TerminItem.CanonicText == "СОЮЗ" && exObj != null && exObj.EndChar > noun.EndChar)
                {
                    return new Pullenti.Ner.ReferentToken(ter, exObj.BeginToken, exObj.EndToken) { Morph = exObj.Morph };
                }
                ter.AddTyp(noun.TerminItem.CanonicText);
                if (noun.TerminItem.IsRegion && ter.IsState)
                {
                    ter.AddTypReg(li[0].Kit.BaseLanguage);
                }
            }
        }
        if (ter.IsState && ter.IsRegion)
        {
            foreach (TerrItemToken a in adjList)
            {
                if (a.TerminItem.IsRegion)
                {
                    ter.AddTypReg(li[0].Kit.BaseLanguage);
                    break;
                }
            }
        }
        if (ter.IsState)
        {
            if (fullName != null)
            {
                ter.AddName(fullName);
            }
        }
    }

    // ---- Wrap into a ReferentToken spanning the consumed items. ----
    Pullenti.Ner.ReferentToken res = new Pullenti.Ner.ReferentToken(ter, li[0].BeginToken, li[k - 1].EndToken);
    if (noun != null && noun.Morph.Class.IsNoun)
    {
        res.Morph = noun.Morph;
    }
    else
    {
        // Otherwise merge copies of the items' morphology; with a noun present,
        // adjective variants are re-labelled as nouns.
        res.Morph = new Pullenti.Ner.MorphCollection();
        for (int ii = 0; ii < k; ii++)
        {
            foreach (Pullenti.Morph.MorphBaseInfo v in li[ii].Morph.Items)
            {
                Pullenti.Morph.MorphBaseInfo bi = new Pullenti.Morph.MorphBaseInfo();
                bi.CopyFrom(v);
                if (noun != null)
                {
                    if (bi.Class.IsAdjective)
                    {
                        bi.Class = Pullenti.Morph.MorphClass.Noun;
                    }
                }
                res.Morph.AddItem(bi);
            }
        }
    }
    if (li[0].TerminItem != null && li[0].TerminItem.IsSpecificPrefix)
    {
        // A specific prefix is excluded from the resulting span.
        res.BeginToken = li[0].EndToken.Next;
    }
    if (addNoun != null && addNoun.EndChar > res.EndChar)
    {
        res.EndToken = addNoun.EndToken;
    }
    if ((res.BeginToken.Previous is Pullenti.Ner.TextToken) && (res.WhitespacesBeforeCount < 2))
    {
        // Pull a preceding "АР" into the span when the territory has a republic type.
        Pullenti.Ner.TextToken tt = res.BeginToken.Previous as Pullenti.Ner.TextToken;
        if (tt.Term == "АР")
        {
            foreach (string ty in ter.Typs)
            {
                if (ty.Contains("республика") || ty.Contains("республіка"))
                {
                    res.BeginToken = tt;
                    break;
                }
            }
        }
    }
    return res;
}