/// <summary>
/// Scores the token span BeginToken..EndToken as a candidate title name.
/// Rank is reset to 0 and then adjusted by bonuses and penalties while the tokens are walked.
/// On success the name boundaries are stored in BeginNameToken / EndNameToken.
/// TypeValue and Speciality may be assigned along the way, including on paths that later return false.
/// </summary>
/// <param name="minNewlinesCount">
/// Reference number of line breaks: a non-first token preceded by more line breaks than this
/// lowers Rank by the excess.
/// </param>
/// <returns>false when the span is rejected as a title name; true otherwise.</returns>
bool CalcRankAndValue(int minNewlinesCount)
{
    Rank = 0;
    if (BeginToken.Chars.IsAllLower)
    {
        Rank -= 30;
    }

    int words = 0;      // tokens counted as words (added to Rank at the end)
    int upWords = 0;    // counted words whose Chars.IsAllUpper is set
    int notwords = 0;   // non-word tokens (subtracted from Rank at the end)
    int lineNumber = 0; // line breaks met inside the span
    Pullenti.Ner.Token tstart = BeginToken;
    // The end of the name never moves away from EndToken; only the start can be shifted.
    Pullenti.Ner.Token tend = EndToken;

    for (Pullenti.Ner.Token t = BeginToken; t != EndToken.Next && t != null && t.EndChar <= EndToken.EndChar; t = t.Next)
    {
        TitleItemToken tit = TitleItemToken.TryAttach(t);
        if (tit != null)
        {
            if (tit.Typ == TitleItemToken.Types.Theme || tit.Typ == TitleItemToken.Types.TypAndTheme)
            {
                if (t != BeginToken)
                {
                    if (lineNumber > 0)
                    {
                        return false;
                    }
                    // A theme marker in the middle of the first line restarts the counters.
                    words = 0;
                    upWords = 0;
                    notwords = 0;
                }
                t = tit.EndToken;
                if (t.Next == null)
                {
                    return false;
                }
                if (t.Next.Chars.IsLetter && t.Next.Chars.IsAllLower)
                {
                    Rank += 20;
                }
                else
                {
                    Rank += 100;
                }
                // The name starts right after the theme marker.
                tstart = t.Next;
                if (tit.Typ == TitleItemToken.Types.TypAndTheme)
                {
                    TypeValue = tit.Value;
                }
                continue;
            }

            if (tit.Typ == TitleItemToken.Types.Typ)
            {
                if (t == BeginToken && tit.EndToken.IsNewlineAfter)
                {
                    // A type item alone on the first line is taken as the type, not as part of the name.
                    TypeValue = tit.Value;
                    Rank += 5;
                    tstart = tit.EndToken.Next;
                }
                t = tit.EndToken;
                words++;
                if (tit.BeginToken != tit.EndToken)
                {
                    words++;
                }
                if (tit.Chars.IsAllUpper)
                {
                    upWords++;
                }
                continue;
            }

            if (tit.Typ == TitleItemToken.Types.Dust || tit.Typ == TitleItemToken.Types.Speciality)
            {
                if (t == BeginToken)
                {
                    return false;
                }
                Rank -= 20;
                if (tit.Typ == TitleItemToken.Types.Speciality)
                {
                    Speciality = tit.Value;
                }
                t = tit.EndToken;
                continue;
            }

            if (tit.Typ == TitleItemToken.Types.Consultant || tit.Typ == TitleItemToken.Types.Boss || tit.Typ == TitleItemToken.Types.Editor)
            {
                t = tit.EndToken;
                if (t.Next != null && (t.Next.IsCharOf(":") || t.Next.IsHiphen || t.WhitespacesAfterCount > 4))
                {
                    Rank -= 10;
                }
                else
                {
                    Rank -= 2;
                }
                continue;
            }

            // Any other recognised title item disqualifies the span.
            return false;
        }

        Pullenti.Ner.Booklink.Internal.BookLinkToken blt = Pullenti.Ner.Booklink.Internal.BookLinkToken.TryParse(t, 0);
        if (blt != null)
        {
            // NOTE(review): the original also tested BookLinkTyp.N in the -20 branch, where it could
            // never match because N is already handled by the -10 branch. The effective behaviour
            // (N => -10) is kept; confirm which penalty was intended.
            if (blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.Misc
                || blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.N
                || blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.Pages)
            {
                Rank -= 10;
            }
            else if (blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.PageRange)
            {
                Rank -= 20;
            }
        }

        if (t == BeginToken && Pullenti.Ner.Booklink.Internal.BookLinkToken.TryParseAuthor(t, Pullenti.Ner.Person.Internal.FioTemplateType.Undefined) != null)
        {
            Rank -= 20;
        }

        if (t.IsNewlineBefore && t != BeginToken)
        {
            lineNumber++;
            if (lineNumber > 4)
            {
                return false;
            }
            if (t.Chars.IsAllLower)
            {
                Rank += 10;
            }
            else if (t.Previous.IsChar('.'))
            {
                Rank -= 10;
            }
            else if (t.Previous.IsCharOf(",-"))
            {
                Rank += 10;
            }
            else
            {
                // Bonus when a noun phrase starting on the previous token reaches across the line break.
                Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t.Previous, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                if (npt != null && npt.EndChar >= t.EndChar)
                {
                    Rank += 10;
                }
            }
        }

        if (t != BeginToken && t.NewlinesBeforeCount > minNewlinesCount)
        {
            Rank -= (t.NewlinesBeforeCount - minNewlinesCount);
        }

        Pullenti.Ner.Core.BracketSequenceToken bst = Pullenti.Ner.Core.BracketHelper.TryParse(t, Pullenti.Ner.Core.BracketParseAttr.No, 100);
        if (bst != null && bst.IsQuoteType && bst.EndToken.EndChar <= EndToken.EndChar && words == 0)
        {
            // A quoted sequence met before any word has been counted becomes the start of the name.
            tstart = bst.BeginToken;
            Rank += 10;
            if (bst.EndToken == EndToken)
            {
                Rank += 10;
            }
        }

        List<Pullenti.Ner.Referent> rli = t.GetReferents();
        if (rli != null)
        {
            foreach (Pullenti.Ner.Referent r in rli)
            {
                if (r is Pullenti.Ner.Org.OrganizationReferent)
                {
                    if (t.IsNewlineBefore)
                    {
                        Rank -= 10;
                    }
                    else
                    {
                        Rank -= 4;
                    }
                    continue;
                }

                if ((r is Pullenti.Ner.Geo.GeoReferent) || (r is Pullenti.Ner.Person.PersonReferent))
                {
                    if (t.IsNewlineBefore)
                    {
                        Rank -= 5;
                        if (t.IsNewlineAfter || t.Next == null)
                        {
                            Rank -= 20;
                        }
                        else if (t.Next.IsHiphen || (t.Next is Pullenti.Ner.NumberToken) || (t.Next.GetReferent() is Pullenti.Ner.Date.DateReferent))
                        {
                            Rank -= 20;
                        }
                        else if (t != BeginToken)
                        {
                            Rank -= 20;
                        }
                    }
                    continue;
                }

                // GeoReferent was repeated in the original condition here but is always consumed above.
                if (r is Pullenti.Ner.Denomination.DenominationReferent)
                {
                    continue;
                }

                if ((r is Pullenti.Ner.Uri.UriReferent) || (r is Pullenti.Ner.Phone.PhoneReferent))
                {
                    return false;
                }

                if (t.IsNewlineBefore)
                {
                    Rank -= 4;
                }
                else
                {
                    Rank -= 2;
                }
                if (t == BeginToken && (EndToken.GetReferent() is Pullenti.Ner.Person.PersonReferent))
                {
                    Rank -= 10;
                }
            }

            words++;
            if (t.Chars.IsAllUpper)
            {
                upWords++;
            }
            if (t == BeginToken)
            {
                if (t.IsNewlineAfter)
                {
                    Rank -= 10;
                }
                else if (t.Next != null && t.Next.IsChar('.') && t.Next.IsNewlineAfter)
                {
                    Rank -= 10;
                }
            }
            continue;
        }

        if (t is Pullenti.Ner.NumberToken)
        {
            if (((Pullenti.Ner.NumberToken)t).Typ == Pullenti.Ner.NumberSpellingType.Words)
            {
                words++;
                if (t.Chars.IsAllUpper)
                {
                    upWords++;
                }
            }
            else
            {
                notwords++;
            }
            continue;
        }

        Pullenti.Ner.Person.Internal.PersonAttrToken pat = Pullenti.Ner.Person.Internal.PersonAttrToken.TryAttach(t, null, Pullenti.Ner.Person.Internal.PersonAttrToken.PersonAttrAttachAttrs.No);
        if (pat != null)
        {
            if (t.IsNewlineBefore)
            {
                // Penalised only when the case is undefined or nominative and the attribute is not all upper case.
                if ((pat.Morph.Case.IsUndefined || pat.Morph.Case.IsNominative) && !pat.Chars.IsAllUpper)
                {
                    Rank -= 20;
                }
            }
            else if (t.Chars.IsAllLower)
            {
                Rank--;
            }

            // Count every token of the attribute and leave t on its last token.
            for (; t != null; t = t.Next)
            {
                words++;
                if (t.Chars.IsAllUpper)
                {
                    upWords++;
                }
                if (t == pat.EndToken)
                {
                    break;
                }
            }
            if (t == null)
            {
                // pat.EndToken was not met on the Next chain; stop here, because the loop
                // increment below would otherwise dereference null.
                break;
            }
            continue;
        }

        Pullenti.Ner.Org.Internal.OrgItemTypeToken oitt = Pullenti.Ner.Org.Internal.OrgItemTypeToken.TryAttach(t, true, null);
        if (oitt != null)
        {
            bool singularSureRoot = oitt.Morph.Number != Pullenti.Morph.MorphNumber.Plural && !oitt.IsDoubtRootWord;
            if (singularSureRoot && (oitt.Morph.Case.IsUndefined || oitt.Morph.Case.IsNominative))
            {
                Rank -= 4;
                if (t == BeginToken)
                {
                    Rank -= 5;
                }
            }
            else
            {
                words++;
                if (t.Chars.IsAllUpper)
                {
                    upWords++;
                }
            }
            t = oitt.EndToken;
            continue;
        }

        Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
        if (tt != null)
        {
            if (tt.IsChar('©'))
            {
                Rank -= 10;
            }
            if (tt.IsChar('_'))
            {
                Rank--;
            }

            if (tt.Chars.IsLetter)
            {
                if (tt.LengthChar > 2)
                {
                    words++;
                    if (t.Chars.IsAllUpper)
                    {
                        upWords++;
                    }
                }
            }
            else if (!tt.IsChar(','))
            {
                notwords++;
            }

            if (tt.IsPureVerb)
            {
                // A pure verb ends the scan with a heavy penalty.
                Rank -= 30;
                words--;
                break;
            }

            if (tt == EndToken)
            {
                if (tt.Morph.Class.IsPreposition || tt.Morph.Class.IsConjunction)
                {
                    Rank -= 10;
                }
                else if (tt.IsChar('.'))
                {
                    Rank += 5;
                }
            }
            else if (tt.IsCharOf("._"))
            {
                Rank -= 5;
            }
        }
    }

    Rank += words;
    Rank -= notwords;

    if ((words < 1) && (Rank < 50))
    {
        return false;
    }
    if (tstart == null || tend == null)
    {
        return false;
    }
    if (tstart.EndChar > tend.EndChar)
    {
        return false;
    }

    // A type or speciality item right after the span supports the candidate.
    TitleItemToken tit1 = TitleItemToken.TryAttach(EndToken.Next);
    if (tit1 != null && (tit1.Typ == TitleItemToken.Types.Typ || tit1.Typ == TitleItemToken.Types.Speciality))
    {
        if (tit1.EndToken.IsNewlineAfter)
        {
            Rank += 15;
        }
        else
        {
            Rank += 10;
        }
        if (tit1.Typ == TitleItemToken.Types.Speciality)
        {
            Speciality = tit1.Value;
        }
    }

    // Mostly upper-case name directly preceded by a person: extra bonus.
    if (upWords > 4 && upWords > (int)(0.8 * words))
    {
        if (tstart.Previous != null && (tstart.Previous.GetReferent() is Pullenti.Ner.Person.PersonReferent))
        {
            Rank += (5 + upWords);
        }
    }

    BeginNameToken = tstart;
    EndNameToken = tend;
    return true;
}
/// <summary>
/// Builds a TitlePageReferent from the tokens starting at <paramref name="begin"/>.
/// The leading lines are parsed, title-name candidates are built from runs of up to five lines,
/// and the first candidate after TitleNameToken.Sort is used as the name (a Russian candidate with
/// positive Rank is preferred when the first one is English). The token chain is then scanned for
/// types, speciality, persons, date, city and organization.
/// </summary>
/// <param name="begin">First token to analyse.</param>
/// <param name="maxCharPos">When positive, tokens starting beyond this character position are not examined.</param>
/// <param name="kit">
/// Kit used to embed the created referent tokens. May be null, in which case the name occurrence
/// is only recorded on the referent and the repeated-name search is skipped.
/// </param>
/// <param name="endToken">Starts as <paramref name="begin"/> and is advanced to the furthest token attributed to the title page.</param>
/// <returns>The filled referent, or null when no lines were parsed or no slot was filled.</returns>
internal static TitlePageReferent _process(Pullenti.Ner.Token begin, int maxCharPos, Pullenti.Ner.Core.AnalysisKit kit, out Pullenti.Ner.Token endToken)
{
    endToken = begin;
    TitlePageReferent res = new TitlePageReferent();
    Pullenti.Ner.Core.Termin term = null;

    List<Pullenti.Ner.Titlepage.Internal.Line> lines = Pullenti.Ner.Titlepage.Internal.Line.Parse(begin, 30, 1500, maxCharPos);
    if (lines == null || lines.Count < 1)
    {
        return null;
    }

    // cou = number of leading lines that belong to the title page (everything before the first
    // line that can start the text or the contents). Also gather how often each line gap occurs.
    int cou = lines.Count;
    int minNewlinesCount = 10;
    Dictionary<int, int> linesCountStat = new Dictionary<int, int>();
    for (int i = 0; i < lines.Count; i++)
    {
        if (Pullenti.Ner.Titlepage.Internal.TitleNameToken.CanBeStartOfTextOrContent(lines[i].BeginToken, lines[i].EndToken))
        {
            cou = i;
            break;
        }
        int gap = lines[i].NewlinesBeforeCount;
        if (i > 0 && gap > 0)
        {
            int seen;
            linesCountStat.TryGetValue(gap, out seen); // seen stays 0 for a new gap value
            linesCountStat[gap] = seen + 1;
        }
    }

    // The most frequent gap becomes the reference line spacing.
    int max = 0;
    foreach (KeyValuePair<int, int> kp in linesCountStat)
    {
        if (kp.Value > max)
        {
            max = kp.Value;
            minNewlinesCount = kp.Key;
        }
    }

    int endChar = (cou > 0 ? lines[cou - 1].EndChar : 0);
    if (maxCharPos > 0 && endChar > maxCharPos)
    {
        endChar = maxCharPos;
    }

    // Try every run of 1..5 consecutive lines as a title-name candidate.
    List<Pullenti.Ner.Titlepage.Internal.TitleNameToken> names = new List<Pullenti.Ner.Titlepage.Internal.TitleNameToken>();
    for (int i = 0; i < cou; i++)
    {
        for (int j = i; (j < cou) && (j < (i + 5)); j++)
        {
            if (j > i)
            {
                // Do not join lines across a language switch or an unusually large gap.
                if (lines[j - 1].IsPureEn && lines[j].IsPureRu)
                {
                    break;
                }
                if (lines[j - 1].IsPureRu && lines[j].IsPureEn)
                {
                    break;
                }
                if (lines[j].NewlinesBeforeCount >= (minNewlinesCount * 2))
                {
                    break;
                }
            }

            Pullenti.Ner.Titlepage.Internal.TitleNameToken ttt = Pullenti.Ner.Titlepage.Internal.TitleNameToken.TryParse(lines[i].BeginToken, lines[j].EndToken, minNewlinesCount);
            if (ttt == null)
            {
                continue;
            }
            if (lines[i].IsPureEn)
            {
                ttt.Morph.Language = Pullenti.Morph.MorphLang.EN;
            }
            else if (lines[i].IsPureRu)
            {
                ttt.Morph.Language = Pullenti.Morph.MorphLang.RU;
            }
            names.Add(ttt);
        }
    }
    Pullenti.Ner.Titlepage.Internal.TitleNameToken.Sort(names);

    Pullenti.Ner.ReferentToken nameRt = null;
    if (names.Count > 0)
    {
        int i0 = 0;
        if (names[i0].Morph.Language.IsEn)
        {
            // Prefer the first Russian candidate with a positive rank over an English one.
            for (int ii = 1; ii < names.Count; ii++)
            {
                if (names[ii].Morph.Language.IsRu && names[ii].Rank > 0)
                {
                    i0 = ii;
                    break;
                }
            }
        }

        Pullenti.Ner.Titlepage.Internal.TitleNameToken best = names[i0];
        term = res.AddName(best.BeginNameToken, best.EndNameToken);
        if (best.TypeValue != null)
        {
            res.AddType(best.TypeValue);
        }
        if (best.Speciality != null)
        {
            res.Speciality = best.Speciality;
        }

        Pullenti.Ner.ReferentToken rt = new Pullenti.Ner.ReferentToken(res, best.BeginToken, best.EndToken);
        if (kit != null)
        {
            kit.EmbedToken(rt);
        }
        else
        {
            res.AddOccurence(new Pullenti.Ner.TextAnnotation(rt.BeginToken, rt.EndToken));
        }
        endToken = rt.EndToken;
        nameRt = rt;
        if (begin.BeginChar == rt.BeginChar)
        {
            // The name starts the fragment, so continue from the embedded token.
            begin = rt;
        }
    }

    if (term != null && kit != null)
    {
        // Embed every other occurrence of the name found through the kit's token chain.
        for (Pullenti.Ner.Token t = kit.FirstToken; t != null; t = t.Next)
        {
            Pullenti.Ner.Core.TerminToken tok = term.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
            if (tok == null)
            {
                continue;
            }
            Pullenti.Ner.Token t0 = t;
            Pullenti.Ner.Token t1 = tok.EndToken;
            if (t1.Next != null && t1.Next.IsChar('.'))
            {
                t1 = t1.Next;
            }
            if (Pullenti.Ner.Core.BracketHelper.CanBeStartOfSequence(t0.Previous, false, false) && Pullenti.Ner.Core.BracketHelper.CanBeEndOfSequence(t1.Next, false, null, false))
            {
                // Include the surrounding brackets/quotes in the occurrence.
                t0 = t0.Previous;
                t1 = t1.Next;
            }
            Pullenti.Ner.ReferentToken rt = new Pullenti.Ner.ReferentToken(res, t0, t1);
            kit.EmbedToken(rt);
            t = rt;
        }
    }

    Pullenti.Ner.Titlepage.Internal.PersonRelations pr = new Pullenti.Ner.Titlepage.Internal.PersonRelations();
    // Short local names for the two role values used throughout the scan below.
    Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types noRole = Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types.Undefined;
    Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types workerRole = Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types.Worker;
    // Role announced by the most recent title item or typed person; applied to following persons.
    Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types persTyp = noRole;
    List<Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types> persTypes = pr.RelTypes;

    for (Pullenti.Ner.Token t = begin; t != null; t = t.Next)
    {
        if (maxCharPos > 0 && t.BeginChar > maxCharPos)
        {
            break;
        }
        if (t == nameRt)
        {
            continue;
        }

        Pullenti.Ner.Titlepage.Internal.TitleItemToken tpt = Pullenti.Ner.Titlepage.Internal.TitleItemToken.TryAttach(t);
        if (tpt != null)
        {
            persTyp = noRole;
            if (tpt.Typ == Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types.Typ)
            {
                if (res.Types.Count == 0)
                {
                    res.AddType(tpt.Value);
                }
                else if (res.Types.Count == 1)
                {
                    string knownType = res.Types[0].ToUpper();
                    if (knownType == "РЕФЕРАТ")
                    {
                        res.AddType(tpt.Value);
                    }
                    else if (knownType == "АВТОРЕФЕРАТ")
                    {
                        // "Автореферат" followed by a dissertation kind is stored as one combined type.
                        if (tpt.Value == "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ")
                        {
                            res.AddSlot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатской диссертации", true, 0);
                        }
                        else if (tpt.Value == "ДОКТОРСКАЯ ДИССЕРТАЦИЯ")
                        {
                            res.AddSlot(TitlePageReferent.ATTR_TYPE, "автореферат докторской диссертации", true, 0);
                        }
                        else if (tpt.Value == "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ")
                        {
                            res.AddSlot(TitlePageReferent.ATTR_TYPE, "автореферат магистерской диссертации", true, 0);
                        }
                        else if (tpt.Value == "КАНДИДАТСЬКА ДИСЕРТАЦІЯ")
                        {
                            res.AddSlot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатської дисертації", true, 0);
                        }
                        else if (tpt.Value == "ДОКТОРСЬКА ДИСЕРТАЦІЯ")
                        {
                            res.AddSlot(TitlePageReferent.ATTR_TYPE, "автореферат докторської дисертації", true, 0);
                        }
                        else if (tpt.Value == "МАГІСТЕРСЬКА ДИСЕРТАЦІЯ")
                        {
                            res.AddSlot(TitlePageReferent.ATTR_TYPE, "автореферат магістерської дисертації", true, 0);
                        }
                        else
                        {
                            res.AddType(tpt.Value);
                        }
                    }
                    else if (tpt.Value == "РЕФЕРАТ" || tpt.Value == "АВТОРЕФЕРАТ")
                    {
                        if (!knownType.Contains(tpt.Value))
                        {
                            res.AddType(tpt.Value);
                        }
                    }
                }
            }
            else if (tpt.Typ == Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types.Speciality)
            {
                if (res.Speciality == null)
                {
                    res.Speciality = tpt.Value;
                }
            }
            else if (persTypes.Contains(tpt.Typ))
            {
                persTyp = tpt.Typ;
            }

            t = tpt.EndToken;
            if (t.EndChar > endToken.EndChar)
            {
                endToken = t;
            }
            if (t.Next != null && t.Next.IsCharOf(":-"))
            {
                t = t.Next;
            }
            continue;
        }

        if (t.EndChar > endChar)
        {
            break;
        }

        List<Pullenti.Ner.Referent> rli = t.GetReferents();
        if (rli == null)
        {
            continue;
        }

        // Skip entities introduced by "ИМЕНИ" / "ИМ" / "ИМ." on the same line.
        if (!t.IsNewlineBefore && (t.Previous is Pullenti.Ner.TextToken))
        {
            string prevTerm = ((Pullenti.Ner.TextToken)t.Previous).Term;
            if (prevTerm == "ИМЕНИ" || prevTerm == "ИМ")
            {
                continue;
            }
            if (prevTerm == "." && t.Previous.Previous != null && t.Previous.Previous.IsValue("ИМ", null))
            {
                continue;
            }
        }

        foreach (Pullenti.Ner.Referent r in rli)
        {
            if (r is Pullenti.Ner.Person.PersonReferent)
            {
                // Only the first referent of the token is considered as a person.
                if (r != rli[0])
                {
                    continue;
                }
                Pullenti.Ner.Person.PersonReferent p = r as Pullenti.Ner.Person.PersonReferent;

                // A full stop before the person cancels the role inherited from earlier tokens.
                if (persTyp != noRole && t.Previous != null && t.Previous.IsChar('.'))
                {
                    persTyp = noRole;
                }

                Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types typ = pr.CalcTypFromAttrs(p);
                if (typ != noRole)
                {
                    pr.Add(p, typ, 1);
                    persTyp = typ;
                }
                else if (persTyp != noRole)
                {
                    pr.Add(p, persTyp, 1);
                }
                else if (t.Previous != null && t.Previous.IsChar('©'))
                {
                    persTyp = workerRole;
                    pr.Add(p, persTyp, 1);
                }
                else
                {
                    // Look forward: a person followed by the title itself is taken as its worker.
                    for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
                    {
                        Pullenti.Ner.Referent rr = tt.GetReferent();
                        if (rr == res)
                        {
                            persTyp = workerRole;
                            break;
                        }
                        if (rr is Pullenti.Ner.Person.PersonReferent)
                        {
                            // Classify the person met by the scan (rr). The current person is already
                            // known to have no role on this path, so testing it here could never stop the scan.
                            if (pr.CalcTypFromAttrs(rr as Pullenti.Ner.Person.PersonReferent) != noRole)
                            {
                                break;
                            }
                            continue;
                        }
                        if (rr != null)
                        {
                            break;
                        }
                        Pullenti.Ner.Titlepage.Internal.TitleItemToken marker = Pullenti.Ner.Titlepage.Internal.TitleItemToken.TryAttach(tt);
                        if (marker != null)
                        {
                            if (marker.Typ != Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types.Typ && marker.Typ != Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types.TypAndTheme)
                            {
                                break;
                            }
                            tt = marker.EndToken;
                            if (tt.EndChar > endToken.EndChar)
                            {
                                endToken = tt;
                            }
                            continue;
                        }
                    }

                    if (persTyp == noRole)
                    {
                        // Look backward for the title or for a word naming a student/executor.
                        for (Pullenti.Ner.Token tt = t.Previous; tt != null; tt = tt.Previous)
                        {
                            Pullenti.Ner.Referent rr = tt.GetReferent();
                            if (rr == res)
                            {
                                persTyp = workerRole;
                                break;
                            }
                            if (rr != null)
                            {
                                break;
                            }
                            if (tt.IsValue("СТУДЕНТ", null) || tt.IsValue("СТУДЕНТКА", null) || tt.IsValue("СЛУШАТЕЛЬ", null) || tt.IsValue("ДИПЛОМНИК", null) || tt.IsValue("ИСПОЛНИТЕЛЬ", null))
                            {
                                persTyp = workerRole;
                                break;
                            }
                            Pullenti.Ner.Titlepage.Internal.TitleItemToken marker = Pullenti.Ner.Titlepage.Internal.TitleItemToken.TryAttach(tt);
                            if (marker != null && marker.Typ != Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types.Typ)
                            {
                                break;
                            }
                        }
                    }

                    // A person whose role stays unknown is registered with half the weight.
                    if (persTyp != noRole)
                    {
                        pr.Add(p, persTyp, 1);
                    }
                    else
                    {
                        pr.Add(p, persTyp, 0.5F);
                    }
                    if (t.EndChar > endToken.EndChar)
                    {
                        endToken = t;
                    }
                }
                continue;
            }

            // Any non-person entity in first position ends the current role context.
            if (r == rli[0])
            {
                persTyp = noRole;
            }

            if (r is Pullenti.Ner.Date.DateReferent)
            {
                if (res.Date == null)
                {
                    res.Date = r as Pullenti.Ner.Date.DateReferent;
                    if (t.EndChar > endToken.EndChar)
                    {
                        endToken = t;
                    }
                }
            }
            else if (r is Pullenti.Ner.Geo.GeoReferent)
            {
                if (res.City == null && (r as Pullenti.Ner.Geo.GeoReferent).IsCity)
                {
                    res.City = r as Pullenti.Ner.Geo.GeoReferent;
                    if (t.EndChar > endToken.EndChar)
                    {
                        endToken = t;
                    }
                }
            }

            if (r is Pullenti.Ner.Org.OrganizationReferent)
            {
                Pullenti.Ner.Org.OrganizationReferent org = r as Pullenti.Ner.Org.OrganizationReferent;
                if (org.Types.Contains("курс") && org.Number != null)
                {
                    int year;
                    if (int.TryParse(org.Number, out year) && year > 0 && year < 8)
                    {
                        res.StudentYear = year;
                    }
                }
                // Climb from departments to the first enclosing non-department organization.
                for (; org.Higher != null; org = org.Higher)
                {
                    if (org.Kind != Pullenti.Ner.Org.OrganizationKind.Department)
                    {
                        break;
                    }
                }
                if (org.Kind != Pullenti.Ner.Org.OrganizationKind.Department)
                {
                    if (res.Org == null || Pullenti.Ner.Org.OrganizationReferent.CanBeHigher(res.Org, org))
                    {
                        res.Org = org;
                    }
                }
                if (t.EndChar > endToken.EndChar)
                {
                    endToken = t;
                }
            }

            if ((r is Pullenti.Ner.Uri.UriReferent) || (r is Pullenti.Ner.Geo.GeoReferent))
            {
                if (t.EndChar > endToken.EndChar)
                {
                    endToken = t;
                }
            }
        }
    }

    // Copy the collected persons into the slots that correspond to their roles.
    foreach (Pullenti.Ner.Titlepage.Internal.TitleItemToken.Types role in persTypes)
    {
        foreach (Pullenti.Ner.Person.PersonReferent person in pr.GetPersons(role))
        {
            if (pr.GetAttrNameForType(role) != null)
            {
                res.AddSlot(pr.GetAttrNameForType(role), person, false, 0);
            }
        }
    }

    // No author found: fall back to the first person without a role.
    if (res.GetSlotValue(TitlePageReferent.ATTR_AUTHOR) == null)
    {
        foreach (Pullenti.Ner.Person.PersonReferent person in pr.GetPersons(noRole))
        {
            res.AddSlot(TitlePageReferent.ATTR_AUTHOR, person, false, 0);
            break;
        }
    }

    // No city found: take it from the organization's geo slot when that is a city.
    if (res.City == null && res.Org != null)
    {
        Pullenti.Ner.Slot geoSlot = res.Org.FindSlot(Pullenti.Ner.Org.OrganizationReferent.ATTR_GEO, null, true);
        if (geoSlot != null && (geoSlot.Value is Pullenti.Ner.Geo.GeoReferent))
        {
            if ((geoSlot.Value as Pullenti.Ner.Geo.GeoReferent).IsCity)
            {
                res.City = geoSlot.Value as Pullenti.Ner.Geo.GeoReferent;
            }
        }
    }

    // No date found: try to parse one right after a geo entity (optionally after ':', ',' or a hyphen).
    if (res.Date == null)
    {
        for (Pullenti.Ner.Token t = begin; t != null && t.EndChar <= endChar; t = t.Next)
        {
            if (!(t.GetReferent() is Pullenti.Ner.Geo.GeoReferent))
            {
                continue;
            }
            if (t.Next is Pullenti.Ner.TextToken)
            {
                if (t.Next.IsCharOf(":,") || t.Next.IsHiphen)
                {
                    t = t.Next;
                }
            }
            if (t.Next == null)
            {
                // Nothing follows, so there is no date to parse and the chain ends here.
                break;
            }
            Pullenti.Ner.ReferentToken rt = t.Kit.ProcessReferent(Pullenti.Ner.Date.DateAnalyzer.ANALYZER_NAME, t.Next);
            if (rt != null)
            {
                rt.SaveToLocalOntology();
                res.Date = rt.Referent as Pullenti.Ner.Date.DateReferent;
                if (kit != null)
                {
                    kit.EmbedToken(rt);
                }
                break;
            }
        }
    }

    return res.Slots.Count == 0 ? null : res;
}