/// <summary>
/// Scores a "list" link between <c>From</c> and <c>To</c> and stores the score in <c>Coef</c>.
/// Returns without assigning <c>Coef</c> when the two items cannot be linked this way.
/// </summary>
void _calcList()
{
    // The source case is read before any other check, as in the original statement order.
    Pullenti.Morph.MorphCase fromCase = FromMorph.Case;
    if (To == null)
    {
        return;
    }

    bool typesDiffer = From.Source.Typ != To.Source.Typ;
    if (typesDiffer)
    {
        // Items of different types are accepted only when they share the preposition and
        // each of them is a Noun, PartBefore or PartAfter item.
        if (From.Source.Prep != To.Source.Prep)
        {
            return;
        }
        SentItemType fromTyp = From.Source.Typ;
        if (fromTyp != SentItemType.Noun && fromTyp != SentItemType.PartBefore && fromTyp != SentItemType.PartAfter)
        {
            return;
        }
        SentItemType toTyp = To.Source.Typ;
        if (toTyp != SentItemType.Noun && toTyp != SentItemType.PartBefore && toTyp != SentItemType.PartAfter)
        {
            return;
        }
    }

    Pullenti.Morph.MorphCase toCase = ToMorph.Case;
    bool fromHasPrep = !string.IsNullOrEmpty(FromPrep);
    bool toHasPrep = !string.IsNullOrEmpty(To.Source.Prep);

    if (((fromCase & toCase)).IsUndefined)
    {
        // No common case: reject when both cases are known, or when only the source has a preposition.
        if (!fromCase.IsUndefined && !toCase.IsUndefined)
        {
            return;
        }
        if (fromHasPrep && !toHasPrep)
        {
            return;
        }
        Coef = Pullenti.Semantic.SemanticService.Params.List;
    }
    else
    {
        // Common case found: start from the base weight and penalise a preposition mismatch.
        Coef = Pullenti.Semantic.SemanticService.Params.List;
        if (!fromHasPrep && toHasPrep)
        {
            Coef /= 2;
        }
        else if (fromHasPrep && !toHasPrep)
        {
            Coef /= 4;
        }
    }

    // Boost the score when the last word of the source matches the singular normal form
    // of the last word of the target.
    Pullenti.Ner.TextToken fromEnd = From.Source.EndToken as Pullenti.Ner.TextToken;
    Pullenti.Ner.TextToken toEnd = To.Source.EndToken as Pullenti.Ner.TextToken;
    if (fromEnd != null && toEnd != null)
    {
        string toNormal = toEnd.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
        if (fromEnd.IsValue(toNormal, null))
        {
            Coef *= 10;
        }
    }

    if (typesDiffer)
    {
        Coef /= 2;
    }
}
/// <summary>
/// Checks whether a sequence may start at this token, the token itself being
/// an opening bracket or an opening quote.
/// </summary>
/// <param name="t">token to check</param>
/// <param name="quotesOnly">accept only quotes, not brackets</param>
/// <param name="ignoreWhitespaces">when true, the checks based on the whitespace around the token are skipped</param>
/// <returns>true when the token can open a sequence</returns>
public static bool CanBeStartOfSequence(Pullenti.Ner.Token t, bool quotesOnly = false, bool ignoreWhitespaces = false)
{
    Pullenti.Ner.TextToken textTok = t as Pullenti.Ner.TextToken;
    if (textTok == null || textTok.Next == null)
    {
        return false;
    }

    char first = textTok.Term[0];
    if (char.IsLetterOrDigit(first))
    {
        return false;
    }
    if (quotesOnly && (m_Quotes.IndexOf(first) < 0))
    {
        return false;
    }
    if (m_OpenChars.IndexOf(first) < 0)
    {
        return false;
    }
    if (ignoreWhitespaces)
    {
        return true;
    }

    if (t.IsWhitespaceAfter)
    {
        // Whitespace after but none before: accepted only right after a table control character.
        if (!t.IsWhitespaceBefore)
        {
            bool afterTableChar = t.Previous != null && t.Previous.IsTableControlChar;
            if (!afterTableChar)
            {
                return false;
            }
        }
        return !t.IsNewlineAfter;
    }

    if (t.IsWhitespaceBefore)
    {
        return true;
    }

    // Glued on both sides: when the preceding character is a letter or digit and the next
    // token is lower-case or not a letter, only '(' is accepted.
    if (!char.IsLetterOrDigit(t.Kit.GetTextCharacter(t.BeginChar - 1)))
    {
        return true;
    }
    Pullenti.Ner.Token following = t.Next;
    if (following != null && (following.Chars.IsAllLower || !following.Chars.IsLetter))
    {
        return first == '(';
    }
    return true;
}
/// <summary>
/// Gets the information about the word form of a token.
/// </summary>
/// <param name="t">token</param>
/// <returns>the item found by <c>FindItem</c> for a text token; null for any other token</returns>
public StatisticWordInfo GetWordInfo(Pullenti.Ner.Token t)
{
    Pullenti.Ner.TextToken word = t as Pullenti.Ner.TextToken;
    return word != null ? this.FindItem(word, true) : null;
}
/// <summary>
/// Reads a gender value at the token and stores it in the person referent.
/// </summary>
/// <param name="pr">person whose <c>IsMale</c> / <c>IsFemale</c> flag is set on success</param>
/// <param name="t">token to start from</param>
/// <returns>the last token consumed, or null when no gender value was recognised</returns>
public static Pullenti.Ner.Token CreateSex(Pullenti.Ner.Person.PersonReferent pr, Pullenti.Ner.Token t)
{
    if (t == null)
    {
        return null;
    }

    // Skip a leading "ПОЛ" keyword, hyphens and colons, but never step past the last token.
    Pullenti.Ner.Token cur = t;
    while (cur.Next != null && (cur.IsValue("ПОЛ", null) || cur.IsHiphen || cur.IsChar(':')))
    {
        cur = cur.Next;
    }

    Pullenti.Ner.TextToken word = cur as Pullenti.Ner.TextToken;
    if (word == null)
    {
        return null;
    }

    if ((word.Term == "МУЖ" || word.Term == "МУЖС" || word.Term == "МУЖСК") || word.IsValue("МУЖСКОЙ", null))
    {
        pr.IsMale = true;
    }
    else if ((word.Term == "ЖЕН" || word.Term == "ЖЕНС" || word.Term == "ЖЕНСК") || word.IsValue("ЖЕНСКИЙ", null))
    {
        pr.IsFemale = true;
    }
    else
    {
        return null;
    }

    // Absorb a trailing "ПОЛ" keyword and abbreviation dots.
    while (cur.Next != null && (cur.Next.IsValue("ПОЛ", null) || cur.Next.IsChar('.')))
    {
        cur = cur.Next;
    }
    return cur;
}
/// <summary>
/// Checks whether the term of a text token ends with the letter used as a multiplication
/// sign: Latin 'X' or its Cyrillic look-alike 'Х'.
/// </summary>
/// <param name="t">token to check</param>
/// <returns>true for a text token whose term ends with one of the two letters; otherwise false</returns>
public static bool IsMultCharEnd(Pullenti.Ner.Token t)
{
    Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
    if (tt == null)
    {
        return false;
    }

    string term = tt.Term;
    if (string.IsNullOrEmpty(term))
    {
        return false;
    }

    // Compare the last UTF-16 unit directly. The string overload of EndsWith is
    // culture-sensitive, which is neither needed nor desirable for a fixed symbol check.
    char last = term[term.Length - 1];
    return last == 'X' || last == 'Х';
}
/// <summary>
/// Checks whether the token is a single character from the set <c>*xXхХ·×◦∙•</c>.
/// </summary>
/// <param name="t">token to check</param>
/// <returns>true for a one-character text token from that set; otherwise false</returns>
public static bool IsMultChar(Pullenti.Ner.Token t)
{
    Pullenti.Ner.TextToken sym = t as Pullenti.Ner.TextToken;
    return sym != null && sym.LengthChar == 1 && sym.IsCharOf("*xXхХ·×◦∙•");
}
/// <summary>
/// Attaches the termins of this collection at a token using similarity matching.
/// </summary>
/// <param name="token">token at which matching starts</param>
/// <param name="simD">similarity parameter (0.05..1); a value at or above 1, or below 0.05,
/// falls back to the ordinary <c>TryParseAll</c></param>
/// <returns>the matches covering the largest number of tokens, or null when nothing matched</returns>
public List<TerminToken> TryParseAllSim(Pullenti.Ner.Token token, double simD)
{
    if (simD >= 1 || (simD < 0.05))
    {
        return this.TryParseAll(token, TerminParseAttr.No);
    }
    if (Termins.Count == 0 || token == null)
    {
        return null;
    }

    // Matching starts from a text token: the token itself or the first token of a referent token.
    Pullenti.Ner.TextToken start = token as Pullenti.Ner.TextToken;
    if (start == null)
    {
        Pullenti.Ner.ReferentToken refTok = token as Pullenti.Ner.ReferentToken;
        if (refTok != null)
        {
            start = refTok.BeginToken as Pullenti.Ner.TextToken;
        }
    }

    List<TerminToken> best = null;
    foreach (Termin termin in Termins)
    {
        // Skip termins whose language is known and has nothing in common with the token's language.
        bool bothLangsKnown = !termin.Lang.IsUndefined && !token.Morph.Language.IsUndefined;
        if (bothLangsKnown && ((token.Morph.Language & termin.Lang)).IsUndefined)
        {
            continue;
        }

        TerminToken hit = termin.TryParseSim(start, simD, TerminParseAttr.No);
        if (hit == null)
        {
            continue;
        }
        hit.Termin = termin;

        if (best == null || hit.TokensCount > best[0].TokensCount)
        {
            // A longer match replaces everything collected so far.
            best = new List<TerminToken>();
            best.Add(hit);
        }
        else if (hit.TokensCount == best[0].TokensCount)
        {
            best.Add(hit);
        }
    }
    return best;
}
/// <summary>
/// Compares the last tokens of two list items to decide whether the items look alike.
/// </summary>
/// <param name="mt1">first item</param>
/// <param name="mt2">second item</param>
/// <returns>
/// true when either end token is not a text token, when the end words share at least two trailing
/// characters, when the first word matches the singular normal form of the second, or when their
/// morphology is compatible; otherwise false
/// </returns>
static bool _compareListItemTails(Pullenti.Ner.MetaToken mt1, Pullenti.Ner.MetaToken mt2)
{
    Pullenti.Ner.TextToken last1 = mt1.EndToken as Pullenti.Ner.TextToken;
    Pullenti.Ner.TextToken last2 = mt2.EndToken as Pullenti.Ner.TextToken;
    if (last1 == null || last2 == null)
    {
        return true;
    }

    // Count equal characters walking from the ends; index 0 of either word is never compared.
    string term1 = last1.Term;
    string term2 = last2.Term;
    int common = 0;
    int p1 = term1.Length - 1;
    int p2 = term2.Length - 1;
    while (p1 > 0 && p2 > 0 && term1[p1] == term2[p2])
    {
        p1--;
        p2--;
        common++;
    }
    if (common >= 2)
    {
        return true;
    }

    string normal2 = last2.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false);
    if (last1.IsValue(normal2, null))
    {
        return true;
    }

    // Otherwise number and case must intersect.
    if (((last1.Morph.Number & last2.Morph.Number)) == Pullenti.Morph.MorphNumber.Undefined)
    {
        return false;
    }
    if (((last1.Morph.Case & last2.Morph.Case)).IsUndefined)
    {
        return false;
    }

    // Rejected only when the words differ both in being a verb and in being an adjective.
    bool verbDiffers = last1.Morph.Class.IsVerb != last2.Morph.Class.IsVerb;
    bool adjectiveDiffers = last1.Morph.Class.IsAdjective != last2.Morph.Class.IsAdjective;
    return !(verbDiffers && adjectiveDiffers);
}
/// <summary>
/// Reads one token from the stream. A leading type code selects the token class:
/// 0 - no token, 1 - TextToken, 2 - NumberToken, 3 - ReferentToken, anything else - MetaToken.
/// For meta tokens (including the derived classes) the inner token chain is read as well.
/// </summary>
/// <param name="stream">source stream</param>
/// <param name="kit">analysis kit passed to the created tokens</param>
/// <param name="vers">version value forwarded to the token deserializers</param>
/// <returns>the token read, or null when the type code is 0</returns>
static Pullenti.Ner.Token DeserializeToken(Stream stream, Pullenti.Ner.Core.AnalysisKit kit, int vers)
{
    short typ = DeserializeShort(stream);
    if (typ == 0)
    {
        return null;
    }

    Pullenti.Ner.Token tok;
    switch (typ)
    {
        case 1:
            tok = new Pullenti.Ner.TextToken(null, kit);
            break;
        case 2:
            tok = new Pullenti.Ner.NumberToken(null, null, null, Pullenti.Ner.NumberSpellingType.Digit, kit);
            break;
        case 3:
            tok = new Pullenti.Ner.ReferentToken(null, null, null, kit);
            break;
        default:
            tok = new Pullenti.Ner.MetaToken(null, null, kit);
            break;
    }
    tok.Deserialize(stream, kit, vers);

    Pullenti.Ner.MetaToken meta = tok as Pullenti.Ner.MetaToken;
    if (meta == null)
    {
        return tok;
    }

    // The inner chain follows the token's own data: its head becomes the begin token,
    // and the end token is advanced to the tail of the chain.
    Pullenti.Ner.Token inner = DeserializeTokens(stream, kit, vers);
    if (inner != null)
    {
        meta.m_BeginToken = inner;
        for (Pullenti.Ner.Token cur = inner; cur != null; cur = cur.Next)
        {
            meta.m_EndToken = cur;
        }
    }
    return tok;
}
/// <summary>
/// Determines the base language from the text tokens: counts how often each language value
/// occurs among tokens with a defined language, and stores in <c>BaseLanguage.Value</c> the
/// values that occur in more than half of those tokens (integer division).
/// </summary>
void DefineBaseLanguage()
{
    Dictionary<short, int> counts = new Dictionary<short, int>();
    int known = 0;
    for (Pullenti.Ner.Token cur = FirstToken; cur != null; cur = cur.Next)
    {
        Pullenti.Ner.TextToken word = cur as Pullenti.Ner.TextToken;
        if (word == null || word.Morph.Language.IsUndefined)
        {
            continue;
        }

        short key = word.Morph.Language.Value;
        int seen;
        // TryGetValue leaves 'seen' at 0 for a language value not met before.
        counts.TryGetValue(key, out seen);
        counts[key] = seen + 1;
        known++;
    }

    int half = known / 2;
    short mask = (short)0;
    foreach (KeyValuePair<short, int> entry in counts)
    {
        if (entry.Value > half)
        {
            mask |= entry.Key;
        }
    }
    BaseLanguage.Value = mask;
}
/// <summary>
/// Prepends a hyphenated prefix word to <c>NormalValue</c> and, when it is set, to
/// <c>SingleNumberValue</c>. The prefix text is the normal form of the first morphological
/// variant of the token that has the same class as this object and passes <c>CheckAccord</c>;
/// when there is no such variant, the token's term is used.
/// </summary>
/// <param name="t">prefix token; nothing is done when it is null</param>
/// <param name="ignoreGender">forwarded to <c>CheckAccord</c></param>
public void CorrectPrefix(Pullenti.Ner.TextToken t, bool ignoreGender)
{
    if (t == null)
    {
        return;
    }

    string prefix = t.Term;
    string singlePrefix = t.Term;
    foreach (Pullenti.Morph.MorphBaseInfo v in t.Morph.Items)
    {
        if (v.Class == Class && this.CheckAccord(v, ignoreGender, false))
        {
            // Only a word form carries normal forms. A matching item of another kind is
            // skipped instead of being dereferenced through a failed 'as' cast.
            Pullenti.Morph.MorphWordForm wf = v as Pullenti.Morph.MorphWordForm;
            if (wf == null)
            {
                continue;
            }
            prefix = wf.NormalCase;
            singlePrefix = wf.NormalFull ?? wf.NormalCase;
            break;
        }
    }

    NormalValue = string.Format("{0}-{1}", prefix, NormalValue);
    if (SingleNumberValue != null)
    {
        SingleNumberValue = string.Format("{0}-{1}", singlePrefix, SingleNumberValue);
    }
}
/// <summary>
/// Rates how abnormal a token looks.
/// </summary>
/// <param name="t">token to rate</param>
/// <returns>
/// 0 for numbers, non-text tokens, non-letter tokens and words shorter than 4 characters;
/// 2 for letter tokens that are neither Latin nor Cyrillic, and for unknown words longer than 15 characters;
/// -1 when some morphological variant of the word is in the dictionary;
/// 1 for the remaining unknown words
/// </returns>
static int CalcAbnormalCoef(Pullenti.Ner.Token t)
{
    if (t is Pullenti.Ner.NumberToken)
    {
        return 0;
    }
    Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
    if (tt == null)
    {
        return 0;
    }
    if (!tt.Chars.IsLetter)
    {
        return 0;
    }
    if (!tt.Chars.IsLatinLetter && !tt.Chars.IsCyrillicLetter)
    {
        return 2;
    }
    if (tt.LengthChar < 4)
    {
        return 0;
    }

    foreach (Pullenti.Morph.MorphBaseInfo item in tt.Morph.Items)
    {
        // Items that are not word forms have no dictionary flag; skip them rather than
        // dereferencing the null produced by the 'as' cast.
        Pullenti.Morph.MorphWordForm wf = item as Pullenti.Morph.MorphWordForm;
        if (wf != null && wf.IsInDictionary)
        {
            return -1;
        }
    }

    return tt.LengthChar > 15 ? 2 : 1;
}
/// <summary>
/// Tries to parse a measurement unit starting at the token.
/// </summary>
/// <param name="t">token to start from; null gives null</param>
/// <param name="addUnits">optional collection of additional units, consulted after the built-in checks fail</param>
/// <param name="prev">the unit parsed just before this one, or null; enables the leading "В" handling and the "м" = minute rule</param>
/// <param name="parseUnknownUnits">when true, a short unknown word may be returned as a doubtful unit</param>
/// <returns>the parsed unit token, or null</returns>
/// <remarks>
/// Steps, in the order they appear in the code:
/// 1. A leading '\' or '/', "НА", "OF", "PER" (or "В" when prev is not null) is skipped and marks the power as
///    negative; a leading multiplication character (MeasureHelper.IsMultChar) is skipped. The next token must be
///    a text token.
/// 2. "КВ" / "КВАДР" / "КВАДРАТНЫЙ" set the power to 2, "КУБ" / "КУБИЧ" / "КУБИЧЕСКИЙ" set it to 3; an optional
///    '.' after them is skipped. For "µ" the rest is parsed recursively and the result is replaced by the unit
///    with the Micro factor whose name with the "мк" prefix equals the parsed unit's Cyrillic name.
/// 3. UnitsHelper.Termins is queried. With matches: null is returned when prev is set, the match starts at the
///    original token, there is a single match and the token has whitespace before it; null is also returned for
///    a single-token match that is a preposition (fewer than 3 whitespaces after) when a prepositional noun
///    phrase parses there, when the next token is a number not written with digits, or when a unit can be parsed
///    from the next token. A lower-case "м" / "m" after a unit of kind Length is returned as UnitsHelper.uMinute.
///    Otherwise a candidate is built per match (a unit with BaseMultiplier 1000000 written with a lower-case
///    first letter is replaced by the same-named unit with the Milli factor) and the result is the candidate
///    with a keyword having the greatest BeginChar, else the first non-doubtful one, else the first.
/// 4. A degree sign ('º', '°', or an "O" / zero between '&lt;' and '&gt;') gives UnitsHelper.uGradus, refined to
///    uGradusC or uGradusF by a following "C" / "С" / "ЦЕЛЬС…" or "F" / "ФАР…" (after an optional comma and "ПО").
/// 5. "ОС" / "OC" written exactly as "оС" / "oC" gives uGradusC.
/// 6. '%' gives UnitsHelper.uAlco when followed (optionally inside brackets) by a word starting with "ОБ",
///    otherwise UnitsHelper.uPercent.
/// 7. addUnits, when supplied, is tried; a match gives a token with ExtOnto set.
/// 8. With parseUnknownUnits, a letter-only text token of at most 5 characters, with at most 2 whitespaces before
///    it, that cannot start a sentence and is followed by the end of text, more than 2 whitespaces, a comma,
///    a slash, a table control character or a multiplication character is returned as a doubtful unit with
///    UnknownName set.
///
/// NOTE(review): in step 8 the statement `t1 = t` inside `if (t.Next != null &amp;&amp; t.Next.IsChar('.'))` changes
/// nothing because t1 already equals t - possibly `t.Next` was intended; confirm.
/// NOTE(review): in step 8 the check `t.LengthChar > 7` cannot be true, since tokens longer than 5 characters
/// were rejected just above.
/// NOTE(review): in step 7 `tok.EndToken` is extended over a trailing '.' after `res` has been constructed from
/// it, so the returned token does not appear to include the dot; confirm whether `res.EndToken` was intended.
/// NOTE(review): in step 8 the result gets `Pow = pow`; the negative-power flag is not applied on this path.
/// </remarks>
public static UnitToken TryParse(Pullenti.Ner.Token t, Pullenti.Ner.Core.TerminCollection addUnits, UnitToken prev, bool parseUnknownUnits = false) { if (t == null) { return(null); } Pullenti.Ner.Token t0 = t; int pow = 1; bool isNeg = false; if ((t.IsCharOf("\\/") || t.IsValue("НА", null) || t.IsValue("OF", null)) || t.IsValue("PER", null)) { isNeg = true; t = t.Next; } else if (t.IsValue("В", null) && prev != null) { isNeg = true; t = t.Next; } else if (MeasureHelper.IsMultChar(t)) { t = t.Next; } Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken; if (tt == null) { return(null); } if (tt.Term == "КВ" || tt.Term == "КВАДР" || tt.IsValue("КВАДРАТНЫЙ", null)) { pow = 2; tt = tt.Next as Pullenti.Ner.TextToken; if (tt != null && tt.IsChar('.')) { tt = tt.Next as Pullenti.Ner.TextToken; } if (tt == null) { return(null); } } else if (tt.Term == "КУБ" || tt.Term == "КУБИЧ" || tt.IsValue("КУБИЧЕСКИЙ", null)) { pow = 3; tt = tt.Next as Pullenti.Ner.TextToken; if (tt != null && tt.IsChar('.')) { tt = tt.Next as Pullenti.Ner.TextToken; } if (tt == null) { return(null); } } else if (tt.Term == "µ") { UnitToken res = TryParse(tt.Next, addUnits, prev, false); if (res != null) { foreach (Unit u in UnitsHelper.Units) { if (u.Factor == UnitsFactors.Micro && string.Compare("мк" + u.NameCyr, res.Unit.NameCyr, true) == 0) { res.Unit = u; res.BeginToken = tt; res.Pow = pow; if (isNeg) { res.Pow = -pow; } return(res); } } } } List <Pullenti.Ner.Core.TerminToken> toks = UnitsHelper.Termins.TryParseAll(tt, Pullenti.Ner.Core.TerminParseAttr.No); if (toks != null) { if ((prev != null && tt == t0 && toks.Count == 1) && t.IsWhitespaceBefore) { return(null); } if (toks[0].BeginToken == toks[0].EndToken && tt.Morph.Class.IsPreposition && (tt.WhitespacesAfterCount < 3)) { if (Pullenti.Ner.Core.NounPhraseHelper.TryParse(tt, Pullenti.Ner.Core.NounPhraseParseAttr.ParsePreposition, 0, null) != null) { return(null); } if (tt.Next is Pullenti.Ner.NumberToken) { if ((tt.Next as 
Pullenti.Ner.NumberToken).Typ != Pullenti.Ner.NumberSpellingType.Digit) { return(null); } } UnitToken nex = TryParse(tt.Next, addUnits, null, false); if (nex != null) { return(null); } } if (toks[0].BeginToken == toks[0].EndToken && ((toks[0].BeginToken.IsValue("М", null) || toks[0].BeginToken.IsValue("M", null))) && toks[0].BeginToken.Chars.IsAllLower) { if (prev != null && prev.Unit != null && prev.Unit.Kind == Pullenti.Ner.Measure.MeasureKind.Length) { UnitToken res = new UnitToken(t0, toks[0].EndToken) { Unit = UnitsHelper.uMinute }; res.Pow = pow; if (isNeg) { res.Pow = -pow; } return(res); } } List <UnitToken> uts = new List <UnitToken>(); foreach (Pullenti.Ner.Core.TerminToken tok in toks) { UnitToken res = new UnitToken(t0, tok.EndToken) { Unit = tok.Termin.Tag as Unit }; res.Pow = pow; if (isNeg) { res.Pow = -pow; } if (res.Unit.BaseMultiplier == 1000000 && (t0 is Pullenti.Ner.TextToken) && char.IsLower((t0 as Pullenti.Ner.TextToken).GetSourceText()[0])) { foreach (Unit u in UnitsHelper.Units) { if (u.Factor == UnitsFactors.Milli && string.Compare(u.NameCyr, res.Unit.NameCyr, true) == 0) { res.Unit = u; break; } } } res._correct(); res._checkDoubt(); uts.Add(res); } int max = 0; UnitToken best = null; foreach (UnitToken ut in uts) { if (ut.Keyword != null) { if (ut.Keyword.BeginChar >= max) { max = ut.Keyword.BeginChar; best = ut; } } } if (best != null) { return(best); } foreach (UnitToken ut in uts) { if (!ut.IsDoubt) { return(ut); } } return(uts[0]); } Pullenti.Ner.Token t1 = null; if (t.IsCharOf("º°")) { t1 = t; } else if ((t.IsChar('<') && t.Next != null && t.Next.Next != null) && t.Next.Next.IsChar('>') && ((t.Next.IsValue("О", null) || t.Next.IsValue("O", null) || (((t.Next is Pullenti.Ner.NumberToken) && (t.Next as Pullenti.Ner.NumberToken).Value == "0"))))) { t1 = t.Next.Next; } if (t1 != null) { UnitToken res = new UnitToken(t0, t1) { Unit = UnitsHelper.uGradus }; res._checkDoubt(); t = t1.Next; if (t != null && t.IsComma) { t = t.Next; } if (t 
!= null && t.IsValue("ПО", null)) { t = t.Next; } if (t is Pullenti.Ner.TextToken) { string vv = (t as Pullenti.Ner.TextToken).Term; if (vv == "C" || vv == "С" || vv.StartsWith("ЦЕЛЬС")) { res.Unit = UnitsHelper.uGradusC; res.IsDoubt = false; res.EndToken = t; } if (vv == "F" || vv.StartsWith("ФАР")) { res.Unit = UnitsHelper.uGradusF; res.IsDoubt = false; res.EndToken = t; } } return(res); } if ((t is Pullenti.Ner.TextToken) && ((t.IsValue("ОС", null) || t.IsValue("OC", null)))) { string str = t.GetSourceText(); if (str == "оС" || str == "oC") { UnitToken res = new UnitToken(t, t) { Unit = UnitsHelper.uGradusC, IsDoubt = false }; return(res); } } if (t.IsChar('%')) { Pullenti.Ner.Token tt1 = t.Next; if (tt1 != null && tt1.IsChar('(')) { tt1 = tt1.Next; } if ((tt1 is Pullenti.Ner.TextToken) && (tt1 as Pullenti.Ner.TextToken).Term.StartsWith("ОБ")) { UnitToken re = new UnitToken(t, tt1) { Unit = UnitsHelper.uAlco }; if (re.EndToken.Next != null && re.EndToken.Next.IsChar('.')) { re.EndToken = re.EndToken.Next; } if (re.EndToken.Next != null && re.EndToken.Next.IsChar(')') && t.Next.IsChar('(')) { re.EndToken = re.EndToken.Next; } return(re); } return(new UnitToken(t, t) { Unit = UnitsHelper.uPercent }); } if (addUnits != null) { Pullenti.Ner.Core.TerminToken tok = addUnits.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No); if (tok != null) { UnitToken res = new UnitToken(t0, tok.EndToken) { ExtOnto = tok.Termin.Tag as Pullenti.Ner.Measure.UnitReferent }; if (tok.EndToken.Next != null && tok.EndToken.Next.IsChar('.')) { tok.EndToken = tok.EndToken.Next; } res.Pow = pow; if (isNeg) { res.Pow = -pow; } res._correct(); return(res); } } if (!parseUnknownUnits) { return(null); } if ((t.WhitespacesBeforeCount > 2 || !t.Chars.IsLetter || t.LengthChar > 5) || !(t is Pullenti.Ner.TextToken)) { return(null); } if (Pullenti.Ner.Core.MiscHelper.CanBeStartOfSentence(t)) { return(null); } t1 = t; if (t.Next != null && t.Next.IsChar('.')) { t1 = t; } bool ok = false; if (t1.Next == 
null || t1.WhitespacesAfterCount > 2) { ok = true; } else if (t1.Next.IsComma || t1.Next.IsCharOf("\\/") || t1.Next.IsTableControlChar) { ok = true; } else if (MeasureHelper.IsMultChar(t1.Next)) { ok = true; } if (!ok) { return(null); } Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary(); if (mc.IsUndefined) { } else if (t.LengthChar > 7) { return(null); } UnitToken res1 = new UnitToken(t0, t1) { Pow = pow, IsDoubt = true }; res1.UnknownName = (t as Pullenti.Ner.TextToken).GetSourceText(); res1._correct(); return(res1); }
/// <summary>
/// Splits the text starting at a token into person-name items: initials, the conjunction "and",
/// lower-case words, proper names and surnames (including hyphenated double surnames).
/// Scanning stops at a line break after the first item, at a non-text token, at a token that does
/// not start with a letter, at a pronoun, or at a lower-case preposition, verb or adverb.
/// </summary>
/// <param name="t">token to start from</param>
/// <returns>the collected items, or null when there are none</returns>
public static List<PersonItemToken> TryAttach(Pullenti.Ner.Token t)
{
    List<PersonItemToken> items = new List<PersonItemToken>();
    Pullenti.Ner.Token cur = t;
    while (cur != null)
    {
        if (cur.IsNewlineBefore && items.Count > 0)
        {
            break;
        }
        Pullenti.Ner.TextToken word = cur as Pullenti.Ner.TextToken;
        if (word == null)
        {
            break;
        }
        string term = word.Term;
        if (!char.IsLetter(term[0]))
        {
            break;
        }

        // 'last' is the final token covered by the item built in this iteration.
        Pullenti.Ner.Token last = cur;
        bool lower = word.Chars.IsAllLower;
        PersonItemToken item;

        if ((term.Length == 1 || term == "ДЖ") && !lower)
        {
            // An initial, with its dot when present.
            if (cur.Next != null && cur.Next.IsChar('.'))
            {
                last = cur.Next;
            }
            item = new PersonItemToken(cur, last);
            item.Typ = Pullenti.Ner.Org.Internal.OrgItemEponymToken.PersonItemType.Initial;
            item.Value = term;
        }
        else if (word.IsAnd)
        {
            item = new PersonItemToken(cur, cur);
            item.Typ = Pullenti.Ner.Org.Internal.OrgItemEponymToken.PersonItemType.And;
        }
        else if (word.Morph.Class.IsPronoun || word.Morph.Class.IsPersonalPronoun)
        {
            break;
        }
        else if (lower)
        {
            Pullenti.Morph.MorphClass dictClass = word.GetMorphClassInDictionary();
            if (dictClass.IsPreposition || dictClass.IsVerb || dictClass.IsAdverb)
            {
                break;
            }
            // A lower-case word; a dot glued to it is treated as part of an abbreviation.
            if (cur.Next != null && !cur.IsWhitespaceAfter && cur.Next.IsChar('.'))
            {
                last = cur.Next;
            }
            item = new PersonItemToken(cur, last);
            item.Typ = Pullenti.Ner.Org.Internal.OrgItemEponymToken.PersonItemType.LocaseWord;
            item.Value = term;
        }
        else if (word.Morph.Class.IsProperName)
        {
            item = new PersonItemToken(cur, cur);
            item.Typ = Pullenti.Ner.Org.Internal.OrgItemEponymToken.PersonItemType.Name;
            item.Value = term;
        }
        else if ((cur.Next != null && cur.Next.IsHiphen && (cur.Next.Next is Pullenti.Ner.TextToken)) && !cur.Next.IsWhitespaceAfter)
        {
            // Double surname written with a hyphen.
            last = cur.Next.Next;
            item = new PersonItemToken(cur, last);
            item.Typ = Pullenti.Ner.Org.Internal.OrgItemEponymToken.PersonItemType.Surname;
            item.Value = string.Format("{0}-{1}", term, (last as Pullenti.Ner.TextToken).Term);
        }
        else
        {
            item = new PersonItemToken(cur, cur);
            item.Typ = Pullenti.Ner.Org.Internal.OrgItemEponymToken.PersonItemType.Surname;
            item.Value = term;
        }

        items.Add(item);
        cur = last.Next;
    }
    return items.Count > 0 ? items : null;
}
/// <summary>
/// Tries to attach an eponym ("named after ...") at the token: a memorable date, an anniversary,
/// a person (surname, name with surname, initials with surname, a saint) or a geographic object.
/// </summary>
/// <param name="t">token to start from</param>
/// <param name="mustHasPrefix">when true, the eponym must be introduced by "ИМЕНИ" / "ІМЕНІ" / "ИМ" / "ІМ"</param>
/// <returns>the eponym token, or null when nothing was recognised</returns>
public static OrgItemEponymToken TryAttach(Pullenti.Ner.Token t, bool mustHasPrefix = false)
{
    Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
    if (tt == null)
    {
        // Not a text token: only a fixed set of dates, or an age followed by a capitalised
        // Cyrillic token, can be an eponym here.
        if (t == null)
        {
            return null;
        }
        Pullenti.Ner.Referent r1 = t.GetReferent();
        if (r1 != null && r1.TypeName == "DATE")
        {
            string str = r1.ToString().ToUpper();
            if ((str == "1 МАЯ" || str == "7 ОКТЯБРЯ" || str == "9 МАЯ") || str == "8 МАРТА")
            {
                OrgItemEponymToken dt = new OrgItemEponymToken(t, t) { Eponyms = new List<string>() };
                dt.Eponyms.Add(str);
                return dt;
            }
        }
        Pullenti.Ner.NumberToken age = Pullenti.Ner.Core.NumberHelper.TryParseAge(t);
        if ((age != null && (((age.EndToken.Next is Pullenti.Ner.TextToken) || (age.EndToken.Next is Pullenti.Ner.ReferentToken))) && (age.WhitespacesAfterCount < 3)) && !age.EndToken.Next.Chars.IsAllLower && age.EndToken.Next.Chars.IsCyrillicLetter)
        {
            OrgItemEponymToken dt = new OrgItemEponymToken(t, age.EndToken.Next) { Eponyms = new List<string>() };
            dt.Eponyms.Add(string.Format("{0} {1}", age.Value, dt.EndToken.GetSourceText().ToUpper()));
            return dt;
        }
        return null;
    }

    // Find the token the eponym itself starts at (t1).
    // 'full' marks an explicit "ИМЕНИ" or "ИМ." prefix; 'hasName' marks any "ИМЕНИ" / "ИМ" prefix.
    Pullenti.Ner.Token t1 = null;
    bool full = false;
    bool hasName = false;
    if (tt.Term == "ИМЕНИ" || tt.Term == "ІМЕНІ")
    {
        t1 = t.Next;
        full = true;
        hasName = true;
    }
    else if (((tt.Term == "ИМ" || tt.Term == "ІМ")) && tt.Next != null)
    {
        if (tt.Next.IsChar('.'))
        {
            t1 = tt.Next.Next;
            full = true;
        }
        else if ((tt.Next is Pullenti.Ner.TextToken) && tt.Chars.IsAllLower && !tt.Next.Chars.IsAllLower)
        {
            t1 = tt.Next;
        }
        hasName = true;
    }
    else if (tt.Previous != null && ((tt.Previous.IsValue("ФОНД", null) || tt.Previous.IsValue("ХРАМ", null) || tt.Previous.IsValue("ЦЕРКОВЬ", "ЦЕРКВА"))))
    {
        // No prefix, but the previous word is one of the listed nouns: the current capitalised
        // Cyrillic word may be the eponym itself.
        if ((!tt.Chars.IsCyrillicLetter || tt.Morph.Class.IsPreposition || tt.Morph.Class.IsConjunction) || !tt.Chars.IsLetter)
        {
            return null;
        }
        if (tt.WhitespacesBeforeCount != 1)
        {
            return null;
        }
        if (tt.Chars.IsAllLower)
        {
            return null;
        }
        if (tt.Morph.Class.IsAdjective)
        {
            Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(tt, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
            if (npt != null && npt.BeginToken != npt.EndToken)
            {
                return null;
            }
        }
        OrgItemNameToken na = OrgItemNameToken.TryAttach(tt, null, false, true);
        if (na != null)
        {
            if (na.IsEmptyWord || na.IsStdName || na.IsStdTail)
            {
                return null;
            }
        }
        t1 = tt;
    }

    if (t1 == null || ((t1.IsNewlineBefore && !full)))
    {
        return null;
    }
    if (tt.Previous != null && tt.Previous.Morph.Class.IsPreposition)
    {
        return null;
    }
    if (mustHasPrefix && !hasName)
    {
        return null;
    }

    // After an explicit prefix, a date with a day but without a year is an eponym by itself.
    Pullenti.Ner.Referent r = t1.GetReferent();
    if ((r != null && r.TypeName == "DATE" && full) && r.FindSlot("DAY", null, true) != null && r.FindSlot("YEAR", null, true) == null)
    {
        OrgItemEponymToken dt = new OrgItemEponymToken(t, t1) { Eponyms = new List<string>() };
        dt.Eponyms.Add(r.ToString().ToUpper());
        return dt;
    }

    // Optional "saint" marker, possibly abbreviated with a dot.
    bool holy = false;
    if ((t1.IsValue("СВЯТОЙ", null) || t1.IsValue("СВЯТИЙ", null) || t1.IsValue("СВ", null)) || t1.IsValue("СВЯТ", null))
    {
        t1 = t1.Next;
        holy = true;
        if (t1 != null && t1.IsChar('.'))
        {
            t1 = t1.Next;
        }
    }
    if (t1 == null)
    {
        return null;
    }

    // A dictionary noun or adjective may start a multi-token person: use its last name
    // when the person's end token matches it.
    Pullenti.Morph.MorphClass cl = t1.GetMorphClassInDictionary();
    if (cl.IsNoun || cl.IsAdjective)
    {
        Pullenti.Ner.ReferentToken rt = t1.Kit.ProcessReferent("PERSON", t1);
        if (rt != null && rt.Referent.TypeName == "PERSON" && rt.BeginToken != rt.EndToken)
        {
            string e = rt.Referent.GetStringValue("LASTNAME");
            if (e != null)
            {
                if (rt.EndToken.IsValue(e, null))
                {
                    OrgItemEponymToken re = new OrgItemEponymToken(t, rt.EndToken);
                    re.Eponyms.Add(rt.EndToken.GetSourceText());
                    return re;
                }
            }
        }
    }

    // An anniversary followed by a noun phrase.
    Pullenti.Ner.NumberToken nt = Pullenti.Ner.Core.NumberHelper.TryParseAnniversary(t1);
    if (nt != null && nt.Typ == Pullenti.Ner.NumberSpellingType.Age)
    {
        Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(nt.EndToken.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        if (npt != null)
        {
            string s = string.Format("{0}-{1} {2}", nt.Value, (t.Kit.BaseLanguage.IsUa ? "РОКІВ" : "ЛЕТ"), Pullenti.Ner.Core.MiscHelper.GetTextValue(npt.BeginToken, npt.EndToken, Pullenti.Ner.Core.GetTextAttr.No));
            OrgItemEponymToken res = new OrgItemEponymToken(t, npt.EndToken);
            res.Eponyms.Add(s);
            return res;
        }
    }

    List<PersonItemToken> its = PersonItemToken.TryAttach(t1);
    if (its == null)
    {
        // No person-like items: a geographic referent is still accepted.
        if ((t1 is Pullenti.Ner.ReferentToken) && (t1.GetReferent() is Pullenti.Ner.Geo.GeoReferent))
        {
            string s = Pullenti.Ner.Core.MiscHelper.GetTextValue(t1, t1, Pullenti.Ner.Core.GetTextAttr.No);
            OrgItemEponymToken re = new OrgItemEponymToken(t, t1);
            re.Eponyms.Add(s);
            return re;
        }
        return null;
    }

    List<string> eponims = new List<string>();
    int i = 0;
    // A single leading lower-case word is skipped.
    if (its[i].Typ == PersonItemType.LocaseWord)
    {
        i++;
    }
    if (i >= its.Count)
    {
        return null;
    }
    if (!full)
    {
        if (its[i].BeginToken.Morph.Class.IsAdjective && !its[i].BeginToken.Morph.Class.IsProperSurname)
        {
            return null;
        }
    }

    if (its[i].Typ == PersonItemType.Initial)
    {
        // Initials followed by a surname or name, repeated through "and" + initials.
        i++;
        while (true)
        {
            if ((i < its.Count) && its[i].Typ == PersonItemType.Initial)
            {
                i++;
            }
            if (i >= its.Count || ((its[i].Typ != PersonItemType.Surname && its[i].Typ != PersonItemType.Name)))
            {
                break;
            }
            eponims.Add(its[i].Value);
            t1 = its[i].EndToken;
            if ((i + 2) >= its.Count || its[i + 1].Typ != PersonItemType.And || its[i + 2].Typ != PersonItemType.Initial)
            {
                break;
            }
            i += 3;
        }
    }
    else if (((i + 1) < its.Count) && its[i].Typ == PersonItemType.Name && its[i + 1].Typ == PersonItemType.Surname)
    {
        // Name + surname, optionally followed by "and" + name + surname.
        eponims.Add(its[i + 1].Value);
        t1 = its[i + 1].EndToken;
        i += 2;
        if ((((i + 2) < its.Count) && its[i].Typ == PersonItemType.And && its[i + 1].Typ == PersonItemType.Name) && its[i + 2].Typ == PersonItemType.Surname)
        {
            eponims.Add(its[i + 2].Value);
            t1 = its[i + 2].EndToken;
        }
    }
    else if (its[i].Typ == PersonItemType.Surname)
    {
        // Two final items with the same character class are merged into one surname.
        if (its.Count == (i + 2) && its[i].Chars == its[i + 1].Chars)
        {
            its[i].Value += (" " + its[i + 1].Value);
            its[i].EndToken = its[i + 1].EndToken;
            its.RemoveAt(i + 1);
        }
        eponims.Add(its[i].Value);
        if (((i + 1) < its.Count) && its[i + 1].Typ == PersonItemType.Name)
        {
            if ((i + 2) == its.Count)
            {
                i++;
            }
            else if (its[i + 2].Typ != PersonItemType.Surname)
            {
                i++;
            }
        }
        else if (((i + 1) < its.Count) && its[i + 1].Typ == PersonItemType.Initial)
        {
            if ((i + 2) == its.Count)
            {
                i++;
            }
            else if (its[i + 2].Typ == PersonItemType.Initial && (i + 3) == its.Count)
            {
                i += 2;
            }
        }
        else if (((i + 2) < its.Count) && its[i + 1].Typ == PersonItemType.And && its[i + 2].Typ == PersonItemType.Surname)
        {
            // "Surname and Surname": the second one is taken unless its noun phrase has a
            // known case that is not genitive.
            bool ok = true;
            Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(its[i + 2].BeginToken, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
            if (npt != null && !npt.Morph.Case.IsGenitive && !npt.Morph.Case.IsUndefined)
            {
                ok = false;
            }
            if (ok)
            {
                eponims.Add(its[i + 2].Value);
                i += 2;
            }
        }
        t1 = its[i].EndToken;
    }
    else if (its[i].Typ == PersonItemType.Name && holy)
    {
        // A saint: one name, or two adjacent items with the same character class.
        t1 = its[i].EndToken;
        bool sec = false;
        if (((i + 1) < its.Count) && its[i].Chars == its[i + 1].Chars && its[i + 1].Typ != PersonItemType.Initial)
        {
            sec = true;
            t1 = its[i + 1].EndToken;
        }
        if (sec)
        {
            eponims.Add(string.Format("СВЯТ.{0} {1}", its[i].Value, its[i + 1].Value));
        }
        else
        {
            eponims.Add(string.Format("СВЯТ.{0}", its[i].Value));
        }
    }
    else if (full && (i + 1) == its.Count && ((its[i].Typ == PersonItemType.Name || its[i].Typ == PersonItemType.Surname)))
    {
        t1 = its[i].EndToken;
        eponims.Add(its[i].Value);
    }
    else if ((its[i].Typ == PersonItemType.Name && its.Count == 3 && ((i + 2) < its.Count) && its[i + 1].Typ == PersonItemType.Name) && its[i + 2].Typ == PersonItemType.Surname)
    {
        // "Name Name Surname". The (i + 2) bound keeps its[i + 2] inside the list when a
        // leading lower-case item was skipped (i == 1); without it the access threw
        // ArgumentOutOfRangeException for a three-item list.
        t1 = its[i + 2].EndToken;
        eponims.Add(string.Format("{0} {1} {2}", its[i].Value, its[i + 1].Value, its[i + 2].Value));
        i += 2;
    }

    if (eponims.Count == 0)
    {
        return null;
    }
    return new OrgItemEponymToken(t, t1) { Eponyms = eponims };
}
/// <summary>
/// Repairs words split into two adjacent tokens (optionally with a hyphen between them):
/// when the two parts glued together give a single dictionary word, both tokens are replaced
/// in the chain by one new text token.
/// </summary>
/// <param name="lang">language passed to the morphological analysis of the glued word</param>
void CorrectWordsByMerging(Pullenti.Morph.MorphLang lang)
{
    for (Pullenti.Ner.Token head = FirstToken; head != null && head.Next != null; head = head.Next)
    {
        if (!head.Chars.IsLetter || (head.LengthChar < 2))
        {
            continue;
        }
        Pullenti.Morph.MorphClass headClass = head.GetMorphClassInDictionary();
        if (head.Morph.ContainsAttr("прдктв.", null))
        {
            continue;
        }

        // The second part is the next token, or the one after a hyphen that does not end the line.
        Pullenti.Ner.Token tail = head.Next;
        bool hyphenBetween = tail.IsHiphen && tail.Next != null && !tail.IsNewlineAfter;
        if (hyphenBetween)
        {
            tail = tail.Next;
        }
        if (tail.LengthChar == 1)
        {
            continue;
        }
        if (!tail.Chars.IsLetter || !head.Chars.IsLetter || tail.Chars.IsLatinLetter != head.Chars.IsLatinLetter)
        {
            continue;
        }

        // The second part must be lower-case and the first part must not be all upper-case.
        if (tail.Chars.IsAllUpper && !head.Chars.IsAllUpper)
        {
            continue;
        }
        if (!tail.Chars.IsAllLower || head.Chars.IsAllUpper)
        {
            continue;
        }
        if (tail.Morph.ContainsAttr("прдктв.", null))
        {
            continue;
        }

        // When both parts are dictionary words on their own, leave them alone.
        Pullenti.Morph.MorphClass tailClass = tail.GetMorphClassInDictionary();
        if (!tailClass.IsUndefined && !headClass.IsUndefined)
        {
            continue;
        }

        string headTerm = (head as Pullenti.Ner.TextToken).Term;
        string tailTerm = (tail as Pullenti.Ner.TextToken).Term;
        if ((headTerm.Length + tailTerm.Length) < 6)
        {
            continue;
        }

        string glued = headTerm + tailTerm;
        List<Pullenti.Morph.MorphToken> parsed = Pullenti.Morph.MorphologyService.Process(glued, lang, null);
        if (parsed == null || parsed.Count != 1)
        {
            continue;
        }
        if (glued == "ПОСТ" || glued == "ВРЕД")
        {
            continue;
        }

        Pullenti.Ner.TextToken merged = new Pullenti.Ner.TextToken(parsed[0], this, head.BeginChar, tail.EndChar);
        if (merged.GetMorphClassInDictionary().IsUndefined)
        {
            continue;
        }
        merged.Chars = head.Chars;

        // Splice the merged token into the chain in place of head..tail.
        if (head == FirstToken)
        {
            FirstToken = merged;
        }
        else
        {
            head.Previous.Next = merged;
        }
        if (tail.Next != null)
        {
            merged.Next = tail.Next;
        }
        head = merged;
    }
}
/// <summary>
/// Collects a classification code that starts at the token by concatenating the source text
/// of consecutive number and text tokens on the same line.
/// </summary>
/// <param name="t0">first token of the code</param>
/// <returns>
/// the code token, or null when the collected text is shorter than 3 characters
/// or contains fewer than 2 digits
/// </returns>
public static UriItemToken AttachBBK(Pullenti.Ner.Token t0)
{
    StringBuilder code = new StringBuilder();
    Pullenti.Ner.Token last = t0;
    int digitCount = 0;

    for (Pullenti.Ner.Token cur = t0; cur != null; cur = cur.Next)
    {
        if (cur != t0 && cur.IsNewlineBefore)
        {
            break;
        }
        if (cur.IsTableControlChar)
        {
            break;
        }

        string piece;
        Pullenti.Ner.NumberToken num = cur as Pullenti.Ner.NumberToken;
        if (num != null)
        {
            // Only numbers written with digits and without a morphological class are part of the code.
            if (num.Typ != Pullenti.Ner.NumberSpellingType.Digit || !num.Morph.Class.IsUndefined)
            {
                break;
            }
            piece = num.GetSourceText();
            digitCount += piece.Length;
        }
        else
        {
            Pullenti.Ner.TextToken textTok = cur as Pullenti.Ner.TextToken;
            if (textTok == null)
            {
                break;
            }
            if (textTok.IsChar(','))
            {
                break;
            }
            // An opening bracket is accepted only when a number follows it.
            if (textTok.IsChar('(') && !(textTok.Next is Pullenti.Ner.NumberToken))
            {
                break;
            }
            piece = textTok.GetSourceText();
            // A word separated by whitespace ends the code.
            if (char.IsLetter(piece[0]) && textTok.IsWhitespaceBefore)
            {
                break;
            }
        }

        code.Append(piece);
        last = cur;
    }

    if ((code.Length < 3) || (digitCount < 2))
    {
        return null;
    }

    // A trailing dot is dropped from the value and from the token span.
    if (code[code.Length - 1] == '.')
    {
        code.Length--;
        last = last.Previous;
    }

    UriItemToken result = new UriItemToken(t0, last);
    result.Value = code.ToString();
    return result;
}
/// <summary>
/// Reads the user part of a mail address backwards from its last token.
/// A group in curly brackets, with members separated by ';' or ',', gives several users.
/// </summary>
/// <param name="t1">last token of the user part (or the closing '}' of a group)</param>
/// <returns>the users found, in text order, or null when nothing could be read</returns>
public static List<UriItemToken> AttachMailUsers(Pullenti.Ner.Token t1)
{
    if (t1 == null)
    {
        return null;
    }

    if (t1.IsChar('}'))
    {
        // Read the last member of the group, then walk left collecting further members
        // until the opening bracket is reached.
        List<UriItemToken> group = AttachMailUsers(t1.Previous);
        if (group == null)
        {
            return null;
        }
        Pullenti.Ner.Token cur = group[0].BeginToken.Previous;
        while (cur != null)
        {
            if (cur.IsChar('{'))
            {
                group[0].BeginToken = cur;
                return group;
            }
            if (!cur.IsCharOf(";,"))
            {
                List<UriItemToken> member = AttachMailUsers(cur);
                if (member == null)
                {
                    return null;
                }
                group.Insert(0, member[0]);
                cur = member[0].BeginToken;
            }
            cur = cur.Previous;
        }
        // The opening bracket was never found.
        return null;
    }

    StringBuilder name = new StringBuilder();
    Pullenti.Ner.Token first = t1;
    for (Pullenti.Ner.Token cur = t1; cur != null && !cur.IsWhitespaceAfter; cur = cur.Previous)
    {
        string piece;
        Pullenti.Ner.NumberToken num = cur as Pullenti.Ner.NumberToken;
        if (num != null)
        {
            piece = num.GetSourceText();
        }
        else
        {
            Pullenti.Ner.TextToken textTok = cur as Pullenti.Ner.TextToken;
            if (textTok == null)
            {
                break;
            }
            piece = textTok.GetSourceText();
            // Besides letters, only '.', '-' and '_' may start a piece of the user name.
            char lead = piece[0];
            if (!char.IsLetter(lead) && ".-_".IndexOf(lead) < 0)
            {
                break;
            }
        }
        name.Insert(0, piece);
        first = cur;
    }

    if (name.Length == 0)
    {
        return null;
    }

    UriItemToken user = new UriItemToken(first, t1);
    user.Value = name.ToString().ToLower();
    List<UriItemToken> users = new List<UriItemToken>();
    users.Add(user);
    return users;
}
/// <summary>
/// Tries to parse a number followed by a postfix (a measure unit or a currency) starting at
/// <paramref name="t"/>. A one-character money sign before the number is also accepted.
/// Recognised along the way: a real-number continuation, "с половиной" (+0.5), a bracketed
/// repetition of the value ("100 (сто) ..."), a bracketed remark starting with "СУММА",
/// and the multiplier suffixes used after money signs.
/// </summary>
/// <param name="t">token at which the number (or the money sign) is expected</param>
/// <returns>the parsed token, or null when no number with a known postfix starts here</returns>
public static Pullenti.Ner.Core.NumberExToken TryParseNumberWithPostfix(Pullenti.Ner.Token t)
{
    if (t == null)
    {
        return null;
    }
    Pullenti.Ner.Token t0 = t;

    // A leading single-character money sign: remember its code and step onto the number.
    string isDollar = null;
    if (t.LengthChar == 1 && t.Next != null)
    {
        isDollar = Pullenti.Ner.Core.NumberHelper.IsMoneyChar(t);
        if (isDollar != null)
        {
            t = t.Next;
        }
    }

    Pullenti.Ner.NumberToken nt = t as Pullenti.Ner.NumberToken;
    if (nt == null)
    {
        // Variant "(100) <money postfix>" that is not preceded by another number.
        if ((!(t.Previous is Pullenti.Ner.NumberToken) && t.IsChar('(') && (t.Next is Pullenti.Ner.NumberToken)) && t.Next.Next != null && t.Next.Next.IsChar(')'))
        {
            Pullenti.Ner.Core.TerminToken toks1 = m_Postfixes.TryParse(t.Next.Next.Next, Pullenti.Ner.Core.TerminParseAttr.No);
            if (toks1 != null && ((Pullenti.Ner.Core.NumberExType)toks1.Termin.Tag) == Pullenti.Ner.Core.NumberExType.Money)
            {
                Pullenti.Ner.NumberToken nt0 = t.Next as Pullenti.Ner.NumberToken;
                Pullenti.Ner.Core.NumberExToken res = new Pullenti.Ner.Core.NumberExToken(t, toks1.EndToken, nt0.Value, nt0.Typ, Pullenti.Ner.Core.NumberExType.Money)
                {
                    AltRealValue = nt0.RealValue,
                    Morph = toks1.BeginToken.Morph
                };
                return _correctMoney(res, toks1.BeginToken);
            }
        }

        // Variant: a single adjective glued from a numeral prefix and a postfix.
        Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
        if (tt == null || !tt.Morph.Class.IsAdjective)
        {
            return null;
        }
        string val = tt.Term;
        for (int i = 4; i < (val.Length - 5); i++)
        {
            string v = val.Substring(0, i);
            List <Pullenti.Ner.Core.Termin> li = Pullenti.Ner.Core.NumberHelper.m_Nums.FindTerminsByString(v, tt.Morph.Language);
            if (li == null)
            {
                continue;
            }
            string vv = val.Substring(i);
            List <Pullenti.Ner.Core.Termin> lii = m_Postfixes.FindTerminsByString(vv, tt.Morph.Language);
            if (lii != null && lii.Count > 0)
            {
                Pullenti.Ner.Core.NumberExToken re = new Pullenti.Ner.Core.NumberExToken(t, t, ((int)li[0].Tag).ToString(), Pullenti.Ner.NumberSpellingType.Words, (Pullenti.Ner.Core.NumberExType)lii[0].Tag)
                {
                    Morph = t.Morph
                };
                _correctExtTypes(re);
                return re;
            }
            // Only the first prefix that is a known numeral is tried.
            break;
        }
        return null;
    }

    if (t.Next == null && isDollar == null)
    {
        return null;
    }
    double f = nt.RealValue;
    if (double.IsNaN(f))
    {
        return null;
    }

    // t1 walks over everything between the number and its postfix.
    Pullenti.Ner.Token t1 = nt.Next;
    if (((t1 != null && t1.IsCharOf(",."))) || (((t1 is Pullenti.Ner.NumberToken) && (t1.WhitespacesBeforeCount < 3))))
    {
        Pullenti.Ner.NumberToken tt11 = Pullenti.Ner.Core.NumberHelper.TryParseRealNumber(nt, false, false);
        if (tt11 != null)
        {
            t1 = tt11.EndToken.Next;
            f = tt11.RealValue;
        }
    }
    if (t1 == null)
    {
        if (isDollar == null)
        {
            return null;
        }
    }
    else if ((t1.Next != null && t1.Next.IsValue("С", "З") && t1.Next.Next != null) && t1.Next.Next.IsValue("ПОЛОВИНА", null))
    {
        f += 0.5;
        t1 = t1.Next.Next;
    }
    if (t1 != null && t1.IsHiphen && t1.Next != null)
    {
        t1 = t1.Next;
    }

    // det becomes true once a bracketed repetition of the value has been confirmed.
    bool det = false;
    double altf = f;
    if (((t1 is Pullenti.Ner.NumberToken) && t1.Previous != null && t1.Previous.IsHiphen) && (t1 as Pullenti.Ner.NumberToken).IntValue == 0 && t1.LengthChar == 2)
    {
        t1 = t1.Next;
    }
    if ((t1 != null && t1.Next != null && t1.IsChar('(')) && (((t1.Next is Pullenti.Ner.NumberToken) || t1.Next.IsValue("НОЛЬ", null))) && t1.Next.Next != null)
    {
        Pullenti.Ner.NumberToken nt1 = t1.Next as Pullenti.Ner.NumberToken;
        double val = (double)0;
        if (nt1 != null)
        {
            val = nt1.RealValue;
        }
        if (Math.Floor(f) == Math.Floor(val))
        {
            Pullenti.Ner.Token ttt = t1.Next.Next;
            if (ttt.IsChar(')'))
            {
                t1 = ttt.Next;
                det = true;
                if ((t1 is Pullenti.Ner.NumberToken) && (t1 as Pullenti.Ner.NumberToken).IntValue != null && (t1 as Pullenti.Ner.NumberToken).IntValue.Value == 0)
                {
                    t1 = t1.Next;
                }
            }
            else if (((((ttt is Pullenti.Ner.NumberToken) && ((ttt as Pullenti.Ner.NumberToken).RealValue < 100) && ttt.Next != null) && ttt.Next.IsChar('/') && ttt.Next.Next != null) && ttt.Next.Next.GetSourceText() == "100" && ttt.Next.Next.Next != null) && ttt.Next.Next.Next.IsChar(')'))
            {
                // "(... NN/100)": the fraction must agree with the value of GetDecimalRest100(f).
                int rest = GetDecimalRest100(f);
                if ((ttt as Pullenti.Ner.NumberToken).IntValue != null && rest == (ttt as Pullenti.Ner.NumberToken).IntValue.Value)
                {
                    t1 = ttt.Next.Next.Next.Next;
                    det = true;
                }
            }
            else if ((ttt.IsValue("ЦЕЛЫХ", null) && (ttt.Next is Pullenti.Ner.NumberToken) && ttt.Next.Next != null) && ttt.Next.Next.Next != null && ttt.Next.Next.Next.IsChar(')'))
            {
                // "(... целых N десятых/сотых/...)": build the alternative value from the spelled fraction.
                Pullenti.Ner.NumberToken num2 = ttt.Next as Pullenti.Ner.NumberToken;
                altf = num2.RealValue;
                if (ttt.Next.Next.IsValue("ДЕСЯТЫЙ", null))
                {
                    altf /= 10;
                }
                else if (ttt.Next.Next.IsValue("СОТЫЙ", null))
                {
                    altf /= 100;
                }
                else if (ttt.Next.Next.IsValue("ТЫСЯЧНЫЙ", null))
                {
                    altf /= 1000;
                }
                else if (ttt.Next.Next.IsValue("ДЕСЯТИТЫСЯЧНЫЙ", null))
                {
                    altf /= 10000;
                }
                else if (ttt.Next.Next.IsValue("СТОТЫСЯЧНЫЙ", null))
                {
                    altf /= 100000;
                }
                else if (ttt.Next.Next.IsValue("МИЛЛИОННЫЙ", null))
                {
                    altf /= 1000000;
                }
                if (altf < 1)
                {
                    altf += val;
                    t1 = ttt.Next.Next.Next.Next;
                    det = true;
                }
            }
            else
            {
                // "(NNN <money postfix>)"
                Pullenti.Ner.Core.TerminToken toks1 = m_Postfixes.TryParse(ttt, Pullenti.Ner.Core.TerminParseAttr.No);
                if (toks1 != null)
                {
                    if (((Pullenti.Ner.Core.NumberExType)toks1.Termin.Tag) == Pullenti.Ner.Core.NumberExType.Money)
                    {
                        if (toks1.EndToken.Next != null && toks1.EndToken.Next.IsChar(')'))
                        {
                            Pullenti.Ner.Core.NumberExToken res = new Pullenti.Ner.Core.NumberExToken(t, toks1.EndToken.Next, nt.Value, nt.Typ, Pullenti.Ner.Core.NumberExType.Money)
                            {
                                RealValue = f,
                                AltRealValue = altf,
                                Morph = toks1.BeginToken.Morph
                            };
                            return _correctMoney(res, toks1.BeginToken);
                        }
                    }
                }
                // Otherwise the bracket content is parsed recursively as its own number with postfix.
                Pullenti.Ner.Core.NumberExToken res2 = TryParseNumberWithPostfix(t1.Next);
                if (res2 != null && res2.EndToken.Next != null && res2.EndToken.Next.IsChar(')'))
                {
                    res2.BeginToken = t;
                    res2.EndToken = res2.EndToken.Next;
                    res2.AltRealValue = res2.RealValue;
                    res2.RealValue = f;
                    _correctExtTypes(res2);
                    if (res2.WhitespacesAfterCount < 2)
                    {
                        Pullenti.Ner.Core.TerminToken toks2 = m_Postfixes.TryParse(res2.EndToken.Next, Pullenti.Ner.Core.TerminParseAttr.No);
                        if (toks2 != null)
                        {
                            if (((Pullenti.Ner.Core.NumberExType)toks2.Termin.Tag) == Pullenti.Ner.Core.NumberExType.Money)
                            {
                                res2.EndToken = toks2.EndToken;
                            }
                        }
                    }
                    return res2;
                }
            }
        }
        else if (nt1 != null && nt1.Typ == Pullenti.Ner.NumberSpellingType.Words && nt.Typ == Pullenti.Ner.NumberSpellingType.Digit)
        {
            // The spelled value differs from the digits: keep it as the alternative value.
            altf = nt1.RealValue;
            Pullenti.Ner.Token ttt = t1.Next.Next;
            if (ttt.IsChar(')'))
            {
                t1 = ttt.Next;
                det = true;
            }
            if (!det)
            {
                altf = f;
            }
        }
    }

    // Skip a bracketed remark that starts with "СУММА".
    if ((t1 != null && t1.IsChar('(') && t1.Next != null) && t1.Next.IsValue("СУММА", null))
    {
        Pullenti.Ner.Core.BracketSequenceToken br = Pullenti.Ner.Core.BracketHelper.TryParse(t1, Pullenti.Ner.Core.BracketParseAttr.No, 100);
        if (br != null)
        {
            t1 = br.EndToken.Next;
        }
    }

    if (isDollar != null)
    {
        // The currency is already known from the leading sign; only multipliers may follow.
        Pullenti.Ner.Token te = null;
        if (t1 != null)
        {
            te = t1.Previous;
        }
        else
        {
            // The number ends the text: take the very last token of the chain.
            for (t1 = t0; t1 != null; t1 = t1.Next)
            {
                if (t1.Next == null)
                {
                    te = t1;
                }
            }
        }
        if (te == null)
        {
            return null;
        }
        if (te.IsHiphen && te.Next != null)
        {
            if (te.Next.IsValue("МИЛЛИОННЫЙ", null))
            {
                f *= 1000000;
                altf *= 1000000;
                te = te.Next;
            }
            else if (te.Next.IsValue("МИЛЛИАРДНЫЙ", null))
            {
                f *= 1000000000;
                altf *= 1000000000;
                te = te.Next;
            }
        }
        if (!te.IsWhitespaceAfter && (te.Next is Pullenti.Ner.TextToken))
        {
            if (te.Next.IsValue("M", null))
            {
                f *= 1000000;
                altf *= 1000000;
                te = te.Next;
            }
            else if (te.Next.IsValue("BN", null))
            {
                f *= 1000000000;
                altf *= 1000000000;
                te = te.Next;
            }
        }
        return new Pullenti.Ner.Core.NumberExToken(t0, te, "", nt.Typ, Pullenti.Ner.Core.NumberExType.Money)
        {
            RealValue = f,
            AltRealValue = altf,
            ExTypParam = isDollar
        };
    }

    if (t1 == null || ((t1.IsNewlineBefore && !det)))
    {
        return null;
    }
    Pullenti.Ner.Core.TerminToken toks = m_Postfixes.TryParse(t1, Pullenti.Ner.Core.TerminParseAttr.No);
    if ((toks == null && det && (t1 is Pullenti.Ner.NumberToken)) && (t1 as Pullenti.Ner.NumberToken).Value == "0")
    {
        toks = m_Postfixes.TryParse(t1.Next, Pullenti.Ner.Core.TerminParseAttr.No);
    }
    if (toks == null && t1.IsChar('р'))
    {
        // A lone 'р' is taken as RUB only when one of the 10 preceding tokens is a money-related
        // word or a MONEY referent.
        int cou = 10;
        for (Pullenti.Ner.Token ttt = t0.Previous; ttt != null && cou > 0; ttt = ttt.Previous, cou--)
        {
            if (ttt.IsValue("СУММА", null) || ttt.IsValue("НАЛИЧНЫЙ", null) || ttt.IsValue("БАЛАНС", null))
            {
            }
            else if (ttt.GetReferent() != null && ttt.GetReferent().TypeName == "MONEY")
            {
            }
            else
            {
                continue;
            }
            toks = new Pullenti.Ner.Core.TerminToken(t1, t1) { Termin = m_Postfixes.FindTerminsByCanonicText("RUB")[0] };
            if (t1.Next != null && t1.Next.IsChar('.'))
            {
                toks.EndToken = t1.Next;
            }
            Pullenti.Ner.Core.NumberExType ty = (Pullenti.Ner.Core.NumberExType)toks.Termin.Tag;
            return new Pullenti.Ner.Core.NumberExToken(t, toks.EndToken, nt.Value, nt.Typ, ty)
            {
                RealValue = f,
                AltRealValue = altf,
                Morph = toks.BeginToken.Morph,
                ExTypParam = "RUB"
            };
        }
    }

    if (toks != null)
    {
        t1 = toks.EndToken;
        // Attach a following dot to an abbreviated postfix, but not to a full word or a non-letter.
        if (!t1.IsChar('.') && t1.Next != null && t1.Next.IsChar('.'))
        {
            if ((t1 is Pullenti.Ner.TextToken) && t1.IsValue(toks.Termin.Terms[0].CanonicalText, null))
            {
            }
            else if (!t1.Chars.IsLetter)
            {
            }
            else
            {
                t1 = t1.Next;
            }
        }
        if (toks.Termin.CanonicText == "LTL")
        {
            return null;
        }
        // A free-standing preposition or conjunction is not accepted as a one-token postfix.
        if (toks.BeginToken == t1)
        {
            if (t1.Morph.Class.IsPreposition || t1.Morph.Class.IsConjunction)
            {
                if (t1.IsWhitespaceBefore && t1.IsWhitespaceAfter)
                {
                    return null;
                }
            }
        }
        Pullenti.Ner.Core.NumberExType ty = (Pullenti.Ner.Core.NumberExType)toks.Termin.Tag;
        Pullenti.Ner.Core.NumberExToken res = new Pullenti.Ner.Core.NumberExToken(t, t1, nt.Value, nt.Typ, ty)
        {
            RealValue = f,
            AltRealValue = altf,
            Morph = toks.BeginToken.Morph
        };
        if (ty != Pullenti.Ner.Core.NumberExType.Money)
        {
            _correctExtTypes(res);
            return res;
        }
        return _correctMoney(res, toks.BeginToken);
    }

    Pullenti.Ner.Core.NumberExToken pfx = _attachSpecPostfix(t1);
    if (pfx != null)
    {
        pfx.BeginToken = t;
        pfx.Value = nt.Value;
        pfx.Typ = nt.Typ;
        pfx.RealValue = f;
        pfx.AltRealValue = altf;
        return pfx;
    }

    // "<number> <preposition/conjunction> <number with postfix>": borrow the unit of the second number.
    if (t1.Next != null && ((t1.Morph.Class.IsPreposition || t1.Morph.Class.IsConjunction)))
    {
        if (t1.IsValue("НА", null))
        {
        }
        else
        {
            Pullenti.Ner.Core.NumberExToken nn = TryParseNumberWithPostfix(t1.Next);
            if (nn != null)
            {
                return new Pullenti.Ner.Core.NumberExToken(t, t, nt.Value, nt.Typ, nn.ExTyp)
                {
                    RealValue = f,
                    AltRealValue = altf,
                    ExTyp2 = nn.ExTyp2,
                    ExTypParam = nn.ExTypParam
                };
            }
        }
    }

    // Unit glued to a multiplication sign and directly followed by the next number.
    if (!t1.IsWhitespaceAfter && (t1.Next is Pullenti.Ner.NumberToken) && (t1 is Pullenti.Ner.TextToken))
    {
        string term = (t1 as Pullenti.Ner.TextToken).Term;
        Pullenti.Ner.Core.NumberExType ty = Pullenti.Ner.Core.NumberExType.Undefined;
        if (term == "СМХ" || term == "CMX")
        {
            ty = Pullenti.Ner.Core.NumberExType.Santimeter;
        }
        else if (term == "MX" || term == "МХ")
        {
            ty = Pullenti.Ner.Core.NumberExType.Meter;
        }
        else if (term == "MMX" || term == "ММХ")
        {
            ty = Pullenti.Ner.Core.NumberExType.Millimeter;
        }
        if (ty != Pullenti.Ner.Core.NumberExType.Undefined)
        {
            return new Pullenti.Ner.Core.NumberExToken(t, t1, nt.Value, nt.Typ, ty)
            {
                RealValue = f,
                AltRealValue = altf,
                MultAfter = true
            };
        }
    }
    return null;
}
/// <summary>
/// Builds the token chain for the given source: runs morphological tokenization, applies the
/// optional corrections requested by <paramref name="sofa"/>, optionally creates number tokens,
/// and (unless <paramref name="onlyTokenizing"/>) prunes morphology of unknown words and collects statistics.
/// </summary>
/// <param name="sofa">analysed source; when null the constructor does nothing</param>
/// <param name="onlyTokenizing">stop right after the token chain is built</param>
/// <param name="lang">language passed to the morphology service</param>
/// <param name="progress">progress callback passed to the morphology service</param>
public AnalysisKit(Pullenti.Ner.SourceOfAnalysis sofa = null, bool onlyTokenizing = false, Pullenti.Morph.MorphLang lang = null, ProgressChangedEventHandler progress = null)
{
    if (sofa == null)
    {
        return;
    }
    m_Sofa = sofa;
    StartDate = DateTime.Now;

    // 1. Turn morphological tokens into a linked chain of text tokens.
    List <Pullenti.Morph.MorphToken> tokens = Pullenti.Morph.MorphologyService.Process(sofa.Text, lang, progress);
    Pullenti.Ner.Token t0 = null;
    if (tokens != null)
    {
        for (int ii = 0; ii < tokens.Count; ii++)
        {
            Pullenti.Morph.MorphToken mt = tokens[ii];
            Pullenti.Ner.TextToken tt = new Pullenti.Ner.TextToken(mt, this);
            if (sofa.CorrectionDict != null)
            {
                // A user-supplied correction replaces the token only when it is a single word.
                string corw;
                if (sofa.CorrectionDict.TryGetValue(mt.Term, out corw))
                {
                    List <Pullenti.Morph.MorphToken> ccc = Pullenti.Morph.MorphologyService.Process(corw, lang, null);
                    if (ccc != null && ccc.Count == 1)
                    {
                        Pullenti.Ner.TextToken tt1 = new Pullenti.Ner.TextToken(ccc[0], this, tt.BeginChar, tt.EndChar) { Term0 = tt.Term };
                        tt1.Chars = tt.Chars;
                        tt = tt1;
                        if (CorrectedTokens == null)
                        {
                            CorrectedTokens = new Dictionary <Pullenti.Ner.Token, string>();
                        }
                        CorrectedTokens.Add(tt, tt.GetSourceText());
                    }
                }
            }
            if (t0 == null)
            {
                FirstToken = tt;
            }
            else
            {
                t0.Next = tt;
            }
            t0 = tt;
        }
    }

    // 2. Optional clean-up passes, controlled by the source settings.
    if (sofa.ClearDust)
    {
        this.ClearDust();
    }
    if (sofa.DoWordsMergingByMorph)
    {
        this.CorrectWordsByMerging(lang);
    }
    if (sofa.DoWordCorrectionByMorph)
    {
        this.CorrectWordsByMorph(lang);
    }
    this.MergeLetters();
    this.DefineBaseLanguage();

    // 3. Replace number spellings by number tokens.
    if (sofa.CreateNumberTokens)
    {
        for (Pullenti.Ner.Token t = FirstToken; t != null; t = t.Next)
        {
            Pullenti.Ner.NumberToken nt = NumberHelper.TryParseNumber(t);
            if (nt == null)
            {
                continue;
            }
            this.EmbedToken(nt);
            t = nt;
        }
    }
    if (onlyTokenizing)
    {
        return;
    }

    // 4. For a long Cyrillic word missing from the dictionary, look for a dictionary neighbour
    //    with the same two-character ending and prune the word's morphology by that neighbour.
    for (Pullenti.Ner.Token t = FirstToken; t != null; t = t.Next)
    {
        if (t.Morph.Class.IsPreposition)
        {
            continue;
        }
        Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary();
        if (!mc.IsUndefined || !t.Chars.IsCyrillicLetter || t.LengthChar <= 4)
        {
            continue;
        }
        string tail = sofa.Text.Substring(t.EndChar - 1, 2);
        Pullenti.Ner.Token tte = null;

        // Look to the left first, optionally over one comma/"and"/preposition/conjunction.
        Pullenti.Ner.Token tt = t.Previous;
        if (tt != null && ((tt.IsCommaAnd || tt.Morph.Class.IsPreposition || tt.Morph.Class.IsConjunction)))
        {
            tt = tt.Previous;
        }
        if (_hasSameTailInDictionary(t, tt, tail, sofa.Text))
        {
            tte = tt;
        }
        if (tte == null)
        {
            // Then to the right, with the same skipping rule.
            tt = t.Next;
            if (tt != null && ((tt.IsCommaAnd || tt.Morph.Class.IsPreposition || tt.Morph.Class.IsConjunction)))
            {
                tt = tt.Next;
            }
            if (_hasSameTailInDictionary(t, tt, tail, sofa.Text))
            {
                tte = tt;
            }
        }
        if (tte != null)
        {
            t.Morph.RemoveItemsEx(tte.Morph, tte.GetMorphClassInDictionary());
        }
    }
    this.CreateStatistics();
}

// True when neighbour is a dictionary word longer than 4 characters that shares a morphological
// class bit with t and ends with the same two characters (tail) in the source text.
static bool _hasSameTailInDictionary(Pullenti.Ner.Token t, Pullenti.Ner.Token neighbour, string tail, string text)
{
    if (neighbour == null || neighbour.GetMorphClassInDictionary().IsUndefined)
    {
        return false;
    }
    if (((neighbour.Morph.Class.Value & t.Morph.Class.Value)) == 0)
    {
        return false;
    }
    if (neighbour.LengthChar <= 4)
    {
        return false;
    }
    return text.Substring(neighbour.EndChar - 1, 2) == tail;
}
/// <summary>
/// Scores the token span BeginToken..EndToken as a title candidate and stores the score in Rank.
/// Rank starts at 0 and is adjusted token by token (title markers, book-link fragments, referents,
/// person attributes, organization types, plain words, punctuation); at the end the number of
/// counted words is added and the number of non-word tokens is subtracted.
/// On success BeginNameToken / EndNameToken are set; TypeValue and Speciality may be set on the way.
/// </summary>
/// <param name="minNewlinesCount">a token (other than the first) preceded by more line breaks than this
/// lowers Rank by the excess</param>
/// <returns>false when the span is rejected: a theme marker after a line break, an unhandled title item
/// type, a Dust/Speciality item at the very start, more than 4 line starts inside the span, a URI or phone
/// referent, no words with Rank below 50, or an empty/inverted name range; true otherwise</returns>
bool CalcRankAndValue(int minNewlinesCount) { Rank = 0; if (BeginToken.Chars.IsAllLower) { Rank -= 30; } int words = 0; int upWords = 0; int notwords = 0; int lineNumber = 0; Pullenti.Ner.Token tstart = BeginToken; Pullenti.Ner.Token tend = EndToken; for (Pullenti.Ner.Token t = BeginToken; t != EndToken.Next && t != null && t.EndChar <= EndToken.EndChar; t = t.Next) { if (t.IsNewlineBefore) { } TitleItemToken tit = TitleItemToken.TryAttach(t); if (tit != null) { if (tit.Typ == TitleItemToken.Types.Theme || tit.Typ == TitleItemToken.Types.TypAndTheme) { if (t != BeginToken) { if (lineNumber > 0) { return(false); } words = (upWords = (notwords = 0)); tstart = tit.EndToken.Next; } t = tit.EndToken; if (t.Next == null) { return(false); } if (t.Next.Chars.IsLetter && t.Next.Chars.IsAllLower) { Rank += 20; } else { Rank += 100; } tstart = t.Next; if (tit.Typ == TitleItemToken.Types.TypAndTheme) { TypeValue = tit.Value; } continue; } if (tit.Typ == TitleItemToken.Types.Typ) { if (t == BeginToken) { if (tit.EndToken.IsNewlineAfter) { TypeValue = tit.Value; Rank += 5; tstart = tit.EndToken.Next; } } t = tit.EndToken; words++; if (tit.BeginToken != tit.EndToken) { words++; } if (tit.Chars.IsAllUpper) { upWords++; } continue; } if (tit.Typ == TitleItemToken.Types.Dust || tit.Typ == TitleItemToken.Types.Speciality) { if (t == BeginToken) { return(false); } Rank -= 20; if (tit.Typ == TitleItemToken.Types.Speciality) { Speciality = tit.Value; } t = tit.EndToken; continue; } if (tit.Typ == TitleItemToken.Types.Consultant || tit.Typ == TitleItemToken.Types.Boss || tit.Typ == TitleItemToken.Types.Editor) { t = tit.EndToken; if (t.Next != null && ((t.Next.IsCharOf(":") || t.Next.IsHiphen || t.WhitespacesAfterCount > 4))) { Rank -= 10; } else { Rank -= 2; } continue; } return(false); } Pullenti.Ner.Booklink.Internal.BookLinkToken blt = Pullenti.Ner.Booklink.Internal.BookLinkToken.TryParse(t, 0); if (blt != null) { if (blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.Misc || 
// NOTE(review): BookLinkTyp.N is tested in the first condition, so the N test of the following
// else-if can never match; only PageRange can reach the -20 penalty.
blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.N || blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.Pages) { Rank -= 10; } else if (blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.N || blt.Typ == Pullenti.Ner.Booklink.Internal.BookLinkTyp.PageRange) { Rank -= 20; } } if (t == BeginToken && Pullenti.Ner.Booklink.Internal.BookLinkToken.TryParseAuthor(t, Pullenti.Ner.Person.Internal.FioTemplateType.Undefined) != null) { Rank -= 20; } if (t.IsNewlineBefore && t != BeginToken) { lineNumber++; if (lineNumber > 4) { return(false); } if (t.Chars.IsAllLower) { Rank += 10; } else if (t.Previous.IsChar('.')) { Rank -= 10; } else if (t.Previous.IsCharOf(",-")) { Rank += 10; } else { Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t.Previous, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null); if (npt != null && npt.EndChar >= t.EndChar) { Rank += 10; } } } if (t != BeginToken && t.NewlinesBeforeCount > minNewlinesCount) { Rank -= (t.NewlinesBeforeCount - minNewlinesCount); } Pullenti.Ner.Core.BracketSequenceToken bst = Pullenti.Ner.Core.BracketHelper.TryParse(t, Pullenti.Ner.Core.BracketParseAttr.No, 100); if (bst != null && bst.IsQuoteType && bst.EndToken.EndChar <= EndToken.EndChar) { if (words == 0) { tstart = bst.BeginToken; Rank += 10; if (bst.EndToken == EndToken) { tend = EndToken; Rank += 10; } } } List <Pullenti.Ner.Referent> rli = t.GetReferents(); if (rli != null) { foreach (Pullenti.Ner.Referent r in rli) { if (r is Pullenti.Ner.Org.OrganizationReferent) { if (t.IsNewlineBefore) { Rank -= 10; } else { Rank -= 4; } continue; } if ((r is Pullenti.Ner.Geo.GeoReferent) || (r is Pullenti.Ner.Person.PersonReferent)) { if (t.IsNewlineBefore) { Rank -= 5; if (t.IsNewlineAfter || t.Next == null) { Rank -= 20; } else if (t.Next.IsHiphen || (t.Next is Pullenti.Ner.NumberToken) || (t.Next.GetReferent() is Pullenti.Ner.Date.DateReferent)) { Rank -= 20; } else if (t != BeginToken) { Rank -= 20; } } continue; } if ((r is 
// NOTE(review): GeoReferent is already consumed (with continue) by the preceding test, so in
// practice only DenominationReferent can match this condition.
Pullenti.Ner.Geo.GeoReferent) || (r is Pullenti.Ner.Denomination.DenominationReferent)) { continue; } if ((r is Pullenti.Ner.Uri.UriReferent) || (r is Pullenti.Ner.Phone.PhoneReferent)) { return(false); } if (t.IsNewlineBefore) { Rank -= 4; } else { Rank -= 2; } if (t == BeginToken && (EndToken.GetReferent() is Pullenti.Ner.Person.PersonReferent)) { Rank -= 10; } } words++; if (t.Chars.IsAllUpper) { upWords++; } if (t == BeginToken) { if (t.IsNewlineAfter) { Rank -= 10; } else if (t.Next != null && t.Next.IsChar('.') && t.Next.IsNewlineAfter) { Rank -= 10; } } continue; } if (t is Pullenti.Ner.NumberToken) { if ((t as Pullenti.Ner.NumberToken).Typ == Pullenti.Ner.NumberSpellingType.Words) { words++; if (t.Chars.IsAllUpper) { upWords++; } } else { notwords++; } continue; } Pullenti.Ner.Person.Internal.PersonAttrToken pat = Pullenti.Ner.Person.Internal.PersonAttrToken.TryAttach(t, null, Pullenti.Ner.Person.Internal.PersonAttrToken.PersonAttrAttachAttrs.No); if (pat != null) { if (t.IsNewlineBefore) { if (!pat.Morph.Case.IsUndefined && !pat.Morph.Case.IsNominative) { } else if (pat.Chars.IsAllUpper) { } else { Rank -= 20; } } else if (t.Chars.IsAllLower) { Rank--; } for (; t != null; t = t.Next) { words++; if (t.Chars.IsAllUpper) { upWords++; } if (t == pat.EndToken) { break; } } continue; } Pullenti.Ner.Org.Internal.OrgItemTypeToken oitt = Pullenti.Ner.Org.Internal.OrgItemTypeToken.TryAttach(t, true, null); if (oitt != null) { if (oitt.Morph.Number != Pullenti.Morph.MorphNumber.Plural && !oitt.IsDoubtRootWord) { if (!oitt.Morph.Case.IsUndefined && !oitt.Morph.Case.IsNominative) { words++; if (t.Chars.IsAllUpper) { upWords++; } } else { Rank -= 4; if (t == BeginToken) { Rank -= 5; } } } else { words += 1; if (t.Chars.IsAllUpper) { upWords++; } } t = oitt.EndToken; continue; } Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken; if (tt != null) { if (tt.IsChar('©')) { Rank -= 10; } if (tt.IsChar('_')) { Rank--; } if (tt.Chars.IsLetter) { if (tt.LengthChar > 2) { 
words++; if (t.Chars.IsAllUpper) { upWords++; } } } else if (!tt.IsChar(',')) { notwords++; } if (tt.IsPureVerb) { { Rank -= 30; words--; } break; } if (tt == EndToken) { if (tt.Morph.Class.IsPreposition || tt.Morph.Class.IsConjunction) { Rank -= 10; } else if (tt.IsChar('.')) { Rank += 5; } } else if (tt.IsCharOf("._")) { Rank -= 5; } } } Rank += words; Rank -= notwords; if ((words < 1) && (Rank < 50)) { return(false); } if (tstart == null || tend == null) { return(false); } if (tstart.EndChar > tend.EndChar) { return(false); } TitleItemToken tit1 = TitleItemToken.TryAttach(EndToken.Next); if (tit1 != null && ((tit1.Typ == TitleItemToken.Types.Typ || tit1.Typ == TitleItemToken.Types.Speciality))) { if (tit1.EndToken.IsNewlineAfter) { Rank += 15; } else { Rank += 10; } if (tit1.Typ == TitleItemToken.Types.Speciality) { Speciality = tit1.Value; } } if (upWords > 4 && upWords > ((int)((0.8 * words)))) { if (tstart.Previous != null && (tstart.Previous.GetReferent() is Pullenti.Ner.Person.PersonReferent)) { Rank += (5 + upWords); } } BeginNameToken = tstart; EndNameToken = tend; return(true); }
/// <summary>
/// Glues words that were typed with a space between every letter ("с л о в о") back into one token.
/// A run of single-character tokens separated by exactly one whitespace is merged when it is long
/// enough and the merged text is recognised by the morphology service as one dictionary word.
/// Tokens that are not text tokens are treated as boundaries and never dereferenced.
/// </summary>
void MergeLetters()
{
    // True when the previous merge succeeded; shorter runs are then accepted as well.
    bool beforeWord = false;
    StringBuilder tmp = new StringBuilder();
    for (Pullenti.Ner.Token t = FirstToken; t != null; t = t.Next)
    {
        Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
        // The original dereferenced the cast result unconditionally; a non-text token ends any run.
        if (tt == null || !tt.Chars.IsLetter || tt.LengthChar != 1)
        {
            beforeWord = false;
            continue;
        }

        // The run must be clearly separated from the preceding text.
        int i = t.WhitespacesBeforeCount;
        if (!(i > 2 || ((i == 2 && beforeWord))))
        {
            beforeWord = false;
            continue;
        }

        // Collect the following single-character tokens, each preceded by exactly one whitespace.
        i = 0;
        Pullenti.Ner.Token t1;
        tmp.Length = 0;
        tmp.Append(tt.GetSourceText());
        for (t1 = t; t1.Next != null; t1 = t1.Next)
        {
            tt = t1.Next as Pullenti.Ner.TextToken;
            if (tt == null || tt.LengthChar != 1 || tt.WhitespacesBeforeCount != 1)
            {
                break;
            }
            i++;
            tmp.Append(tt.GetSourceText());
        }
        if (!(i > 3 || ((i > 1 && beforeWord))))
        {
            beforeWord = false;
            continue;
        }
        beforeWord = false;

        // The merged text must be a single token with at least one dictionary word form.
        List <Pullenti.Morph.MorphToken> mt = Pullenti.Morph.MorphologyService.Process(tmp.ToString(), null, null);
        if (mt == null || mt.Count != 1)
        {
            t = t1;
            continue;
        }
        foreach (Pullenti.Morph.MorphWordForm wf in mt[0].WordForms)
        {
            if (wf.IsInDictionary)
            {
                beforeWord = true;
                break;
            }
        }
        if (!beforeWord)
        {
            t = t1;
            continue;
        }

        // Replace the run t..t1 by the merged token.
        tt = new Pullenti.Ner.TextToken(mt[0], this, t.BeginChar, t1.EndChar);
        if (t == FirstToken)
        {
            FirstToken = tt;
        }
        else
        {
            tt.Previous = t.Previous;
        }
        tt.Next = t1.Next;
        t = tt;
    }
}
/// <summary>
/// Replaces text tokens that are not found in the dictionary by the correction proposed by the
/// morphology service. Skipped are: tokens carrying the "прдктв." attribute, dictionary words,
/// words shorter than 4 characters, capitalised probable surnames and all-upper-case words.
/// Every replacement is recorded in CorrectedTokens.
/// </summary>
/// <param name="lang">language used when the token language is undefined, and for re-parsing the corrected word</param>
void CorrectWordsByMorph(Pullenti.Morph.MorphLang lang)
{
    for (Pullenti.Ner.Token cur = FirstToken; cur != null; cur = cur.Next)
    {
        Pullenti.Ner.TextToken word = cur as Pullenti.Ner.TextToken;
        if (word == null)
        {
            continue;
        }
        if (cur.Morph.ContainsAttr("прдктв.", null))
        {
            continue;
        }
        Pullenti.Morph.MorphClass dictClass = cur.GetMorphClassInDictionary();
        if (!dictClass.IsUndefined || cur.LengthChar < 4)
        {
            continue;
        }
        bool looksLikeSurname = cur.Morph.Class.IsProperSurname && !cur.Chars.IsAllLower;
        if (looksLikeSurname || cur.Chars.IsAllUpper)
        {
            continue;
        }

        Pullenti.Morph.MorphLang wordLang;
        if (cur.Morph.Language.IsUndefined)
        {
            wordLang = lang;
        }
        else
        {
            wordLang = cur.Morph.Language;
        }
        string fixedWord = Pullenti.Morph.MorphologyService.CorrectWord(word.Term, wordLang);
        if (fixedWord == null)
        {
            continue;
        }

        // The correction is accepted only when it is exactly one word.
        List <Pullenti.Morph.MorphToken> parsed = Pullenti.Morph.MorphologyService.Process(fixedWord, lang, null);
        if (parsed == null || parsed.Count != 1)
        {
            continue;
        }
        Pullenti.Ner.TextToken replacement = new Pullenti.Ner.TextToken(parsed[0], this, cur.BeginChar, cur.EndChar)
        {
            Chars = cur.Chars,
            Term0 = word.Term
        };
        if (replacement.GetMorphClassInDictionary().IsProperSurname)
        {
            continue;
        }

        // Splice the replacement into the chain in place of the original token.
        if (cur == FirstToken)
        {
            FirstToken = replacement;
        }
        else
        {
            cur.Previous.Next = replacement;
        }
        replacement.Next = cur.Next;
        cur = replacement;

        if (CorrectedTokens == null)
        {
            CorrectedTokens = new Dictionary <Pullenti.Ner.Token, string>();
        }
        CorrectedTokens.Add(cur, cur.GetSourceText());
    }
}
/// <summary>
/// Collects the content of a URI starting at <paramref name="t0"/>: an optional domain name
/// (via AttachDomainName) followed by numbers, letters and the punctuation listed in
/// <paramref name="chars"/>. Round and square brackets must be balanced one level deep.
/// </summary>
/// <param name="t0">first token of the URI content</param>
/// <param name="chars">non-letter characters that are allowed inside the content</param>
/// <param name="canBeWhitespaces">allow single whitespaces inside the URI when a domain was recognised
/// and the text after the gap still looks like a URI path</param>
/// <returns>the URI item; the bare domain item when nothing usable follows it; null when the domain
/// value is shorter than 3 characters or the whole value is just "www"</returns>
static UriItemToken _AttachUriContent(Pullenti.Ner.Token t0, string chars, bool canBeWhitespaces = false)
{
    UriItemToken domain = AttachDomainName(t0, true, canBeWhitespaces);
    if (domain != null && domain.Value.Length < 3)
    {
        return null;
    }

    StringBuilder content = new StringBuilder();
    Pullenti.Ner.Token last = t0;
    char openBracket = (char)0;
    Pullenti.Ner.Token cur = (domain != null) ? domain.EndToken.Next : t0;

    for (; cur != null; cur = cur.Next)
    {
        if (cur != t0 && cur.IsWhitespaceBefore)
        {
            // A gap is tolerated only on the same line, when allowed, and after a recognised domain.
            if (cur.IsNewlineBefore || !canBeWhitespaces || domain == null)
            {
                break;
            }
            Pullenti.Ner.Token before = cur.Previous;
            if (!before.IsHiphen)
            {
                if (before.IsCharOf(",;"))
                {
                    break;
                }
                bool twoLetterSuffix = before.IsChar('.') && cur.Chars.IsLetter && cur.LengthChar == 2;
                if (!twoLetterSuffix)
                {
                    // Look ahead inside the next whitespace-free chunk for a slash or a page extension.
                    bool pathFound = false;
                    Pullenti.Ner.Token probeStart = cur.IsCharOf("\\/") ? cur.Next : cur;
                    for (Pullenti.Ner.Token probe = probeStart; probe != null; probe = probe.Next)
                    {
                        if (probe != probeStart && probe.IsWhitespaceBefore)
                        {
                            break;
                        }
                        if (probe is Pullenti.Ner.NumberToken)
                        {
                            continue;
                        }
                        Pullenti.Ner.TextToken probeWord = probe as Pullenti.Ner.TextToken;
                        if (probeWord == null)
                        {
                            break;
                        }
                        switch (probeWord.Term)
                        {
                            case "HTM":
                            case "HTML":
                            case "SHTML":
                            case "ASP":
                            case "ASPX":
                            case "JSP":
                                pathFound = true;
                                break;
                        }
                        if (pathFound)
                        {
                            break;
                        }
                        if (probe.Chars.IsLetter)
                        {
                            if (!probe.Chars.IsLatinLetter)
                            {
                                break;
                            }
                        }
                        else
                        {
                            if (probe.IsCharOf("\\/"))
                            {
                                pathFound = true;
                                break;
                            }
                            if (!probe.IsCharOf(chars))
                            {
                                break;
                            }
                        }
                    }
                    if (!pathFound)
                    {
                        break;
                    }
                }
            }
        }

        Pullenti.Ner.NumberToken number = cur as Pullenti.Ner.NumberToken;
        if (number != null)
        {
            content.Append(number.GetSourceText());
            last = cur;
            continue;
        }

        Pullenti.Ner.TextToken word = cur as Pullenti.Ner.TextToken;
        if (word == null)
        {
            // A referent token is accepted as "РФ" right after a dot, or as a one-token Latin word.
            Pullenti.Ner.ReferentToken refTok = cur as Pullenti.Ner.ReferentToken;
            if (refTok == null)
            {
                break;
            }
            bool accept = false;
            if (refTok.BeginToken.IsValue("РФ", null) && content.Length > 0 && content[content.Length - 1] == '.')
            {
                accept = true;
            }
            else if (refTok.Chars.IsLatinLetter && refTok.BeginToken == refTok.EndToken)
            {
                accept = true;
            }
            if (!accept)
            {
                break;
            }
            content.Append(refTok.BeginToken.GetSourceText());
            last = cur;
            continue;
        }

        string piece = word.GetSourceText();
        char lead = piece[0];
        if (!char.IsLetter(lead))
        {
            if (chars.IndexOf(lead) < 0)
            {
                break;
            }
            if (lead == '(' || lead == '[')
            {
                openBracket = lead;
            }
            else if (lead == ')' || lead == ']')
            {
                // A closing bracket must match the bracket opened inside the URI.
                char required = (lead == ')') ? '(' : '[';
                if (openBracket != required)
                {
                    break;
                }
                openBracket = (char)0;
            }
        }
        content.Append(piece);
        last = cur;
    }

    if (content.Length == 0)
    {
        return domain;
    }

    // Content without a single letter or digit is not worth more than the bare domain.
    bool hasLetterOrDigit = false;
    for (int k = 0; k < content.Length; k++)
    {
        if (char.IsLetterOrDigit(content[k]))
        {
            hasLetterOrDigit = true;
            break;
        }
    }
    if (!hasLetterOrDigit)
    {
        return domain;
    }

    char tailChar = content[content.Length - 1];
    if (tailChar == '.' || tailChar == '/')
    {
        content.Length--;
        last = last.Previous;
    }
    if (domain != null)
    {
        content.Insert(0, domain.Value);
    }

    string stripped = content.ToString();
    if (stripped.StartsWith("\\\\"))
    {
        content.Replace("\\\\", "//");
        stripped = content.ToString();
    }
    if (stripped.StartsWith("//"))
    {
        stripped = stripped.Substring(2);
    }
    if (string.Compare(stripped, "WWW", true) == 0)
    {
        return null;
    }

    return new UriItemToken(t0, last) { Value = content.ToString() };
}
/// <summary>
/// Collects a domain name (or an IPv4-like address) starting at <paramref name="t0"/>: numbers,
/// letters and the characters ".-_" are concatenated in lower case.
/// </summary>
/// <param name="t0">first token of the candidate</param>
/// <param name="check">when true the result must be an IP (four numbers 0..255 with dots only)
/// or end with a known group from m_StdGroups preceded by a dot</param>
/// <param name="canBeWhitspaces">allow whitespaces inside the name when a known group from
/// m_StdGroups follows on the same line</param>
/// <returns>the item, or null when nothing was collected, the dots are malformed, there is no dot at all,
/// or the check failed</returns>
public static UriItemToken AttachDomainName(Pullenti.Ner.Token t0, bool check, bool canBeWhitspaces)
{
    StringBuilder txt = new StringBuilder();
    Pullenti.Ner.Token t1 = t0;
    int ipCount = 0;
    bool isIp = true;
    for (Pullenti.Ner.Token t = t0; t != null; t = t.Next)
    {
        if (t.IsWhitespaceBefore && t != t0)
        {
            // A gap is accepted only when a standard group is reachable through Latin words,
            // dots and hyphens on the same line.
            bool ok = false;
            if (!t.IsNewlineBefore && canBeWhitspaces)
            {
                for (Pullenti.Ner.Token tt1 = t; tt1 != null; tt1 = tt1.Next)
                {
                    if (tt1.IsChar('.') || tt1.IsHiphen)
                    {
                        continue;
                    }
                    if (tt1.IsWhitespaceBefore)
                    {
                        if (tt1.IsNewlineBefore)
                        {
                            break;
                        }
                        if (tt1.Previous != null && ((tt1.Previous.IsChar('.') || tt1.Previous.IsHiphen)))
                        {
                        }
                        else
                        {
                            break;
                        }
                    }
                    if (!(tt1 is Pullenti.Ner.TextToken))
                    {
                        break;
                    }
                    if (m_StdGroups.TryParse(tt1, Pullenti.Ner.Core.TerminParseAttr.No) != null)
                    {
                        ok = true;
                        break;
                    }
                    if (!tt1.Chars.IsLatinLetter)
                    {
                        break;
                    }
                }
            }
            if (!ok)
            {
                break;
            }
        }

        if (t is Pullenti.Ner.NumberToken)
        {
            Pullenti.Ner.NumberToken nt = t as Pullenti.Ner.NumberToken;
            if (nt.IntValue == null)
            {
                break;
            }
            txt.Append(nt.GetSourceText());
            t1 = t;
            // Count the parts that could be octets of an IP address.
            if (nt.Typ == Pullenti.Ner.NumberSpellingType.Digit && nt.IntValue.Value >= 0 && (nt.IntValue.Value < 256))
            {
                ipCount++;
            }
            else
            {
                isIp = false;
            }
            continue;
        }

        Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
        if (tt == null)
        {
            break;
        }
        string src = tt.Term;
        char ch = src[0];
        if (!char.IsLetter(ch))
        {
            if (".-_".IndexOf(ch) < 0)
            {
                break;
            }
            if (ch != '.')
            {
                isIp = false;
            }
            if (ch == '-')
            {
                // "vk.com-..." : the domain ends before the hyphen.
                if (string.Compare(txt.ToString(), "vk.com", true) == 0)
                {
                    return new UriItemToken(t0, t1) { Value = txt.ToString().ToLower() };
                }
            }
        }
        else
        {
            isIp = false;
        }
        txt.Append(src.ToLower());
        t1 = t;
    }

    if (txt.Length == 0)
    {
        return null;
    }
    if (ipCount != 4)
    {
        isIp = false;
    }

    // Validate the dots: none at the start, no doubled dots, and a trailing dot is cut off.
    int i;
    int points = 0;
    for (i = 0; i < txt.Length; i++)
    {
        if (txt[i] == '.')
        {
            if (i == 0)
            {
                return null;
            }
            if (i >= (txt.Length - 1))
            {
                txt.Length--;
                t1 = t1.Previous;
                break;
            }
            if (txt[i - 1] == '.' || txt[i + 1] == '.')
            {
                return null;
            }
            points++;
        }
    }
    if (points == 0)
    {
        return null;
    }

    if (check)
    {
        bool ok = isIp;
        if (!isIp)
        {
            // NOTE(review): a value without dots was already rejected above, so this
            // "localhost" comparison looks unreachable - confirm before relying on it.
            if (txt.ToString() == "localhost")
            {
                ok = true;
            }
        }
        if (!ok && t1.Previous != null && t1.Previous.IsChar('.'))
        {
            if (m_StdGroups.TryParse(t1, Pullenti.Ner.Core.TerminParseAttr.No) != null)
            {
                ok = true;
            }
        }
        if (!ok)
        {
            return null;
        }
    }
    return new UriItemToken(t0, t1) { Value = txt.ToString().ToLower() };
}
/// <summary>
/// Parses a verb phrase starting at token t by walking consecutive text tokens.
/// Each token gets a local kind "ty": 1 - verb (normalised as a verb), 2 - adverb (IsAdverb is set on
/// the item), 3 - adjective-like form such as a participle (normalised as an adjective); a token with
/// ty == 0 ends the phrase. The particle "НЕ" is remembered and attached to the next item (Not = true).
/// With canBePartition a leading preposition before the first item is accepted and checked against
/// the case of that item. Trailing adverb items are removed from the result (the first item is never removed).
/// </summary>
/// <param name="t">first token of the candidate phrase</param>
/// <param name="canBePartition">allow participle-like verb forms (adjective class or a defined case)
/// and a leading preposition</param>
/// <param name="canBeAdjPartition">allow adjectives (including short forms) to be taken as ty == 3</param>
/// <param name="forceParse">take a noun/verb homonym as a verb even when the other heuristics do not apply</param>
/// <returns>the phrase, or null when no item of kind 1 or 3 was found or the preposition's case
/// does not agree with the first item</returns>
static VerbPhraseToken TryParseRu(Pullenti.Ner.Token t, bool canBePartition, bool canBeAdjPartition, bool forceParse) { VerbPhraseToken res = null; Pullenti.Ner.Token t0 = t; Pullenti.Ner.Token not = null; bool hasVerb = false; bool verbBeBefore = false; PrepositionToken prep = null; for (; t != null; t = t.Next) { if (!(t is Pullenti.Ner.TextToken)) { break; } Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken; bool isParticiple = false; if (tt.Term == "НЕ") { not = t; continue; } int ty = 0; string norm = null; Pullenti.Morph.MorphClass mc = tt.GetMorphClassInDictionary(); if (tt.Term == "НЕТ") { if (hasVerb) { break; } ty = 1; } else if (tt.Term == "ДОПУСТИМО") { ty = 3; } else if (mc.IsAdverb && !mc.IsVerb) { ty = 2; } else if (tt.IsPureVerb || tt.IsVerbBe) { ty = 1; if (hasVerb) { if (!tt.Morph.ContainsAttr("инф.", null)) { if (verbBeBefore) { } else { break; } } } } else if (mc.IsVerb) { if (mc.IsPreposition || mc.IsMisc || mc.IsPronoun) { } else if (mc.IsNoun) { if (tt.Term == "СТАЛИ" || tt.Term == "СТЕКЛО" || tt.Term == "БЫЛИ") { ty = 1; } else if (!tt.Chars.IsAllLower && !MiscHelper.CanBeStartOfSentence(tt)) { ty = 1; } else if (mc.IsAdjective && canBePartition) { ty = 1; } else if (forceParse) { ty = 1; } } else if (mc.IsProper) { if (tt.Chars.IsAllLower) { ty = 1; } } else { ty = 1; } if (mc.IsAdjective) { isParticiple = true; } if (!tt.Morph.Case.IsUndefined) { isParticiple = true; } if (!canBePartition && isParticiple) { break; } if (hasVerb) { if (tt.Morph.ContainsAttr("инф.", null)) { } else if (!isParticiple) { } else { break; } } } else if ((mc.IsAdjective && tt.Morph.ContainsAttr("к.ф.", null) && tt.Term.EndsWith("О")) && NounPhraseHelper.TryParse(tt, NounPhraseParseAttr.No, 0, null) == null) { ty = 2; } else if (mc.IsAdjective && ((canBePartition || canBeAdjPartition))) { if (tt.Morph.ContainsAttr("к.ф.", null) && !canBeAdjPartition) { break; } norm = tt.GetNormalCaseText(Pullenti.Morph.MorphClass.Adjective, 
Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Masculine, false); if (norm.EndsWith("ЙШИЙ")) { } else { List <Pullenti.Semantic.Utils.DerivateGroup> grs = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm, true, null); if (grs != null && grs.Count > 0) { bool hVerb = false; bool hPart = false; foreach (Pullenti.Semantic.Utils.DerivateGroup gr in grs) { foreach (Pullenti.Semantic.Utils.DerivateWord w in gr.Words) { if (w.Class.IsAdjective && w.Class.IsVerb) { if (w.Spelling == norm) { hPart = true; } } else if (w.Class.IsVerb) { hVerb = true; } } } if (hPart && hVerb) { ty = 3; } else if (canBeAdjPartition) { ty = 3; } if (ty != 3 && !string.IsNullOrEmpty(grs[0].Prefix) && norm.StartsWith(grs[0].Prefix)) { hVerb = false; hPart = false; string norm1 = norm.Substring(grs[0].Prefix.Length); grs = Pullenti.Semantic.Utils.DerivateService.FindDerivates(norm1, true, null); if (grs != null && grs.Count > 0) { foreach (Pullenti.Semantic.Utils.DerivateGroup gr in grs) { foreach (Pullenti.Semantic.Utils.DerivateWord w in gr.Words) { if (w.Class.IsAdjective && w.Class.IsVerb) { if (w.Spelling == norm1) { hPart = true; } } else if (w.Class.IsVerb) { hVerb = true; } } } } if (hPart && hVerb) { ty = 3; } } } } } if (ty == 0 && t == t0 && canBePartition) { prep = PrepositionHelper.TryParse(t); if (prep != null) { t = prep.EndToken; continue; } } if (ty == 0) { break; } if (res == null) { res = new VerbPhraseToken(t0, t); } res.EndToken = t; VerbPhraseItemToken it = new VerbPhraseItemToken(t, t) { Morph = new Pullenti.Ner.MorphCollection(t.Morph) }; if (not != null) { it.BeginToken = not; it.Not = true; not = null; } it.IsAdverb = ty == 2; if (prep != null && !t.Morph.Case.IsUndefined && res.Items.Count == 0) { if (((prep.NextCase & t.Morph.Case)).IsUndefined) { return(null); } it.Morph.RemoveItems(prep.NextCase); res.Preposition = prep; } if (norm == null) { norm = t.GetNormalCaseText((ty == 3 ? Pullenti.Morph.MorphClass.Adjective : (ty == 2 ? 
Pullenti.Morph.MorphClass.Adverb : Pullenti.Morph.MorphClass.Verb)), Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Masculine, false); if (ty == 1 && !tt.Morph.Case.IsUndefined) { Pullenti.Morph.MorphWordForm mi = new Pullenti.Morph.MorphWordForm() { Case = Pullenti.Morph.MorphCase.Nominative, Number = Pullenti.Morph.MorphNumber.Singular, Gender = Pullenti.Morph.MorphGender.Masculine }; foreach (Pullenti.Morph.MorphBaseInfo mit in tt.Morph.Items) { if (mit is Pullenti.Morph.MorphWordForm) { mi.Misc = (mit as Pullenti.Morph.MorphWordForm).Misc; break; } } string nnn = Pullenti.Morph.MorphologyService.GetWordform("КК" + (t as Pullenti.Ner.TextToken).Term, mi); if (nnn != null) { norm = nnn.Substring(2); } } } it.Normal = norm; res.Items.Add(it); if (!hasVerb && ((ty == 1 || ty == 3))) { res.Morph = it.Morph; hasVerb = true; } if (ty == 1 || ty == 3) { if (ty == 1 && tt.IsVerbBe) { verbBeBefore = true; } else { verbBeBefore = false; } } } if (!hasVerb) { return(null); } for (int i = res.Items.Count - 1; i > 0; i--) { if (res.Items[i].IsAdverb) { res.Items.RemoveAt(i); res.EndToken = res.Items[i - 1].EndToken; } else { break; } } return(res); }
/// <summary>
/// Collects an ISBN-like value starting at <paramref name="t0"/>: a run of digit-only
/// number tokens separated by hyphens, optionally terminated by an "X" check character.
/// </summary>
/// <param name="t0">first token of the candidate value</param>
/// <returns>
/// a token spanning from <paramref name="t0"/> to the last accepted token, with the collected
/// text as its value, or null when fewer than 7 digits were collected
/// </returns>
public static UriItemToken AttachISBN(Pullenti.Ner.Token t0)
{
    StringBuilder buffer = new StringBuilder();
    Pullenti.Ner.Token last = t0;
    int digitChars = 0;

    Pullenti.Ner.Token cur = t0;
    while (cur != null)
    {
        if (cur.IsTableControlChar)
        {
            break;
        }

        // A line break ends the value unless the previous token is a hyphen.
        if (cur != t0 && cur.IsNewlineBefore)
        {
            if (cur.Previous == null || !cur.Previous.IsHiphen)
            {
                break;
            }
        }

        Pullenti.Ner.NumberToken number = cur as Pullenti.Ner.NumberToken;
        if (number != null)
        {
            // Only plain digit numbers with no morphological class are accepted.
            if (number.Typ != Pullenti.Ner.NumberSpellingType.Digit || !number.Morph.Class.IsUndefined)
            {
                break;
            }
            string digits = number.GetSourceText();
            buffer.Append(digits);
            digitChars += digits.Length;
            last = cur;
            if (digitChars > 13)
            {
                break;
            }
        }
        else
        {
            Pullenti.Ner.TextToken text = cur as Pullenti.Ner.TextToken;
            if (text == null)
            {
                break;
            }
            string term = text.Term;
            bool isHyphen = term == "-";
            // Besides the hyphen only the check character is allowed: Cyrillic "Х" or Latin "X".
            if (!isHyphen && term != "Х" && term != "X")
            {
                break;
            }
            // The check character is always stored as Latin "X" and always ends the value.
            buffer.Append(isHyphen ? "-" : "X");
            last = cur;
            if (!isHyphen)
            {
                break;
            }
        }

        cur = cur.Next;
    }

    string value = buffer.ToString();
    int digitCount = 0;
    foreach (char c in value)
    {
        if (char.IsDigit(c))
        {
            digitCount++;
        }
    }
    if (digitCount < 7)
    {
        return null;
    }

    return new UriItemToken(t0, last) { Value = value };
}
/// <summary>
/// Tries to recognise a title-page item starting at the given token: a theme marker,
/// a translation note, a section header, a speciality, a work type (optionally combined
/// with a theme marker), or a keyword from the termin dictionary.
/// </summary>
/// <param name="t">token to start from; null yields null</param>
/// <returns>the recognised item, or null when nothing matches at this position</returns>
public static TitleItemToken TryAttach(Pullenti.Ner.Token t)
{
    // The method calls itself with "next" tokens that may be null at the end of the text.
    if (t == null)
    {
        return(null);
    }
    Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
    if (tt != null)
    {
        Pullenti.Ner.Token t1 = (Pullenti.Ner.Token)tt;
        // "ТЕМА" followed by a work type gives a combined item, otherwise a plain theme marker.
        if (tt.Term == "ТЕМА")
        {
            TitleItemToken tit = TryAttach(tt.Next);
            if (tit != null && tit.Typ == Types.Typ)
            {
                t1 = tit.EndToken;
                if (t1.Next != null && t1.Next.IsChar(':'))
                {
                    t1 = t1.Next;
                }
                return(new TitleItemToken(t, t1, Types.TypAndTheme) { Value = tit.Value });
            }
            if (tt.Next != null && tt.Next.IsChar(':'))
            {
                t1 = tt.Next;
            }
            return(new TitleItemToken(tt, t1, Types.Theme));
        }
        // "ПО ТЕМА ..." / "НА ТЕМА ..." with an optional trailing colon.
        if (tt.Term == "ПО" || tt.Term == "НА")
        {
            if (tt.Next != null && tt.Next.IsValue("ТЕМА", null))
            {
                t1 = tt.Next;
                if (t1.Next != null && t1.Next.IsChar(':'))
                {
                    t1 = t1.Next;
                }
                return(new TitleItemToken(tt, t1, Types.Theme));
            }
        }
        // Translation note: "ПЕРЕВОД" / "ПЕР", optional dot, then "C" (Latin or Cyrillic) and one more text token.
        if (tt.Term == "ПЕРЕВОД" || tt.Term == "ПЕР")
        {
            Pullenti.Ner.Token tt2 = tt.Next;
            if (tt2 != null && tt2.IsChar('.'))
            {
                tt2 = tt2.Next;
            }
            if (tt2 is Pullenti.Ner.TextToken)
            {
                if ((tt2 as Pullenti.Ner.TextToken).Term == "C" || (tt2 as Pullenti.Ner.TextToken).Term == "С")
                {
                    tt2 = tt2.Next;
                    if (tt2 is Pullenti.Ner.TextToken)
                    {
                        return(new TitleItemToken(t, tt2, Types.Translate));
                    }
                }
            }
        }
        // Section header: skipped as "dust" up to the end of a bracket sequence or,
        // when a colon followed the keyword, up to the end of the line.
        if (tt.Term == "СЕКЦИЯ" || tt.Term == "SECTION" || tt.Term == "СЕКЦІЯ")
        {
            t1 = tt.Next;
            if (t1 != null && t1.IsChar(':'))
            {
                t1 = t1.Next;
            }
            Pullenti.Ner.Core.BracketSequenceToken br = Pullenti.Ner.Core.BracketHelper.TryParse(t1, Pullenti.Ner.Core.BracketParseAttr.No, 100);
            if (br != null)
            {
                t1 = br.EndToken;
            }
            else if (t1 != tt.Next)
            {
                for (; t1 != null; t1 = t1.Next)
                {
                    if (t1.IsNewlineAfter)
                    {
                        break;
                    }
                }
                if (t1 == null)
                {
                    return(null);
                }
            }
            if (t1 != tt.Next)
            {
                return(new TitleItemToken(tt, t1, Types.Dust));
            }
        }
        // Speciality introduced by the keyword itself, by "<preposition> + keyword",
        // or by a '/' at the start of a line.
        t1 = null;
        if (tt.IsValue("СПЕЦИАЛЬНОСТЬ", "СПЕЦІАЛЬНІСТЬ"))
        {
            t1 = tt.Next;
        }
        else if (tt.Morph.Class.IsPreposition && tt.Next != null && tt.Next.IsValue("СПЕЦИАЛЬНОСТЬ", "СПЕЦІАЛЬНІСТЬ"))
        {
            t1 = tt.Next.Next;
        }
        else if (tt.IsChar('/') && tt.IsNewlineBefore)
        {
            t1 = tt.Next;
        }
        if (t1 != null)
        {
            if (t1.IsCharOf(":") || t1.IsHiphen)
            {
                t1 = t1.Next;
            }
            TitleItemToken spec = TryAttachSpeciality(t1, true);
            if (spec != null)
            {
                spec.BeginToken = t;
                return(spec);
            }
        }
    }
    TitleItemToken sss = TryAttachSpeciality(t, false);
    if (sss != null)
    {
        return(sss);
    }
    if (t is Pullenti.Ner.ReferentToken)
    {
        return(null);
    }
    // Work type expressed as a noun phrase whose last token is a dictionary keyword of kind Typ.
    Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
    if (npt != null)
    {
        string s = npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Undefined, Pullenti.Morph.MorphGender.Undefined, false);
        Pullenti.Ner.Core.TerminToken tok = m_Termins.TryParse(npt.EndToken, Pullenti.Ner.Core.TerminParseAttr.No);
        if (tok != null)
        {
            Types ty = (Types)tok.Termin.Tag;
            if (ty == Types.Typ)
            {
                TitleItemToken tit = TryAttach(tok.EndToken.Next);
                if (tit != null && tit.Typ == Types.Theme)
                {
                    // Fixed: the return statement was not terminated inside its block.
                    return(new TitleItemToken(npt.BeginToken, tit.EndToken, Types.TypAndTheme) { Value = s });
                }
                // These bare nouns alone are not accepted as a work type.
                if (s == "РАБОТА" || s == "РОБОТА" || s == "ПРОЕКТ")
                {
                    return(null);
                }
                Pullenti.Ner.Token t1 = tok.EndToken;
                if (s == "ДИССЕРТАЦИЯ" || s == "ДИСЕРТАЦІЯ")
                {
                    // Look ahead for the degree being sought to refine the dissertation kind;
                    // give up after more than 3 tokens that match nothing.
                    int err = 0;
                    for (Pullenti.Ner.Token ttt = t1.Next; ttt != null; ttt = ttt.Next)
                    {
                        if (ttt.Morph.Class.IsPreposition)
                        {
                            continue;
                        }
                        if (ttt.IsValue("СОИСКАНИЕ", ""))
                        {
                            continue;
                        }
                        Pullenti.Ner.Core.NounPhraseToken npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(ttt, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                        if (npt1 != null && npt1.Noun.IsValue("СТЕПЕНЬ", "СТУПІНЬ"))
                        {
                            t1 = (ttt = npt1.EndToken);
                            continue;
                        }
                        Pullenti.Ner.ReferentToken rt = t1.Kit.ProcessReferent("PERSON", ttt);
                        if (rt != null && (rt.Referent is Pullenti.Ner.Person.PersonPropertyReferent))
                        {
                            Pullenti.Ner.Person.PersonPropertyReferent ppr = rt.Referent as Pullenti.Ner.Person.PersonPropertyReferent;
                            if (ppr.Name == "доктор наук")
                            {
                                t1 = rt.EndToken;
                                s = "ДОКТОРСКАЯ ДИССЕРТАЦИЯ";
                                break;
                            }
                            else if (ppr.Name == "кандидат наук")
                            {
                                t1 = rt.EndToken;
                                s = "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ";
                                break;
                            }
                            else if (ppr.Name == "магистр")
                            {
                                t1 = rt.EndToken;
                                s = "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ";
                                break;
                            }
                        }
                        if (ttt.IsValue("ДОКТОР", null) || ttt.IsValue("КАНДИДАТ", null) || ttt.IsValue("МАГИСТР", "МАГІСТР"))
                        {
                            t1 = ttt;
                            npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(ttt.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                            if (npt1 != null && npt1.EndToken.IsValue("НАУК", null))
                            {
                                t1 = npt1.EndToken;
                            }
                            s = (ttt.IsValue("МАГИСТР", "МАГІСТР") ? "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ" : (ttt.IsValue("ДОКТОР", null) ? "ДОКТОРСКАЯ ДИССЕРТАЦИЯ" : "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"));
                            break;
                        }
                        if ((++err) > 3)
                        {
                            break;
                        }
                    }
                }
                if (t1.Next != null && t1.Next.IsChar('.'))
                {
                    t1 = t1.Next;
                }
                // "... ОТЧЕТ О <noun phrase in prepositional case>": include the phrase in the item.
                if (s.EndsWith("ОТЧЕТ") && t1.Next != null && t1.Next.IsValue("О", null))
                {
                    Pullenti.Ner.Core.NounPhraseToken npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t1.Next, Pullenti.Ner.Core.NounPhraseParseAttr.ParsePreposition, 0, null);
                    if (npt1 != null && npt1.Morph.Case.IsPrepositional)
                    {
                        t1 = npt1.EndToken;
                    }
                }
                return(new TitleItemToken(npt.BeginToken, t1, ty) { Value = s });
            }
        }
    }
    // Plain dictionary keyword at the current token.
    Pullenti.Ner.Core.TerminToken tok1 = m_Termins.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
    if (tok1 != null)
    {
        Pullenti.Ner.Token t1 = tok1.EndToken;
        TitleItemToken re = new TitleItemToken(tok1.BeginToken, t1, (Types)tok1.Termin.Tag);
        return(re);
    }
    // Dictionary keyword enclosed in brackets or quotes; the item ends on the closing character.
    if (Pullenti.Ner.Core.BracketHelper.CanBeStartOfSequence(t, false, false))
    {
        tok1 = m_Termins.TryParse(t.Next, Pullenti.Ner.Core.TerminParseAttr.No);
        if (tok1 != null && Pullenti.Ner.Core.BracketHelper.CanBeEndOfSequence(tok1.EndToken.Next, false, null, false))
        {
            Pullenti.Ner.Token t1 = tok1.EndToken.Next;
            return(new TitleItemToken(tok1.BeginToken, t1, (Types)tok1.Termin.Tag));
        }
    }
    return(null);
}
/// <summary>
/// Tries to build a denomination starting at <paramref name="t"/>: the token itself followed by
/// adjacent digit numbers, short text tokens, separators and a few special characters.
/// </summary>
/// <param name="t">first token of the candidate denomination</param>
/// <param name="forOntology">
/// when true, whitespace between tokens does not stop the scan and the final
/// plausibility checks are skipped
/// </param>
/// <returns>the referent token, or null when the candidate is rejected</returns>
public Pullenti.Ner.ReferentToken TryAttach(Pullenti.Ner.Token t, bool forOntology = false)
{
    if (t == null)
    {
        return null;
    }

    Pullenti.Ner.ReferentToken special = this.TryAttachSpec(t);
    if (special != null)
    {
        return special;
    }

    // An all-lowercase head is accepted only when glued to a following number
    // and not glued to a preceding token (other than ',' or ':').
    if (t.Chars.IsAllLower)
    {
        bool gluedToNumber = !t.IsWhitespaceAfter && (t.Next is Pullenti.Ner.NumberToken);
        if (!gluedToNumber)
        {
            return null;
        }
        bool freeOnLeft = t.Previous == null || t.IsWhitespaceBefore || t.Previous.IsCharOf(",:");
        if (!freeOnLeft)
        {
            return null;
        }
    }

    StringBuilder tail = new StringBuilder();
    Pullenti.Ner.Token end = t;
    bool endsWithSeparator = false;
    bool valid = true;
    int numberCount = 0;
    int specialCount = 0;

    for (Pullenti.Ner.Token cur = t.Next; cur != null; cur = cur.Next)
    {
        if (cur.IsWhitespaceBefore && !forOntology)
        {
            break;
        }

        if (cur.IsCharOf("/\\_") || cur.IsHiphen)
        {
            // Every separator is normalised to '-' and does not move the end token.
            endsWithSeparator = true;
            tail.Append('-');
            continue;
        }
        endsWithSeparator = false;

        Pullenti.Ner.NumberToken number = cur as Pullenti.Ner.NumberToken;
        Pullenti.Ner.TextToken text = cur as Pullenti.Ner.TextToken;
        if (number != null)
        {
            if (number.Typ != Pullenti.Ner.NumberSpellingType.Digit)
            {
                break;
            }
            end = number;
            tail.Append(number.GetSourceText());
            numberCount++;
        }
        else if (text == null)
        {
            break;
        }
        else
        {
            if (text.LengthChar > 3)
            {
                valid = false;
                break;
            }
            if (!char.IsLetter(text.Term[0]))
            {
                // ',' / ':' or a closing bracket end the scan without invalidating the result.
                if (text.IsCharOf(",:") || Pullenti.Ner.Core.BracketHelper.CanBeEndOfSequence(text, false, null, false))
                {
                    break;
                }
                if (!text.IsCharOf("+*&^#@!"))
                {
                    valid = false;
                    break;
                }
                specialCount++;
            }
            end = text;
            tail.Append(text.GetSourceText());
        }
    }

    if (!forOntology)
    {
        if (tail.Length < 1 || !valid || endsWithSeparator || tail.Length > 12)
        {
            return null;
        }
        if (tail[tail.Length - 1] == '!')
        {
            return null;
        }
        // At least one number or special character must follow the head token.
        if (numberCount + specialCount == 0)
        {
            return null;
        }
        if (!this.CheckAttach(t, end))
        {
            return null;
        }
    }

    DenominationReferent referent = new DenominationReferent();
    referent.AddValue(t, end);
    return new Pullenti.Ner.ReferentToken(referent, t, end);
}
/// <summary>
/// Tries to recognise an organization number at the given token: a number with a number prefix,
/// a number attached by a hyphen, or (for some organization types) a bare number or
/// a single letter combined with a number.
/// </summary>
/// <param name="t">token to start from; null yields null</param>
/// <param name="canBePureNumber">not read by this method</param>
/// <param name="typ">organization type token, if known; enables the type-specific forms</param>
/// <returns>the recognised number token, or null when nothing matches</returns>
public static OrgItemNumberToken TryAttach(Pullenti.Ner.Token t, bool canBePureNumber = false, OrgItemTypeToken typ = null)
{
    if (t == null)
    {
        return(null);
    }
    Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
    if (tt != null)
    {
        // NOTE(review): CheckNumberPrefix presumably returns the token following a number prefix — confirm.
        Pullenti.Ner.Token t1 = Pullenti.Ner.Core.MiscHelper.CheckNumberPrefix(tt);
        if ((t1 is Pullenti.Ner.NumberToken) && !t1.IsNewlineBefore)
        {
            OrgItemNumberToken res = new OrgItemNumberToken(tt, t1) { Number = (t1 as Pullenti.Ner.NumberToken).Value.ToString() };
            // For these types a "number/number" form is joined into a single value.
            if (t1.Next != null && t1.Next.IsCharOf("\\/") && (t1.Next.Next is Pullenti.Ner.NumberToken))
            {
                if (typ != null && ((typ.Typ == "офис" || typ.Typ == "банк" || typ.Typ == "отделение")))
                {
                    res.EndToken = res.EndToken.Next.Next;
                    res.Number = string.Format("{0}/{1}", res.Number, (res.EndToken as Pullenti.Ner.NumberToken).Value);
                }
            }
            return(res);
        }
    }
    // Hyphen glued on both sides and followed by a number, unless the number parses as an age.
    if ((t.IsHiphen && (t.Next is Pullenti.Ner.NumberToken) && !t.IsWhitespaceBefore) && !t.IsWhitespaceAfter)
    {
        if (Pullenti.Ner.Core.NumberHelper.TryParseAge(t.Next) == null)
        {
            // Fixed: the return statement was not terminated inside its block.
            return(new OrgItemNumberToken(t, t.Next) { Number = (t.Next as Pullenti.Ner.NumberToken).Value.ToString() });
        }
    }
    if (t is Pullenti.Ner.NumberToken)
    {
        // Number glued to a preceding hyphen.
        if ((!t.IsWhitespaceBefore && t.Previous != null && t.Previous.IsHiphen))
        {
            // Fixed: the return statement was not terminated inside its block.
            return(new OrgItemNumberToken(t, t) { Number = (t as Pullenti.Ner.NumberToken).Value.ToString() });
        }
        if (typ != null && typ.Typ != null && (((typ.Typ == "войсковая часть" || typ.Typ == "військова частина" || typ.Typ.Contains("колония")) || typ.Typ.Contains("колонія") || typ.Typ.Contains("школа"))))
        {
            // NOTE(review): this condition is always true ('||' rather than '&&'), so numbers of any
            // length are accepted here. Left unchanged because narrowing it would alter which
            // numbers match — confirm the intended range before tightening.
            if (t.LengthChar >= 4 || t.LengthChar <= 6)
            {
                OrgItemNumberToken res = new OrgItemNumberToken(t, t) { Number = (t as Pullenti.Ner.NumberToken).Value.ToString() };
                if (t.Next != null && ((t.Next.IsHiphen || t.Next.IsCharOf("\\/"))) && !t.Next.IsWhitespaceAfter)
                {
                    if ((t.Next.Next is Pullenti.Ner.NumberToken) && ((t.LengthChar + t.Next.Next.LengthChar) < 9))
                    {
                        // Second numeric part; the separator is always written as '-'.
                        res.EndToken = t.Next.Next;
                        res.Number = string.Format("{0}-{1}", res.Number, (res.EndToken as Pullenti.Ner.NumberToken).Value);
                    }
                    else if ((t.Next.Next is Pullenti.Ner.TextToken) && t.Next.Next.LengthChar == 1 && t.Next.Next.Chars.IsLetter)
                    {
                        // Single-letter suffix after the separator; the separator is dropped.
                        res.EndToken = t.Next.Next;
                        res.Number = string.Format("{0}{1}", res.Number, (res.EndToken as Pullenti.Ner.TextToken).Term);
                    }
                }
                else if (((t.Next is Pullenti.Ner.TextToken) && t.Next.LengthChar == 1 && t.Next.Chars.IsLetter) && !t.IsWhitespaceAfter)
                {
                    // Single-letter suffix glued directly to the number.
                    res.EndToken = t.Next;
                    res.Number = string.Format("{0}{1}", res.Number, (res.EndToken as Pullenti.Ner.TextToken).Term);
                }
                return(res);
            }
        }
    }
    // Single letter followed (optionally through a hyphen) by a number, for the listed types.
    if (((t is Pullenti.Ner.TextToken) && t.LengthChar == 1 && t.Chars.IsLetter) && ((!t.IsWhitespaceAfter || (((t.WhitespacesAfterCount < 2) && t.Chars.IsAllUpper)))))
    {
        if (typ != null && typ.Typ != null && (((typ.Typ == "войсковая часть" || typ.Typ == "військова частина" || typ.Typ.Contains("колония")) || typ.Typ.Contains("колонія"))))
        {
            Pullenti.Ner.Token tt1 = t.Next;
            if (tt1 != null && tt1.IsHiphen)
            {
                tt1 = tt1.Next;
            }
            if (tt1 is Pullenti.Ner.NumberToken)
            {
                OrgItemNumberToken res = new OrgItemNumberToken(t, tt1);
                res.Number = string.Format("{0}{1}", (t as Pullenti.Ner.TextToken).Term, (tt1 as Pullenti.Ner.NumberToken).Value);
                return(res);
            }
        }
    }
    return(null);
}
// The two braces below close enclosing scopes that were opened before this method.
}
}