/// <summary>
/// Collects a code made of digit numbers and adjoining text fragments, starting at <paramref name="t0"/>.
/// </summary>
/// <remarks>
/// Scanning stops at a line break (other than the one before the first token), at a table control
/// character, at a number that is not written in plain digits or that has a defined morphological class,
/// at a comma, at an opening parenthesis not followed by a number, at a letter fragment preceded by
/// whitespace, or at any token that is neither a number nor a text token.
/// </remarks>
/// <param name="t0">First token of the candidate code.</param>
/// <returns>
/// An item spanning the consumed tokens whose Value is their concatenated source text (one trailing
/// dot is dropped), or null when fewer than 3 characters or fewer than 2 number characters were collected.
/// </returns>
public static UriItemToken AttachBBK(Pullenti.Ner.Token t0)
{
    StringBuilder code = new StringBuilder();
    Pullenti.Ner.Token lastToken = t0;
    int digitChars = 0;

    Pullenti.Ner.Token cur = t0;
    while (cur != null)
    {
        if ((cur.IsNewlineBefore && cur != t0) || cur.IsTableControlChar)
        {
            break;
        }

        Pullenti.Ner.NumberToken number = cur as Pullenti.Ner.NumberToken;
        if (number != null)
        {
            // Only plain digit numbers without a morphological class are part of the code.
            bool plainDigits = number.Typ == Pullenti.Ner.NumberSpellingType.Digit && number.Morph.Class.IsUndefined;
            if (!plainDigits)
            {
                break;
            }

            string digits = number.GetSourceText();
            code.Append(digits);
            digitChars += digits.Length;
        }
        else
        {
            Pullenti.Ner.TextToken text = cur as Pullenti.Ner.TextToken;
            if (text == null || text.IsChar(','))
            {
                break;
            }

            // A parenthesis is only accepted when a number follows it directly.
            if (text.IsChar('(') && !(text.Next is Pullenti.Ner.NumberToken))
            {
                break;
            }

            string piece = text.GetSourceText();
            // Letters must be glued to what precedes them.
            if (char.IsLetter(piece[0]) && text.IsWhitespaceBefore)
            {
                break;
            }

            code.Append(piece);
        }

        lastToken = cur;
        cur = cur.Next;
    }

    if (code.Length < 3 || digitChars < 2)
    {
        return null;
    }

    if (code[code.Length - 1] == '.')
    {
        // Drop the trailing dot from both the value and the covered token range.
        code.Remove(code.Length - 1, 1);
        lastToken = lastToken.Previous;
    }

    UriItemToken result = new UriItemToken(t0, lastToken);
    result.Value = code.ToString();
    return result;
}
/// <summary>
/// Tries to build a denomination referent that starts at <paramref name="t"/> and continues over the
/// numbers, short text tokens and separators that follow it.
/// </summary>
/// <param name="t">Candidate first token; null yields null.</param>
/// <param name="forOntology">
/// When true, whitespace between tokens does not stop the scan and the final validity checks are skipped.
/// </param>
/// <returns>
/// The result of TryAttachSpec when it succeeds; otherwise a new referent token covering the head token
/// through the last accepted token, or null when the candidate is rejected.
/// </returns>
public Pullenti.Ner.ReferentToken TryAttach(Pullenti.Ner.Token t, bool forOntology = false)
{
    if (t == null)
    {
        return null;
    }

    Pullenti.Ner.ReferentToken special = this.TryAttachSpec(t);
    if (special != null)
    {
        return special;
    }

    if (t.Chars.IsAllLower)
    {
        // An all-lowercase head is accepted only when a number is glued to it ...
        bool gluedNumberFollows = !t.IsWhitespaceAfter && (t.Next is Pullenti.Ner.NumberToken);
        if (!gluedNumberFollows)
        {
            return null;
        }

        // ... and the head starts the text, follows whitespace, or follows ',' or ':'.
        bool startsCleanly = t.Previous == null || t.IsWhitespaceBefore || t.Previous.IsCharOf(",:");
        if (!startsCleanly)
        {
            return null;
        }
    }

    StringBuilder tail = new StringBuilder();
    Pullenti.Ner.Token endToken = t;
    bool endsWithSeparator = false;
    bool valid = true;
    int numberCount = 0;
    int symbolCount = 0;

    for (Pullenti.Ner.Token cur = t.Next; cur != null; cur = cur.Next)
    {
        if (cur.IsWhitespaceBefore && !forOntology)
        {
            break;
        }

        // Slashes, underscores and hyphens are all normalized to '-' and do not extend the end token.
        endsWithSeparator = cur.IsCharOf("/\\_") || cur.IsHiphen;
        if (endsWithSeparator)
        {
            tail.Append('-');
            continue;
        }

        Pullenti.Ner.NumberToken number = cur as Pullenti.Ner.NumberToken;
        if (number != null)
        {
            if (number.Typ != Pullenti.Ner.NumberSpellingType.Digit)
            {
                break;
            }

            endToken = number;
            tail.Append(number.GetSourceText());
            numberCount++;
            continue;
        }

        Pullenti.Ner.TextToken word = cur as Pullenti.Ner.TextToken;
        if (word == null)
        {
            break;
        }

        if (word.LengthChar > 3)
        {
            valid = false;
            break;
        }

        if (!char.IsLetter(word.Term[0]))
        {
            if (word.IsCharOf(",:") || Pullenti.Ner.Core.BracketHelper.CanBeEndOfSequence(word, false, null, false))
            {
                break;
            }

            if (!word.IsCharOf("+*&^#@!"))
            {
                valid = false;
                break;
            }

            symbolCount++;
        }

        endToken = word;
        tail.Append(word.GetSourceText());
    }

    if (!forOntology)
    {
        // Checks run in this order; the last-character test is only reached when the tail is non-empty.
        bool rejected = tail.Length < 1
            || !valid
            || endsWithSeparator
            || tail.Length > 12
            || tail[tail.Length - 1] == '!'
            || (numberCount + symbolCount) == 0
            || !this.CheckAttach(t, endToken);
        if (rejected)
        {
            return null;
        }
    }

    DenominationReferent referent = new DenominationReferent();
    referent.AddValue(t, endToken);
    return new Pullenti.Ner.ReferentToken(referent, t, endToken);
}
/// <summary>
/// Reads one or more mail user names backwards, ending at <paramref name="t1"/>.
/// </summary>
/// <remarks>
/// Two forms are handled. If <paramref name="t1"/> is a closing curly brace, user names are read
/// backwards (separated by ';' or ',') until the opening curly brace, and the first item's BeginToken
/// is moved onto that opening brace. Otherwise a single name is collected from numbers, letters and
/// the characters '.', '-' and '_' that are not followed by whitespace.
/// The value is lower-cased with the invariant culture so the result does not depend on the
/// current thread culture (for example the Turkish dotless i).
/// </remarks>
/// <param name="t1">Last token of the user part; null yields null.</param>
/// <returns>The recognized items in text order, or null when nothing could be attached.</returns>
public static List <UriItemToken> AttachMailUsers(Pullenti.Ner.Token t1)
{
    if (t1 == null)
    {
        return null;
    }

    if (t1.IsChar('}'))
    {
        List <UriItemToken> res0 = AttachMailUsers(t1.Previous);
        if (res0 == null)
        {
            return null;
        }

        // Walk further back, prepending each user until the opening brace is reached.
        t1 = res0[0].BeginToken.Previous;
        for (; t1 != null; t1 = t1.Previous)
        {
            if (t1.IsChar('{'))
            {
                res0[0].BeginToken = t1;
                return res0;
            }

            if (t1.IsCharOf(";,"))
            {
                continue;
            }

            List <UriItemToken> res1 = AttachMailUsers(t1);
            if (res1 == null)
            {
                return null;
            }

            res0.Insert(0, res1[0]);
            t1 = res1[0].BeginToken;
        }

        // Ran out of tokens without finding the opening brace.
        return null;
    }

    StringBuilder txt = new StringBuilder();
    Pullenti.Ner.Token t0 = t1;
    for (Pullenti.Ner.Token t = t1; t != null; t = t.Previous)
    {
        if (t.IsWhitespaceAfter)
        {
            break;
        }

        Pullenti.Ner.NumberToken nt = t as Pullenti.Ner.NumberToken;
        if (nt != null)
        {
            txt.Insert(0, nt.GetSourceText());
            t0 = t;
            continue;
        }

        Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
        if (tt == null)
        {
            break;
        }

        string src = tt.GetSourceText();
        char ch = src[0];
        // Only letters and the punctuation '.', '-', '_' may appear in the name.
        if (!char.IsLetter(ch) && ".-_".IndexOf(ch) < 0)
        {
            break;
        }

        txt.Insert(0, src);
        t0 = t;
    }

    if (txt.Length == 0)
    {
        return null;
    }

    List <UriItemToken> res = new List <UriItemToken>();
    // Invariant lower-casing: culture-sensitive ToLower() would alter 'I' under Turkish/Azeri cultures.
    res.Add(new UriItemToken(t0, t1) { Value = txt.ToString().ToLowerInvariant() });
    return res;
}
/// <summary>
/// Builds a URI content item from the tokens starting at <paramref name="t0"/>.
/// </summary>
/// <remarks>
/// A domain name is first tried via AttachDomainName(t0, true, canBeWhitespaces); a domain whose value is
/// shorter than 3 characters makes the method return null, otherwise scanning continues after the domain.
/// Numbers and text tokens are appended as source text. A non-letter text token is accepted only when its
/// first character occurs in <paramref name="chars"/>; a closing round or square bracket is accepted only
/// when the matching opening bracket was seen before it.
/// A referent token is accepted when its begin token has the value "РФ" and the collected text ends with
/// a dot, or when it is a single-token Latin-letter referent.
/// Whitespace before a token ends the scan unless <paramref name="canBeWhitespaces"/> is true, a domain was
/// found, the break is not a newline, the previous token is not ',' or ';', and one of these holds:
/// the previous token is a hyphen; the previous token is a dot and the current token is a two-character
/// letter token; or the look-ahead over the following whitespace-free run finds a slash/backslash or one of
/// the terms HTM, HTML, SHTML, ASP, ASPX, JSP.
/// After scanning, one trailing dot or slash is removed (and the end token moved back), and the domain
/// value is prepended. If the text starts with two backslashes, every pair of backslashes is replaced by "//".
/// NOTE(review): txt.Replace affects all occurrences in the buffer, not only the leading one - confirm intended.
/// </remarks>
/// <param name="t0">First token to examine.</param>
/// <param name="chars">Non-letter characters permitted inside the content.</param>
/// <param name="canBeWhitespaces">Allows the limited whitespace tolerance described above.</param>
/// <returns>
/// The domain item (possibly null) when nothing was collected or the collected text has no letter or digit;
/// null when the text, ignoring a leading "//", equals "WWW" case-insensitively; otherwise a new item from
/// <paramref name="t0"/> to the last accepted token with the collected text as Value.
/// </returns>
static UriItemToken _AttachUriContent(Pullenti.Ner.Token t0, string chars, bool canBeWhitespaces = false) { StringBuilder txt = new StringBuilder(); Pullenti.Ner.Token t1 = t0; UriItemToken dom = AttachDomainName(t0, true, canBeWhitespaces); if (dom != null) { if (dom.Value.Length < 3) { return(null); } } char openChar = (char)0; Pullenti.Ner.Token t = t0; if (dom != null) { t = dom.EndToken.Next; } for (; t != null; t = t.Next) { if (t != t0 && t.IsWhitespaceBefore) { if (t.IsNewlineBefore || !canBeWhitespaces) { break; } if (dom == null) { break; } if (t.Previous.IsHiphen) { } else if (t.Previous.IsCharOf(",;")) { break; } else if (t.Previous.IsChar('.') && t.Chars.IsLetter && t.LengthChar == 2) { } else { bool ok = false; Pullenti.Ner.Token tt1 = t; if (t.IsCharOf("\\/")) { tt1 = t.Next; } Pullenti.Ner.Token tt0 = tt1; for (; tt1 != null; tt1 = tt1.Next) { if (tt1 != tt0 && tt1.IsWhitespaceBefore) { break; } if (tt1 is Pullenti.Ner.NumberToken) { continue; } if (!(tt1 is Pullenti.Ner.TextToken)) { break; } string term1 = (tt1 as Pullenti.Ner.TextToken).Term; if (((term1 == "HTM" || term1 == "HTML" || term1 == "SHTML") || term1 == "ASP" || term1 == "ASPX") || term1 == "JSP") { ok = true; break; } if (!tt1.Chars.IsLetter) { if (tt1.IsCharOf("\\/")) { ok = true; break; } if (!tt1.IsCharOf(chars)) { break; } } else if (!tt1.Chars.IsLatinLetter) { break; } } if (!ok) { break; } } } if (t is Pullenti.Ner.NumberToken) { Pullenti.Ner.NumberToken nt = t as Pullenti.Ner.NumberToken; txt.Append(nt.GetSourceText()); t1 = t; continue; } Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken; if (tt == null) { Pullenti.Ner.ReferentToken rt = t as Pullenti.Ner.ReferentToken; if (rt != null && rt.BeginToken.IsValue("РФ", null)) { if (txt.Length > 0 && txt[txt.Length - 1] == '.') { txt.Append(rt.BeginToken.GetSourceText()); t1 = t; continue; } } if (rt != null && rt.Chars.IsLatinLetter && rt.BeginToken == rt.EndToken) { txt.Append(rt.BeginToken.GetSourceText()); t1 = t; 
continue; } break; } string src = tt.GetSourceText(); char ch = src[0]; if (!char.IsLetter(ch)) { if (chars.IndexOf(ch) < 0) { break; } if (ch == '(' || ch == '[') { openChar = ch; } else if (ch == ')') { if (openChar != '(') { break; } openChar = (char)0; } else if (ch == ']') { if (openChar != '[') { break; } openChar = (char)0; } } txt.Append(src); t1 = t; } if (txt.Length == 0) { return(dom); } int i; for (i = 0; i < txt.Length; i++) { if (char.IsLetterOrDigit(txt[i])) { break; } } if (i >= txt.Length) { return(dom); } if (txt[txt.Length - 1] == '.' || txt[txt.Length - 1] == '/') { txt.Length--; t1 = t1.Previous; } if (dom != null) { txt.Insert(0, dom.Value); } string tmp = txt.ToString(); if (tmp.StartsWith("\\\\")) { txt.Replace("\\\\", "//"); tmp = txt.ToString(); } if (tmp.StartsWith("//")) { tmp = tmp.Substring(2); } if (string.Compare(tmp, "WWW", true) == 0) { return(null); } UriItemToken res = new UriItemToken(t0, t1) { Value = txt.ToString() }; return(res); }
/// <summary>
/// Joins runs of single-character tokens separated by exactly one whitespace into one text token
/// when the joined string is recognized as a dictionary word.
/// </summary>
/// <remarks>
/// A run may start only at a one-letter token preceded by more than 2 whitespaces, or by exactly 2
/// when the previous run was merged into a word. The run must add more than 3 further tokens
/// (more than 1 right after a merged word). The joined text is passed to MorphologyService.Process
/// and merged only if that yields exactly one morph token with a word form marked IsInDictionary.
/// Tokens that are not text tokens are treated as non-letters: they never start a run and they end a
/// run in progress, instead of causing a NullReferenceException on the failed cast.
/// </remarks>
void MergeLetters()
{
    bool beforeWord = false;
    StringBuilder tmp = new StringBuilder();
    for (Pullenti.Ner.Token t = FirstToken; t != null; t = t.Next)
    {
        Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken;
        // Null guard: a non-text token cannot be the first letter of a spaced-out word.
        if (tt == null || !tt.Chars.IsLetter || tt.LengthChar != 1)
        {
            beforeWord = false;
            continue;
        }

        int i = t.WhitespacesBeforeCount;
        if (!(i > 2 || (i == 2 && beforeWord)))
        {
            beforeWord = false;
            continue;
        }

        // Collect the following single-character tokens separated by exactly one whitespace.
        i = 0;
        Pullenti.Ner.Token t1;
        tmp.Length = 0;
        tmp.Append(tt.GetSourceText());
        for (t1 = t; t1.Next != null; t1 = t1.Next)
        {
            tt = t1.Next as Pullenti.Ner.TextToken;
            // Null guard: a non-text token terminates the run.
            if (tt == null || tt.LengthChar != 1 || tt.WhitespacesBeforeCount != 1)
            {
                break;
            }

            i++;
            tmp.Append(tt.GetSourceText());
        }

        bool longEnough = i > 3 || (i > 1 && beforeWord);
        beforeWord = false;
        if (!longEnough)
        {
            // Too short: continue from the token after t, not after the run.
            continue;
        }

        List <Pullenti.Morph.MorphToken> mt = Pullenti.Morph.MorphologyService.Process(tmp.ToString(), null, null);
        if (mt == null || mt.Count != 1)
        {
            t = t1;
            continue;
        }

        foreach (Pullenti.Morph.MorphWordForm wf in mt[0].WordForms)
        {
            if (wf.IsInDictionary)
            {
                beforeWord = true;
                break;
            }
        }

        if (!beforeWord)
        {
            t = t1;
            continue;
        }

        // Replace the run [t .. t1] with one token covering the same character range.
        tt = new Pullenti.Ner.TextToken(mt[0], this, t.BeginChar, t1.EndChar);
        if (t == FirstToken)
        {
            FirstToken = tt;
        }
        else
        {
            tt.Previous = t.Previous;
        }

        tt.Next = t1.Next;
        t = tt;
    }
}
/// <summary>
/// Creates the analysis kit for a source text: tokenizes it, applies the optional corrections requested by
/// the source settings, and (unless only tokenizing) refines the morphology of unknown words.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. If <paramref name="sofa"/> is null the constructor returns immediately without initializing anything.
/// 2. The source is stored, StartDate is set to DateTime.Now, and MorphologyService.Process is run on sofa.Text.
/// 3. One TextToken is created per morph token and linked through Next; the first becomes FirstToken.
///    If sofa.CorrectionDict contains the token's term and the replacement parses to exactly one morph token,
///    the token is replaced by one built from the replacement over the same character range, with Term0 set to
///    the original term and Chars copied; it is recorded in CorrectedTokens together with its source text.
/// 4. Depending on the source flags: ClearDust, CorrectWordsByMerging, CorrectWordsByMorph. MergeLetters and
///    DefineBaseLanguage always run. If sofa.CreateNumberTokens is set, NumberHelper.TryParseNumber is tried
///    at each token and successful results are embedded.
/// 5. If <paramref name="onlyTokenizing"/> is true, the constructor returns here.
/// 6. For each non-preposition token whose dictionary class is undefined, that is Cyrillic and longer than
///    4 characters, a neighbour is looked up (previous first, then next, skipping one comma/and, preposition
///    or conjunction). The neighbour must be in the dictionary, share a morph class bit, be longer than
///    4 characters and have the same two characters of sofa.Text starting at EndChar - 1 (presumably the
///    last two characters of each word - assumes EndChar is inclusive, TODO confirm). If found,
///    t.Morph.RemoveItemsEx is called with the neighbour's morph and dictionary class.
/// 7. CreateStatistics is called.
/// NOTE(review): the empty statement "if (mt.BeginChar == 733860) { }" has no effect and looks like a
/// debugging leftover; the trailing "continue;" in the last loop is redundant.
/// </remarks>
/// <param name="sofa">Source of analysis; null yields an uninitialized kit.</param>
/// <param name="onlyTokenizing">When true, stops after tokenization and number embedding.</param>
/// <param name="lang">Language passed to the morphology service and word-correction steps.</param>
/// <param name="progress">Progress handler passed to the initial morphology run.</param>
public AnalysisKit(Pullenti.Ner.SourceOfAnalysis sofa = null, bool onlyTokenizing = false, Pullenti.Morph.MorphLang lang = null, ProgressChangedEventHandler progress = null) { if (sofa == null) { return; } m_Sofa = sofa; StartDate = DateTime.Now; List <Pullenti.Morph.MorphToken> tokens = Pullenti.Morph.MorphologyService.Process(sofa.Text, lang, progress); Pullenti.Ner.Token t0 = null; if (tokens != null) { for (int ii = 0; ii < tokens.Count; ii++) { Pullenti.Morph.MorphToken mt = tokens[ii]; if (mt.BeginChar == 733860) { } Pullenti.Ner.TextToken tt = new Pullenti.Ner.TextToken(mt, this); if (sofa.CorrectionDict != null) { string corw; if (sofa.CorrectionDict.TryGetValue(mt.Term, out corw)) { List <Pullenti.Morph.MorphToken> ccc = Pullenti.Morph.MorphologyService.Process(corw, lang, null); if (ccc != null && ccc.Count == 1) { Pullenti.Ner.TextToken tt1 = new Pullenti.Ner.TextToken(ccc[0], this, tt.BeginChar, tt.EndChar) { Term0 = tt.Term }; tt1.Chars = tt.Chars; tt = tt1; if (CorrectedTokens == null) { CorrectedTokens = new Dictionary <Pullenti.Ner.Token, string>(); } CorrectedTokens.Add(tt, tt.GetSourceText()); } } } if (t0 == null) { FirstToken = tt; } else { t0.Next = tt; } t0 = tt; } } if (sofa.ClearDust) { this.ClearDust(); } if (sofa.DoWordsMergingByMorph) { this.CorrectWordsByMerging(lang); } if (sofa.DoWordCorrectionByMorph) { this.CorrectWordsByMorph(lang); } this.MergeLetters(); this.DefineBaseLanguage(); if (sofa.CreateNumberTokens) { for (Pullenti.Ner.Token t = FirstToken; t != null; t = t.Next) { Pullenti.Ner.NumberToken nt = NumberHelper.TryParseNumber(t); if (nt == null) { continue; } this.EmbedToken(nt); t = nt; } } if (onlyTokenizing) { return; } for (Pullenti.Ner.Token t = FirstToken; t != null; t = t.Next) { if (t.Morph.Class.IsPreposition) { continue; } Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary(); if (mc.IsUndefined && t.Chars.IsCyrillicLetter && t.LengthChar > 4) { string tail = sofa.Text.Substring(t.EndChar - 1, 2); 
Pullenti.Ner.Token tte = null; Pullenti.Ner.Token tt = t.Previous; if (tt != null && ((tt.IsCommaAnd || tt.Morph.Class.IsPreposition || tt.Morph.Class.IsConjunction))) { tt = tt.Previous; } if ((tt != null && !tt.GetMorphClassInDictionary().IsUndefined&& ((tt.Morph.Class.Value & t.Morph.Class.Value)) != 0) && tt.LengthChar > 4) { string tail2 = sofa.Text.Substring(tt.EndChar - 1, 2); if (tail2 == tail) { tte = tt; } } if (tte == null) { tt = t.Next; if (tt != null && ((tt.IsCommaAnd || tt.Morph.Class.IsPreposition || tt.Morph.Class.IsConjunction))) { tt = tt.Next; } if ((tt != null && !tt.GetMorphClassInDictionary().IsUndefined&& ((tt.Morph.Class.Value & t.Morph.Class.Value)) != 0) && tt.LengthChar > 4) { string tail2 = sofa.Text.Substring(tt.EndChar - 1, 2); if (tail2 == tail) { tte = tt; } } } if (tte != null) { t.Morph.RemoveItemsEx(tte.Morph, tte.GetMorphClassInDictionary()); } } continue; } this.CreateStatistics(); }
/// <summary>
/// Builds a normalized name string from the tokens between <paramref name="begin"/> and
/// <paramref name="end"/> (inclusive), optionally forcing a morphological class, case and gender.
/// </summary>
/// <remarks>
/// Tokens are visited while their EndChar does not exceed end.EndChar; the loop also stops once the
/// result is longer than 1000 characters. Table control characters are skipped.
/// Text tokens: when a class, case or gender is requested, the NormalCase of a matching word form is used,
/// preferring a form whose NormalCase equals the token term; if a gender was requested and nothing matched,
/// the search is repeated without the gender filter. Otherwise the term is used, except that for tokens
/// with IsLastLower and more than 2 characters the source text cut after its last upper-case character is used.
/// Unless <paramref name="ignoreBracketsAndHiphens"/> is set, a text token followed by a hyphen and another
/// text token is accumulated as a prefix and joined to the next word with '-' (with a space when the flag is set).
/// With <paramref name="ignoreBracketsAndHiphens"/>: bracket tokens are skipped; for an opening round, angle
/// or square bracket a bracket sequence is parsed and, when it ends within the range, its inner name is
/// obtained recursively and appended as " open inner close" - unless the sequence ends exactly at the range
/// end and contains a single token that is neither a letter token nor a referent token. Hyphens with
/// whitespace on either side are skipped; a bracket or hyphen that is the end token stops the loop.
/// Number tokens are appended as their Value, or as the begin token's term for a single-token adjective
/// number spelled in words. Other meta tokens are expanded recursively with the same options; a referent
/// of type "GEO" that is not the first token is skipped when <paramref name="ignoreGeoReferent"/> is set.
/// A space is inserted between parts except directly after a '-' in the cases handled in the code.
/// </remarks>
/// <param name="begin">First token of the range.</param>
/// <param name="end">Last token of the range.</param>
/// <param name="cla">Required morphological class, or an empty value for no restriction.</param>
/// <param name="mc">Required case, or undefined for no restriction.</param>
/// <param name="gender">Required gender, or Undefined for no restriction.</param>
/// <param name="ignoreBracketsAndHiphens">Enables the bracket and hyphen handling described above.</param>
/// <param name="ignoreGeoReferent">Skips GEO referents other than the first token.</param>
/// <returns>
/// The assembled string, or null when either boundary is null, when begin ends after end starts
/// (and they are different tokens), or when nothing was collected.
/// </returns>
public static string GetNameEx(Pullenti.Ner.Token begin, Pullenti.Ner.Token end, Pullenti.Morph.MorphClass cla, Pullenti.Morph.MorphCase mc, Pullenti.Morph.MorphGender gender = Pullenti.Morph.MorphGender.Undefined, bool ignoreBracketsAndHiphens = false, bool ignoreGeoReferent = false) { if (end == null || begin == null) { return(null); } if (begin.EndChar > end.BeginChar && begin != end) { return(null); } StringBuilder res = new StringBuilder(); string prefix = null; for (Pullenti.Ner.Token t = begin; t != null && t.EndChar <= end.EndChar; t = t.Next) { if (res.Length > 1000) { break; } if (t.IsTableControlChar) { continue; } if (ignoreBracketsAndHiphens) { if (BracketHelper.IsBracket(t, false)) { if (t == end) { break; } if (t.IsCharOf("(<[")) { BracketSequenceToken br = BracketHelper.TryParse(t, BracketParseAttr.No, 100); if (br != null && br.EndChar <= end.EndChar) { string tmp = GetNameEx(br.BeginToken.Next, br.EndToken.Previous, Pullenti.Morph.MorphClass.Undefined, Pullenti.Morph.MorphCase.Undefined, Pullenti.Morph.MorphGender.Undefined, ignoreBracketsAndHiphens, false); if (tmp != null) { if ((br.EndChar == end.EndChar && br.BeginToken.Next == br.EndToken.Previous && !br.BeginToken.Next.Chars.IsLetter) && !(br.BeginToken.Next is Pullenti.Ner.ReferentToken)) { } else { res.AppendFormat(" {0}{1}{2}", t.GetSourceText(), tmp, br.EndToken.GetSourceText()); } } t = br.EndToken; } } continue; } if (t.IsHiphen) { if (t == end) { break; } else if (t.IsWhitespaceBefore || t.IsWhitespaceAfter) { continue; } } } Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken; if (tt != null) { if (!ignoreBracketsAndHiphens) { if ((tt.Next != null && tt.Next.IsHiphen && (tt.Next.Next is Pullenti.Ner.TextToken)) && tt != end && tt.Next != end) { if (prefix == null) { prefix = tt.Term; } else { prefix = string.Format("{0}-{1}", prefix, tt.Term); } t = tt.Next; if (t == end) { break; } else { continue; } } } string s = null; if (cla.Value != 0 || !mc.IsUndefined || gender != 
Pullenti.Morph.MorphGender.Undefined) { foreach (Pullenti.Morph.MorphBaseInfo wff in tt.Morph.Items) { Pullenti.Morph.MorphWordForm wf = wff as Pullenti.Morph.MorphWordForm; if (wf == null) { continue; } if (cla.Value != 0) { if (((wf.Class.Value & cla.Value)) == 0) { continue; } } if (!mc.IsUndefined) { if (((wf.Case & mc)).IsUndefined) { continue; } } if (gender != Pullenti.Morph.MorphGender.Undefined) { if (((wf.Gender & gender)) == Pullenti.Morph.MorphGender.Undefined) { continue; } } if (s == null || wf.NormalCase == tt.Term) { s = wf.NormalCase; } } if (s == null && gender != Pullenti.Morph.MorphGender.Undefined) { foreach (Pullenti.Morph.MorphBaseInfo wff in tt.Morph.Items) { Pullenti.Morph.MorphWordForm wf = wff as Pullenti.Morph.MorphWordForm; if (wf == null) { continue; } if (cla.Value != 0) { if (((wf.Class.Value & cla.Value)) == 0) { continue; } } if (!mc.IsUndefined) { if (((wf.Case & mc)).IsUndefined) { continue; } } if (s == null || wf.NormalCase == tt.Term) { s = wf.NormalCase; } } } } if (s == null) { s = tt.Term; if (tt.Chars.IsLastLower && tt.LengthChar > 2) { s = tt.GetSourceText(); for (int i = s.Length - 1; i >= 0; i--) { if (char.IsUpper(s[i])) { s = s.Substring(0, i + 1); break; } } } } if (prefix != null) { string delim = "-"; if (ignoreBracketsAndHiphens) { delim = " "; } s = string.Format("{0}{1}{2}", prefix, delim, s); } prefix = null; if (res.Length > 0 && s.Length > 0) { if (char.IsLetterOrDigit(s[0])) { char ch0 = res[res.Length - 1]; if (ch0 == '-') { } else { res.Append(' '); } } else if (!ignoreBracketsAndHiphens && BracketHelper.CanBeStartOfSequence(tt, false, false)) { res.Append(' '); } } res.Append(s); } else if (t is Pullenti.Ner.NumberToken) { if (res.Length > 0) { if (!t.IsWhitespaceBefore && res[res.Length - 1] == '-') { } else { res.Append(' '); } } Pullenti.Ner.NumberToken nt = t as Pullenti.Ner.NumberToken; if ((t.Morph.Class.IsAdjective && nt.Typ == Pullenti.Ner.NumberSpellingType.Words && nt.BeginToken == nt.EndToken) 
&& (nt.BeginToken is Pullenti.Ner.TextToken)) { res.Append((nt.BeginToken as Pullenti.Ner.TextToken).Term); } else { res.Append(nt.Value); } } else if (t is Pullenti.Ner.MetaToken) { if ((ignoreGeoReferent && t != begin && t.GetReferent() != null) && t.GetReferent().TypeName == "GEO") { continue; } string s = GetNameEx((t as Pullenti.Ner.MetaToken).BeginToken, (t as Pullenti.Ner.MetaToken).EndToken, cla, mc, gender, ignoreBracketsAndHiphens, ignoreGeoReferent); if (!string.IsNullOrEmpty(s)) { if (res.Length > 0) { if (!t.IsWhitespaceBefore && res[res.Length - 1] == '-') { } else { res.Append(' '); } } res.Append(s); } } if (t == end) { break; } } if (res.Length == 0) { return(null); } return(res.ToString()); }