// Splits the token span [t0 .. t1] into sentences (a new sentence begins at
// any token for which MiscHelper.CanBeStartOfSentence is true), feeds each
// sentence to _processSentence, and adds the resulting block to the document
// only when at least one fragment was produced.
static void _processBlock(Pullenti.Semantic.SemDocument res, Pullenti.Ner.AnalysisResult ar, Pullenti.Ner.Token t0, Pullenti.Ner.Token t1)
{
    Pullenti.Semantic.SemBlock block = new Pullenti.Semantic.SemBlock(res);
    Pullenti.Ner.Token tok = t0;
    while (tok != null && tok.EndChar <= t1.EndChar)
    {
        // extend the sentence up to (but not including) the next sentence start
        Pullenti.Ner.Token last = tok;
        for (Pullenti.Ner.Token probe = tok.Next; probe != null && probe.EndChar <= t1.EndChar; probe = probe.Next)
        {
            if (Pullenti.Ner.Core.MiscHelper.CanBeStartOfSentence(probe))
            {
                break;
            }
            last = probe;
        }
        _processSentence(block, ar, tok, last);
        tok = last.Next;
    }
    if (block.Fragments.Count > 0)
    {
        res.Blocks.Add(block);
    }
}
// Copies the text container, the token chain head and the base language from
// a finished analysis result, then rebuilds the statistics over those tokens.
internal void InitFrom(Pullenti.Ner.AnalysisResult ar)
{
    m_Sofa = ar.Sofa;
    FirstToken = ar.FirstToken;
    BaseLanguage = ar.BaseLanguage;
    // statistics must be recomputed after the token chain is replaced
    CreateStatistics();
}
// Parses the token span [t0 .. t1] as one sentence: builds parse variants,
// scores each of them and adds the best-scoring variant to the block.
// Very long spans are truncated first, because variant parsing is expensive.
static void _processSentence(Pullenti.Semantic.SemBlock blk, Pullenti.Ner.AnalysisResult ar, Pullenti.Ner.Token t0, Pullenti.Ner.Token t1)
{
    // Fixed: the token limit was a magic number (70) repeated twice.
    const int maxTokens = 70;
    // count the tokens strictly before t1
    int cou = 0;
    for (Pullenti.Ner.Token t = t0; t != null && (t.EndChar < t1.EndChar); t = t.Next, cou++)
    {
    }
    if (cou > maxTokens)
    {
        // truncate: move t1 back to the token at index maxTokens
        int cou2 = 0;
        for (Pullenti.Ner.Token t = t0; t != null && (t.EndChar < t1.EndChar); t = t.Next, cou2++)
        {
            if (cou2 >= maxTokens)
            {
                t1 = t;
                break;
            }
        }
    }
    List<Sentence> sents = Sentence.ParseVariants(t0, t1, 0, 100, SentItemType.Undefined);
    if (sents == null)
    {
        return;
    }
    // pick the variant with the maximal coefficient
    // (removed a dead local 'alt' that recorded ties but was never read)
    double max = -1;
    Sentence best = null;
    foreach (Sentence s in sents)
    {
        // a trailing non-letter token (punctuation) closes the sentence text
        if ((t1 is Pullenti.Ner.TextToken) && !t1.Chars.IsLetter)
        {
            s.LastChar = t1 as Pullenti.Ner.TextToken;
        }
        s.CalcCoef(false);
        if (s.Coef > max)
        {
            max = s.Coef;
            best = s;
        }
    }
    if (best != null && best.ResBlock != null)
    {
        best.AddToBlock(blk, null);
    }
}
/// <summary>
/// Run semantic analysis on top of the morphology and NER results.
/// </summary>
/// <param name="ar">result produced by a Processor run</param>
/// <param name="pars">optional extra parameters; when null, defaults are used</param>
/// <returns>the semantic analysis result for the text</returns>
public static SemDocument Process(Pullenti.Ner.AnalysisResult ar, SemProcessParams pars = null)
{
    // Fixed: the doc comment used the non-standard tag <return>; XML doc
    // tooling only recognizes <returns>, so the tag was silently ignored.
    return Pullenti.Semantic.Internal.AnalyzeHelper.Process(ar, pars ?? new SemProcessParams());
}
// Demo entry point: initializes the Pullenti SDK, then demonstrates plain
// tokenization (noun-group extraction without NER), full NER processing,
// and keyword extraction on a sample Russian text.
public static void Main(string[] args)
{
    Stopwatch sw = Stopwatch.StartNew();
    // initialization — must be done once before processing any text
    Console.Write("Initializing SDK Pullenti ver {0} ({1}) ... ", Pullenti.Sdk.Version, Pullenti.Sdk.VersionDate);
    // initializes the engine and all available analyzers
    Pullenti.Sdk.InitializeAll();
    sw.Stop();
    Console.WriteLine("OK (by {0} ms), version {1}", (int)sw.ElapsedMilliseconds, Pullenti.Ner.ProcessorService.Version);
    // list the analyzers that are available
    foreach (Pullenti.Ner.Analyzer a in Pullenti.Ner.ProcessorService.Analyzers)
    {
        Console.WriteLine(" {0} {1} \"{2}\"", (a.IsSpecific ? "Specific analyzer" : "Common analyzer"), a.Name, a.Caption);
    }
    // the text to analyze
    string txt = "Система разрабатывается с 2011 года российским программистом Михаилом Жуковым, проживающим в Москве на Красной площади в доме номер один на втором этаже. Конкурентов у него много: Abbyy, Yandex, ООО \"Russian Context Optimizer\" (RCO) и другие компании. 
Он планирует продать SDK за 1.120.000.001,99 (миллиард сто двадцать миллионов один рубль 99 копеек) рублей, без НДС.";
    Console.WriteLine("Text: {0}", txt);
    // run processing on the empty processor (no NER analyzers attached)
    Pullenti.Ner.AnalysisResult are = Pullenti.Ner.ProcessorService.EmptyProcessor.Process(new Pullenti.Ner.SourceOfAnalysis(txt), null, null);
    Console.Write("Noun groups: ");
    // iterate over the tokens
    for (Pullenti.Ner.Token t = are.FirstToken; t != null; t = t.Next)
    {
        // try to extract a noun group starting at the current token
        Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
        // no noun group here
        if (npt == null)
        {
            continue;
        }
        // success — print the group in normalized (nominative singular) form
        Console.Write("[{0}=>{1}] ", npt.GetSourceText(), npt.GetNormalCaseText(null, Pullenti.Morph.MorphNumber.Singular, Pullenti.Morph.MorphGender.Undefined, false));
        // move the pointer to the last token of the noun group
        t = npt.EndToken;
    }
    using (Pullenti.Ner.Processor proc = Pullenti.Ner.ProcessorService.CreateProcessor())
    {
        // analyze the text with the full processor
        Pullenti.Ner.AnalysisResult ar = proc.Process(new Pullenti.Ner.SourceOfAnalysis(txt), null, null);
        // resulting entities
        Console.WriteLine("\r\n==========================================\r\nEntities: ");
        foreach (Pullenti.Ner.Referent e in ar.Entities)
        {
            Console.WriteLine("{0}: {1}", e.TypeName, e.ToString());
            foreach (Pullenti.Ner.Slot s in e.Slots)
            {
                Console.WriteLine(" {0}: {1}", s.TypeName, s.Value);
            }
        }
        // noun-group extraction example (on top of NER results this time)
        Console.WriteLine("\r\n==========================================\r\nNoun groups: ");
        for (Pullenti.Ner.Token t = ar.FirstToken; t != null; t = t.Next)
        {
            // skip tokens that are covered by recognized entities
            if (t.GetReferent() != null)
            {
                continue;
            }
            // try to build a noun group
            Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.AdjectiveCanBeLast, 0, null);
            // nothing parsed at this token
            if (npt == null)
            {
                continue;
            }
            Console.WriteLine(npt);
            // move the pointer to the last token of the group
            t = npt.EndToken;
        }
    }
    using (Pullenti.Ner.Processor proc = Pullenti.Ner.ProcessorService.CreateSpecificProcessor(Pullenti.Ner.Keyword.KeywordAnalyzer.ANALYZER_NAME))
    {
        Pullenti.Ner.AnalysisResult ar = proc.Process(new Pullenti.Ner.SourceOfAnalysis(txt), null, null);
        // keywords, variant 1: straight from the entity list
        Console.WriteLine("\r\n==========================================\r\nKeywords1: ");
        foreach (Pullenti.Ner.Referent e in ar.Entities)
        {
            if (e is Pullenti.Ner.Keyword.KeywordReferent)
            {
                Console.WriteLine(e);
            }
        }
        // keywords, variant 2: walk the token chain and normalize the source text
        Console.WriteLine("\r\n==========================================\r\nKeywords2: ");
        for (Pullenti.Ner.Token t = ar.FirstToken; t != null; t = t.Next)
        {
            if (t is Pullenti.Ner.ReferentToken)
            {
                Pullenti.Ner.Keyword.KeywordReferent kw = t.GetReferent() as Pullenti.Ner.Keyword.KeywordReferent;
                if (kw == null)
                {
                    continue;
                }
                // source text of the keyword token, first noun group normalized to nominative singular
                string kwstr = Pullenti.Ner.Core.MiscHelper.GetTextValueOfMetaToken(t as Pullenti.Ner.ReferentToken, Pullenti.Ner.Core.GetTextAttr.FirstNounGroupToNominativeSingle | Pullenti.Ner.Core.GetTextAttr.KeepRegister);
                Console.WriteLine("{0} = {1}", kwstr, kw);
            }
        }
    }
    Console.WriteLine("Over!");
}
// Loads the built-in organization ontology from the embedded Orgs_*.dat
// resources (Russian, English, Ukrainian) into the global collections.
// Safe to call repeatedly: a non-null GlobalOrgs short-circuits the load.
public static void Initialize()
{
    if (GlobalOrgs != null)
    {
        return; // already initialized
    }
    GlobalOrgs = new Pullenti.Ner.Core.IntOntologyCollection();
    Pullenti.Ner.Org.OrganizationReferent org;
    Pullenti.Ner.Core.IntOntologyItem oi;
    // a dedicated processor with only the geo analyzer resolves the textual
    // "geo" attributes of organizations into GeoReferent entities
    using (Pullenti.Ner.Processor geoProc = Pullenti.Ner.ProcessorService.CreateEmptyProcessor())
    {
        geoProc.AddAnalyzer(new Pullenti.Ner.Geo.GeoAnalyzer());
        // cache of already-resolved geo names to avoid re-processing the same text
        Dictionary<string, Pullenti.Ner.Geo.GeoReferent> geos = new Dictionary<string, Pullenti.Ner.Geo.GeoReferent>();
        for (int k = 0; k < 3; k++)
        {
            // k == 0: Russian, k == 1: English, k == 2: Ukrainian
            Pullenti.Morph.MorphLang lang = (k == 0 ? Pullenti.Morph.MorphLang.RU : (k == 1 ? Pullenti.Morph.MorphLang.EN : Pullenti.Morph.MorphLang.UA));
            string name = (k == 0 ? "Orgs_ru.dat" : (k == 1 ? "Orgs_en.dat" : "Orgs_ua.dat"));
            byte[] dat = ResourceHelper.GetBytes(name);
            if (dat == null)
            {
                // Fixed message typo: "Can't file resource file" -> "Can't find resource file"
                throw new Exception(string.Format("Can't find resource file {0} in Organization analyzer", name));
            }
            // each resource is a deflate-compressed XML document
            using (MemoryStream tmp = new MemoryStream(OrgItemTypeToken.Deflate(dat)))
            {
                tmp.Position = 0;
                XmlDocument xml = new XmlDocument();
                xml.Load(tmp);
                foreach (XmlNode x in xml.DocumentElement.ChildNodes)
                {
                    org = new Pullenti.Ner.Org.OrganizationReferent();
                    string abbr = null;
                    foreach (XmlNode xx in x.ChildNodes)
                    {
                        if (xx.LocalName == "typ")
                        {
                            org.AddSlot(Pullenti.Ner.Org.OrganizationReferent.ATTR_TYPE, xx.InnerText, false, 0);
                        }
                        else if (xx.LocalName == "nam")
                        {
                            org.AddSlot(Pullenti.Ner.Org.OrganizationReferent.ATTR_NAME, xx.InnerText, false, 0);
                        }
                        else if (xx.LocalName == "epo")
                        {
                            org.AddSlot(Pullenti.Ner.Org.OrganizationReferent.ATTR_EPONYM, xx.InnerText, false, 0);
                        }
                        else if (xx.LocalName == "prof")
                        {
                            org.AddSlot(Pullenti.Ner.Org.OrganizationReferent.ATTR_PROFILE, xx.InnerText, false, 0);
                        }
                        else if (xx.LocalName == "abbr")
                        {
                            abbr = xx.InnerText;
                        }
                        else if (xx.LocalName == "geo")
                        {
                            Pullenti.Ner.Geo.GeoReferent geo;
                            if (!geos.TryGetValue(xx.InnerText, out geo))
                            {
                                // resolve the geo name; cache only unambiguous single-entity results
                                // (ambiguous or failed resolutions are silently skipped — geo stays null;
                                // an empty else branch from the original was removed here)
                                Pullenti.Ner.AnalysisResult ar = geoProc.Process(new Pullenti.Ner.SourceOfAnalysis(xx.InnerText), null, lang);
                                if (ar != null && ar.Entities.Count == 1 && (ar.Entities[0] is Pullenti.Ner.Geo.GeoReferent))
                                {
                                    geo = ar.Entities[0] as Pullenti.Ner.Geo.GeoReferent;
                                    geos.Add(xx.InnerText, geo);
                                }
                            }
                            if (geo != null)
                            {
                                org.AddSlot(Pullenti.Ner.Org.OrganizationReferent.ATTR_GEO, geo, false, 0);
                            }
                        }
                    }
                    oi = org.CreateOntologyItemEx(2, true, true);
                    if (oi == null)
                    {
                        continue; // referent produced no usable ontology item
                    }
                    if (abbr != null)
                    {
                        oi.Termins.Add(new Pullenti.Ner.Core.Termin(abbr, null, true));
                    }
                    // NOTE(review): GlobalOrgsUa is not created in this method —
                    // presumably initialized elsewhere; confirm it is non-null before the UA pass
                    if (k == 2)
                    {
                        GlobalOrgsUa.AddItem(oi);
                    }
                    else
                    {
                        GlobalOrgs.AddItem(oi);
                    }
                }
            }
        }
    }
}
// Top-level semantic analysis driver: clears per-token scratch state, splits
// the token chain into blocks (a block ends before a newline token that can
// start a sentence), processes each block, then runs the optimizer.
// Progress is reported through pars.Progress when it is set; processing stops
// early once pars.MaxChar (if positive) is exceeded.
public static Pullenti.Semantic.SemDocument Process(Pullenti.Ner.AnalysisResult ar, Pullenti.Semantic.SemProcessParams pars)
{
    Pullenti.Semantic.SemDocument txt = new Pullenti.Semantic.SemDocument();
    // clear per-token scratch state left over from earlier passes
    for (Pullenti.Ner.Token t = ar.FirstToken; t != null; t = t.Next)
    {
        t.Tag = null;
    }
    if (pars.Progress != null)
    {
        pars.Progress(null, new ProgressChangedEventArgs(0, null));
    }
    int pers0 = 0;
    for (Pullenti.Ner.Token t = ar.FirstToken; t != null; t = t.Next)
    {
        if (pars.Progress != null)
        {
            // convert the character position to a percentage; the two branches
            // presumably avoid int overflow of (p * 100) on very long texts
            int p = t.BeginChar;
            if (ar.Sofa.Text.Length < 100000)
            {
                p = (p * 100) / ar.Sofa.Text.Length;
            }
            else
            {
                p /= ((ar.Sofa.Text.Length / 100));
            }
            // only fire the callback when the integer percentage changes
            if (p != pers0)
            {
                pers0 = p;
                pars.Progress(null, new ProgressChangedEventArgs(p, null));
            }
        }
        // extend the block up to the next newline token that can start a sentence
        Pullenti.Ner.Token t1 = t;
        for (Pullenti.Ner.Token tt = t.Next; tt != null; tt = tt.Next)
        {
            if (tt.IsNewlineBefore)
            {
                if (Pullenti.Ner.Core.MiscHelper.CanBeStartOfSentence(tt))
                {
                    break;
                }
            }
            t1 = tt;
        }
        try
        {
            _processBlock(txt, ar, t, t1);
        }
        catch (Exception)
        {
            // deliberate best-effort: a failure in one block must not abort the
            // whole document (fixed: the caught exception variable 'ex' was
            // declared but never used)
        }
        t = t1;
        if (pars.MaxChar > 0 && t.EndChar > pars.MaxChar)
        {
            break; // honor the caller-imposed processing limit
        }
    }
    OptimizerHelper.Optimize(txt, pars);
    if (pars.Progress != null)
    {
        pars.Progress(null, new ProgressChangedEventArgs(100, null));
    }
    return txt;
}