/// <summary>
/// Tokenizes this sentence's <c>content</c>. Macro-token matches are collected through
/// <c>_setMacroTokensForSentence</c>, the standard splitter-based detection is added when requested,
/// and the content is then walked left to right: every allocated match becomes a token that is
/// appended to the output collection. When a match already carries a sub-sentence element, the
/// sub-sentence itself is appended and the freshly created token becomes its master token.
/// </summary>
/// <typeparam name="T">Concrete token type instantiated for every match.</typeparam>
/// <param name="resources">
/// Loose resource bag. The first <c>contentTokenCollection</c> found in it is used as the output and
/// the first <c>tokenDetectionFlag</c> as the detection flags; the whole bag is forwarded to
/// <c>_setMacroTokensForSentence</c>.
/// </param>
/// <returns>The output collection taken from <paramref name="resources"/>, filled with the detected tokens.</returns>
/// <exception cref="aceGeneralException">Thrown when any step of the tokenization loop fails; the original exception is attached.</exception>
public virtual contentTokenCollection _setTokensForSentence<T>(params object[] resources) where T : IContentToken, new()
{
    contentMatchCollection macroTokens = _setMacroTokensForSentence<T>(resources);

    // NOTE(review): nothing here guarantees that resources contains a contentTokenCollection;
    // if the lookup yields null the loop below fails on the first token - confirm with callers.
    contentTokenCollection output = resources.getFirstOfType<contentTokenCollection>();
    tokenDetectionFlag flags = resources.getFirstOfType<tokenDetectionFlag>();

    if (flags.HasFlag(tokenDetectionFlag.standardDetection))
    {
        macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.standardDetection);
    }

    string source = content;
    if (source == null)
    {
        // Nothing to tokenize.
        return output;
    }

    int i = 0;
    int mx = source.Length;
    int iterations = 0;

    // UTC is used so that a local clock shift (DST) cannot distort the elapsed time.
    DateTime processStart = DateTime.UtcNow;

    try
    {
        while (i < mx)
        {
            // Safety net: the loop must never need more iterations than there are characters.
            if (iterations > mx)
            {
                aceLog.log("Content sentence tokenization broken");
                break;
            }
            iterations++;

            oneOrMore<contentMatch> cms = macroTokens.allocated(i, 1);

            if (cms == null)
            {
                // No allocation information at all - stop scanning.
                i = mx;
                continue;
            }

            if (cms.isNothing)
            {
                i++;
                continue;
            }

            contentMatch cm = cms.First();
            if (cm == null)
            {
                i++;
                continue;
            }

            // Jump over the whole match so it is processed only once.
            i += cm.match.Length;

            IContentToken newToken = new T();
            string mch = cm.match.Value.Trim('#');
            newToken.sourceContent = mch;
            newToken.content = mch;

            Match sp = _select_tokenSplitter.Match(mch);
            if (sp.Success)
            {
                newToken.spliter = sp.Value;
                newToken.content = newToken.content.removeEndsWith(newToken.spliter).Trim();
            }
            else
            {
                newToken.spliter = "";
                newToken.content = mch.Trim();
            }

            // TotalMinutes is the whole elapsed time; Minutes is only the 0-59 component,
            // which would trip late and reset every hour.
            if ((DateTime.UtcNow - processStart).TotalMinutes > 2)
            {
                aceLog.log("TOKENIZATION TIME LIMIT BROKEN !!!");
                break;
            }

            // A match that already carries a sub-sentence is represented by that sub-sentence;
            // the new token is kept as its master token.
            IContentSubSentence sub = cm.element as IContentSubSentence;
            if (sub != null)
            {
                sub.masterToken = newToken;
                newToken = (IContentToken)cm.element;
            }

            if (cm.associatedKey != null)
            {
                tokenDetectionFlag fl = tokenDetectionFlag.none;
                Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                newToken.detectionFlags = fl;
            }

            // Never add duplicates, and never add the sentence to itself.
            if (!output.Contains(newToken) && newToken != this)
            {
                output.Add(newToken);
            }
        }
    }
    catch (Exception ex)
    {
        var isb = new StringBuilder();
        isb.AppendLine("loop error error");
        isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");

        // Attach the original exception so the root cause is not lost.
        throw new aceGeneralException(isb.ToString(), ex, this, "Loop");
    }

    return output;
}
/// <summary>
/// Builds tokens for the supplied sentence. Detection patterns are registered in three priority
/// passes (e-mail / phone / standards, then year / post-office numbers, then acronyms), optionally
/// followed by the standard splitter-based detection. The sentence content is then walked left to
/// right and every allocated match is turned into a token of type <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Concrete token type instantiated for every match.</typeparam>
/// <param name="sentence">Sentence whose <c>content</c> is tokenized.</param>
/// <param name="setTokenToSentence">
/// When true, each token is handed to a sentence through <c>setItem</c>: to the sub-sentence that
/// covers the match if <paramref name="subsentenceMatches"/> reports one, otherwise to
/// <paramref name="sentence"/>. When false, tokens are only collected in the result.
/// </param>
/// <param name="flag">Combination of detection flags selecting which patterns are registered.</param>
/// <param name="subsentenceMatches">Optional sub-sentence matches used to route tokens into sub-sentences.</param>
/// <returns>
/// Tokens that belong directly to <paramref name="sentence"/> (or all tokens when
/// <paramref name="setTokenToSentence"/> is false).
/// </returns>
private static List<IContentToken> setTokensForSentence<T>(IContentSentence sentence, bool setTokenToSentence, tokenDetectionFlag flag, contentMatchCollection subsentenceMatches = null) where T : IContentToken, new()
{
    contentMatchCollection macroTokens = new contentMatchCollection();
    List<IContentToken> output = new List<IContentToken>();

    var flags = flag.getEnumListFromFlags();
    macroTokens.scrambled = sentence.content;

    // Pass 1: the most specific patterns are registered first.
    foreach (tokenDetectionFlag fl in flags)
    {
        switch (fl)
        {
            case tokenDetectionFlag.emailAddress:
                macroTokens.Add(_select_emailAddress, fl);
                break;

            case tokenDetectionFlag.phonenumber:
                macroTokens.Add(_select_phoneNumber, fl);
                break;

            case tokenDetectionFlag.standard:
                macroTokens.Add(_select_standards, fl);
                break;
        }
    }

    // Pass 2: numeric patterns.
    foreach (tokenDetectionFlag fl in flags)
    {
        switch (fl)
        {
            case tokenDetectionFlag.yearNumber:
                macroTokens.Add(_select_yearNumber, fl);
                break;

            case tokenDetectionFlag.postOfficeNumber:
                // NOTE(review): this registers the phone-number pattern for post-office numbers;
                // looks like a copy/paste slip - confirm whether a dedicated pattern was intended.
                macroTokens.Add(_select_phoneNumber, fl);
                break;
        }
    }

    // Pass 3: acronyms.
    foreach (tokenDetectionFlag fl in flags)
    {
        switch (fl)
        {
            case tokenDetectionFlag.acronims:
                macroTokens.Add(_select_acronimIrregular, fl);
                macroTokens.Add(_select_acronimByLength, fl);
                break;
        }
    }

    if (flags.Contains(tokenDetectionFlag.standardDetection))
    {
        macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.none);
    }

    int i = 0;
    int mx = sentence.content.Length;

    while (i < mx)
    {
        // Remembered so that a failed iteration can still be forced to make progress.
        int start = i;

        try
        {
            oneOrMore<contentMatch> cms = macroTokens.allocated(i, 1);

            if (cms == null)
            {
                // No allocation information at all - stop scanning.
                i = mx;
                continue;
            }

            if (cms.isNothing)
            {
                i++;
                continue;
            }

            contentMatch cm = cms.First();
            if (cm == null)
            {
                i++;
                continue;
            }

            // Always advance by at least one character, otherwise a zero-length match would
            // keep the loop on the same position forever.
            i += Math.Max(cm.match.Length, 1);

            T newToken = new T();
            string mch = cm.match.Value.Trim('#');
            newToken.sourceContent = mch;
            newToken.content = mch;

            // NOTE(review): the instance method _setTokensForSentence uses _select_tokenSplitter
            // for this lookup; confirm that matching with _select_tokenWithSplitter is intended.
            Match sp = _select_tokenWithSplitter.Match(mch);
            if (sp.Success)
            {
                newToken.spliter = sp.Value;
                newToken.content = newToken.content.removeEndsWith(newToken.spliter);
            }
            else
            {
                newToken.spliter = "";
                newToken.content = mch;
            }

            if (setTokenToSentence)
            {
                IContentSentence target = sentence;

                if (subsentenceMatches != null && subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                {
                    oneOrMore<contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index, cm.match.Length);
                    contentMatch subcm = subcms.First();
                    if (subcm != null)
                    {
                        // Becomes null when the element is not a sub-sentence; handled below.
                        target = subcm.element as IContentSubSentence;
                    }
                }

                if (target != null)
                {
                    target.setItem(newToken);

                    // Only tokens owned directly by the sentence are reported to the caller.
                    if (target == sentence)
                    {
                        output.Add(newToken);
                    }
                }
                else
                {
                    logSystem.log(" -- -- -- _sentence is null ", logType.Notification);
                }
            }
            else
            {
                output.Add(newToken);
            }

            if (cm.associatedKey == null)
            {
                logSystem.log(" -- -- -- cm.associatedKey is null ", logType.Notification);
            }
            else
            {
                tokenDetectionFlag fl = tokenDetectionFlag.none;
                Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                newToken.detectionFlags = fl;
            }

            cm.element = newToken;
        }
        catch (Exception ex)
        {
            // Best effort: report the failure and keep tokenizing the rest of the sentence.
            var isb = new StringBuilder();
            isb.AppendLine("loop error error");
            isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
            isb.AppendLine("Exception: " + ex.Message);
            logSystem.log(isb.ToString(), logType.Notification);

            // If the failure happened before the position moved, step over the offending
            // character; otherwise the same exception would repeat forever.
            if (i <= start)
            {
                i = start + 1;
            }
        }
    }

    return output;
}
/// <summary>
/// Detects sub-sentences of the supplied sentence. The patterns selected by
/// <paramref name="_subflags"/> are registered in a match collection seeded with the sentence
/// content, and for every resulting match a sub-sentence of type <typeparamref name="T"/> is created,
/// parented to the sentence and stored as the match element.
/// </summary>
/// <typeparam name="T">Concrete sub-sentence type instantiated for every match.</typeparam>
/// <param name="sentence">Sentence that is searched and that becomes the parent of each sub-sentence.</param>
/// <param name="_subflags">Combination of flags selecting which sub-sentence patterns are applied.</param>
/// <returns>The match collection; each match has its <c>element</c> set to the created sub-sentence.</returns>
public static contentMatchCollection setSubSentences<T>(IContentSentence sentence, subsentenceDetectionFlag _subflags) where T : IContentSubSentence, new()
{
    contentMatchCollection matches = new contentMatchCollection();

    // The content as it was on entry; it is assigned back to the sentence before returning.
    // NOTE(review): this value is never modified here - confirm whether the collection's
    // scrambled text was meant to be written back instead.
    string snapshot = sentence.content;
    matches.scrambled = snapshot;

    var requested = _subflags.getEnumListFromFlags();

    // First pass: embraced, enumeration and quotation sub-sentences.
    // (Personal-name and city/post-number detection are currently disabled.)
    foreach (subsentenceDetectionFlag current in requested)
    {
        if (current == subsentenceDetectionFlag.enbracedSubSentences)
        {
            matches.Add(_select_enbracedSubSentence, current);
        }
        else if (current == subsentenceDetectionFlag.enumerationSubSentences)
        {
            matches.Add(_select_enumerationSubSentence, current);
        }
        else if (current == subsentenceDetectionFlag.quotationSubSentences)
        {
            matches.Add(_select_quotedSubSentence, current);
        }
    }

    // Second pass: punctuation-delimited inner sentences are registered after everything else.
    foreach (subsentenceDetectionFlag current in requested)
    {
        if (current == subsentenceDetectionFlag.punctationSubSentences)
        {
            matches.Add(_select_innerSentence, current);
        }
    }

    // Materialize one sub-sentence per match and attach it to that match.
    foreach (contentMatch found in matches.Values)
    {
        string text = found.match.Value;

        T created = new T
        {
            parent = sentence,
            sourceContent = text,
            content = text
        };

        found.element = created;
    }

    sentence.content = snapshot;

    return matches;
}