/// <summary>
/// Tokenizes <paramref name="content"/> with the word-and-punctuation selector, attaches every
/// created token to <paramref name="sentence"/> (when one is supplied) and returns all created tokens.
/// </summary>
/// <typeparam name="T">Concrete token type instantiated for every capture.</typeparam>
/// <param name="content">Text to split into tokens.</param>
/// <param name="sentence">
/// Sentence assigned as the parent of every token. May be null, in which case the tokens are
/// only collected in the returned list.
/// </param>
/// <returns>The tokens that were created, in the order they were matched.</returns>
internal List<T> setTokensFromContent<T>(string content, IContentSentence sentence) where T : IContentToken, new()
{
    List<T> tokens = new List<T>();
    MatchCollection matches = tokenization.tokenSelectForWordsAndPunctation.Matches(content);

    foreach (Match match in matches)
    {
        foreach (Capture capture in match.Captures)
        {
            string raw = capture.Value;

            // An empty or whitespace-only capture ends processing of the current match.
            if (string.IsNullOrWhiteSpace(raw))
            {
                break;
            }

            // The token keeps the untrimmed capture text as both content and source content.
            T token = new T();
            token.content = raw;
            token.parent = sentence;
            token.spliter = "";
            token.sourceContent = raw;

            if (sentence != null)
            {
                sentence.setItem(token);
            }

            tokens.Add(token);
        }
    }

    return tokens;
}
/// <summary>
/// Creates tokens for the supplied sentence by scanning its content against the selectors
/// enabled through <paramref name="flag"/>.
/// </summary>
/// <typeparam name="T">Concrete token type instantiated for every detected match.</typeparam>
/// <param name="sentence">Sentence whose <c>content</c> is tokenized.</param>
/// <param name="setTokenToSentence">
/// When true, each token is attached via <c>setItem</c> to the sentence, or to the sub-sentence
/// covering the token's span; when false, tokens are only collected in the result.
/// </param>
/// <param name="flag">Combination of detection flags deciding which selectors are registered.</param>
/// <param name="subsentenceMatches">
/// Optional sub-sentence matches. A token whose span is allocated in this collection is attached
/// to that sub-sentence instead of <paramref name="sentence"/>.
/// </param>
/// <returns>
/// When <paramref name="setTokenToSentence"/> is false: every created token. Otherwise: only the
/// tokens attached directly to <paramref name="sentence"/> (tokens placed into a sub-sentence are
/// not part of the result).
/// </returns>
private static List<IContentToken> setTokensForSentence<T>(IContentSentence sentence, bool setTokenToSentence, tokenDetectionFlag flag, contentMatchCollection subsentenceMatches = null) where T : IContentToken, new()
{
    contentMatchCollection macroTokens = new contentMatchCollection();
    List<IContentToken> output = new List<IContentToken>();

    var flags = flag.getEnumListFromFlags();
    macroTokens.scrambled = sentence.content;

    // Selectors are registered in three passes so that the groups keep a fixed relative order
    // no matter how the flags enumerate.
    // NOTE(review): presumably earlier registrations take precedence inside contentMatchCollection - confirm.
    foreach (tokenDetectionFlag fl in flags)
    {
        switch (fl)
        {
            case tokenDetectionFlag.emailAddress:
                macroTokens.Add(_select_emailAddress, fl);
                break;

            case tokenDetectionFlag.phonenumber:
                macroTokens.Add(_select_phoneNumber, fl);
                break;

            case tokenDetectionFlag.standard:
                macroTokens.Add(_select_standards, fl);
                break;
        }
    }

    foreach (tokenDetectionFlag fl in flags)
    {
        switch (fl)
        {
            case tokenDetectionFlag.yearNumber:
                macroTokens.Add(_select_yearNumber, fl);
                break;

            case tokenDetectionFlag.postOfficeNumber:
                // NOTE(review): this registers the phone-number selector for post office numbers,
                // which looks like a copy/paste slip - confirm whether a dedicated selector exists.
                macroTokens.Add(_select_phoneNumber, fl);
                break;
        }
    }

    foreach (tokenDetectionFlag fl in flags)
    {
        if (fl == tokenDetectionFlag.acronims)
        {
            macroTokens.Add(_select_acronimIrregular, fl);
            macroTokens.Add(_select_acronimByLength, fl);
        }
    }

    if (flags.Contains(tokenDetectionFlag.standardDetection))
    {
        macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.none);
    }

    Int32 i = 0;
    Int32 mx = sentence.content.Length;

    while (i < mx)
    {
        // Remembered so that a failing iteration can still be forced to move forward.
        Int32 iterationStart = i;

        try
        {
            oneOrMore<contentMatch> cms = macroTokens.allocated(i, 1);
            if (cms == null)
            {
                // No allocation information at all: stop scanning.
                i = mx;
                continue;
            }

            if (cms.isNothing)
            {
                i++;
                continue;
            }

            contentMatch cm = cms.First();
            if (cm == null)
            {
                i++;
                continue;
            }

            // Jump past the match; a zero-length match must still advance the cursor,
            // otherwise the loop would never terminate.
            i += Math.Max(cm.match.Length, 1);

            T newToken = new T();
            String mch = cm.match.Value.Trim('#');
            newToken.sourceContent = mch;
            newToken.content = mch;

            // Separate a trailing splitter from the token content when the splitter selector matches.
            Match sp = _select_tokenWithSplitter.Match(mch);
            if (sp.Success)
            {
                newToken.spliter = sp.Value;
                newToken.content = newToken.content.removeEndsWith(newToken.spliter);
            }
            else
            {
                newToken.spliter = "";
                newToken.content = mch;
            }

            if (setTokenToSentence)
            {
                IContentSentence _sentence = sentence;

                // Redirect the token to a sub-sentence when its span is allocated there.
                if (subsentenceMatches != null && subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                {
                    oneOrMore<contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index, cm.match.Length);
                    contentMatch subcm = subcms.First();
                    if (subcm != null)
                    {
                        // Becomes null when the matched element is not a sub-sentence.
                        _sentence = subcm.element as IContentSubSentence;
                    }
                }

                if (_sentence != null)
                {
                    _sentence.setItem(newToken);

                    // Only tokens owned directly by the sentence are reported back to the caller.
                    if (_sentence == sentence)
                    {
                        output.Add(newToken);
                    }
                }
                else
                {
                    logSystem.log(" -- -- -- _sentence is null ", logType.Notification);
                }
            }
            else
            {
                output.Add(newToken);
            }

            if (cm.associatedKey == null)
            {
                logSystem.log(" -- -- -- cm.associatedKey is null ", logType.Notification);
            }
            else
            {
                // When parsing fails, TryParse leaves the default enum value, which is stored as-is.
                tokenDetectionFlag detectedFlag;
                Enum.TryParse(cm.associatedKey.toStringSafe(), true, out detectedFlag);
                newToken.detectionFlags = detectedFlag;
            }

            cm.element = newToken;
        }
        catch (Exception ex)
        {
            // Tokenization stays best-effort, but the failure is reported instead of being dropped.
            logSystem.log(" -- -- -- setTokensForSentence loop error at i=" + i + " [mx=" + mx + "]: " + ex.Message, logType.Notification);

            // If the failure happened before the cursor moved, step over the offending position
            // so the same index is not retried forever.
            i = Math.Max(i, iterationStart + 1);
        }
    }

    return output;
}