/// <summary>
/// PHASE 2: spell-checks the token against the language, detects acronyms, and promotes
/// capitalized words to title / named-entity flags (the parent sub-sentence context decides
/// which). Works only if letter-case and syllable detection ran in an earlier phase.
/// </summary>
/// <param name="token">Token being classified; its <c>flags</c> and <c>genericType</c> are updated in place.</param>
/// <param name="language">Language model used for the spell check.</param>
/// <returns>The generic type finally assigned to the token.</returns>
private static nlpTokenGenericType findGenericTypeSecond(IContentToken token, basicLanguage language)
{
    nlpTokenGenericType output = token.genericType;

    // FIX: the original tested "token.flags == contentTokenFlag.languageWord" (exact equality),
    // which is true only when languageWord is the sole flag set. That made the caseAllUpper /
    // caseFirstUpper branches below unreachable, because they require additional flags to be
    // present at the same time. HasFlag matches the convention used by the rest of this file.
    if (token.flags.HasFlag(contentTokenFlag.languageWord))
    {
        if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
        {
            // Dictionary hit: this is a known word of the language.
            token.flags = token.flags.Add(contentTokenFlag.languageKnownWord);
            output = nlpTokenGenericType.knownWord;
        }
        else if (token.flags.getEnumListFromFlags().ContainsOneOrMore(contentTokenFlag.acronim, contentTokenFlag.acronimDiscovered, contentTokenFlag.acronimKnown))
        {
            // Any acronym-related flag wins over the case-based heuristics below.
            output = nlpTokenGenericType.wordAbrevation;
        }
        else if (token.flags.HasFlag(contentTokenFlag.caseAllUpper))
        {
            // ALL-CAPS unknown word: title or named entity, depending on the parent sub-sentence.
            contentToken pt = token.parent as contentToken;
            if (pt != null)
            {
                if (pt.flags.HasFlag(contentTokenFlag.subsentence_title))
                {
                    token.flags = token.flags.Add(contentTokenFlag.title);
                }
                else if (pt.flags.HasFlag(contentTokenFlag.subsentence_information))
                {
                    token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                }
            }
            else
            {
                // No parent context available: treat it as a stand-alone one-word title.
                token.flags = token.flags.Add(contentTokenFlag.titleOneWord);
            }
        }
        else if (token.flags.HasFlag(contentTokenFlag.caseFirstUpper))
        {
            contentToken pt = token.parent as contentToken;
            if (pt != null)
            {
                if (pt.flags.HasFlag(contentTokenFlag.subsentence_title))
                {
                    token.flags = token.flags.Add(contentTokenFlag.title);
                }
                else if (pt.flags.HasFlag(contentTokenFlag.subsentence_information))
                {
                    token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                }
                else if (!token.isFirst)
                {
                    // Capitalized in the middle of a sentence: likely a proper name.
                    token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                }
            }
        }
    }

    token.genericType = output;
    return output;
}
/// <summary>
/// Second-pass flagging: inherits detection hints from the parent sub-sentence, matches the
/// token against the page head tokens (domain / title), uses nearby acronyms to confirm a
/// named entity, then derives <c>baseType</c> from the accumulated flags and recurses into
/// the child items.
/// </summary>
/// <param name="resources">Optional processing resources, forwarded unchanged to child tokens.</param>
public override void secondaryFlaging(params object[] resources)
{
    IContentToken _sub = parent as IContentToken;
    if (_sub != null)
    {
        // FIX: Add() returns the combined flags value (extension method on a value-type enum,
        // which operates on a copy) — the original discarded the result, making both calls no-ops.
        if (_sub.detectionFlags.HasFlag(tokenDetectionFlag.cityAndPostnumberSubSentences))
        {
            flags = flags.Add(contentTokenFlag.cityName);
        }
        if (_sub.detectionFlags.HasFlag(tokenDetectionFlag.potentialPersonalNamesSubSentences))
        {
            flags = flags.Add(contentTokenFlag.personalNameOrLastname);
        }
    }

    if (page != null)
    {
        // Only candidate tokens (capitalized words or already-suspected entities)
        // are matched against the page head tokens.
        bool checkHeads = false;
        if (flags.ContainsAll(contentTokenFlag.caseAllUpper, contentTokenFlag.languageWord))
        {
            checkHeads = true;
        }
        else if (flags.ContainsAll(contentTokenFlag.caseFirstUpper, contentTokenFlag.languageWord, contentTokenFlag.languageUnknownWord))
        {
            checkHeads = true;
        }
        else if (flags.HasFlag(contentTokenFlag.namedEntity))
        {
            checkHeads = true;
        }

        if (checkHeads)
        {
            foreach (contentToken tkn in page.headTokens)
            {
                // NOTE(review): only the left side is lower-cased — this assumes head tokens
                // are already stored lower-case; confirm against the headTokens producer.
                if (content.ToLower() == tkn.content)
                {
                    if (tkn.origin == contentTokenOrigin.domain)
                    {
                        // Appears in the domain name: confirmed named entity.
                        flags = flags.Add(contentTokenFlag.namedEntity);
                        flags = flags.Add(contentTokenFlag.namedEntityDiscovered);
                    }
                    if (tkn.origin == contentTokenOrigin.title)
                    {
                        // Appears in the page title: entity if unknown to the language,
                        // otherwise just a title word.
                        if (flags.HasFlag(contentTokenFlag.languageUnknownWord))
                        {
                            flags = flags.Add(contentTokenFlag.namedEntity);
                        }
                        else
                        {
                            flags = flags.Add(contentTokenFlag.title);
                        }
                    }
                }
            }
        }
    }

    // An acronym within the next two tokens confirms an already-suspected named entity.
    if (ContainsOneOrMore(contentRelationType.manyNext, 2, contentTokenFlag.acronim, contentTokenFlag.acronimKnown))
    {
        if (flags.HasFlag(contentTokenFlag.namedEntity))
        {
            flags = flags.Add(contentTokenFlag.namedEntityDiscovered);
        }
    }

    // Derive the base type from the final flag set; later checks override earlier ones,
    // so a formatted number ends up as "mixed".
    baseType = nlpTokenBaseType.unknown;
    var fl = flags.getEnumListFromFlags();
    if (fl.ContainsOneOrMore(contentTokenFlag.languageWord, contentTokenFlag.title, contentTokenFlag.namedEntity, contentTokenFlag.languageKnownWord))
    {
        baseType = nlpTokenBaseType.word;
    }
    if (fl.ContainsOneOrMore(contentTokenFlag.number, contentTokenFlag.numberFormatted, contentTokenFlag.yearNumber, contentTokenFlag.zipCodeNumber, contentTokenFlag.internationalStandard))
    {
        baseType = nlpTokenBaseType.number;
    }
    if (fl.ContainsOneOrMore(contentTokenFlag.numberFormatted))
    {
        baseType = nlpTokenBaseType.mixed;
    }

    // Propagate the second pass to all child tokens.
    items.ForEach(x => x.secondaryFlaging(resources));
}
/// <summary>
/// Main method for processing the content of one sentence: first calls
/// <c>_setSubSentences</c> to detect sub-sentences, then <c>_setTokensForSentence</c>
/// for the sub-sentence tokens and for the tokens attached directly to this sentence.
/// </summary>
/// <typeparam name="T">Concrete token type to instantiate.</typeparam>
/// <typeparam name="TS">Concrete sub-sentence type to instantiate.</typeparam>
/// <param name="resources">May carry a <c>tokenDetectionFlag</c> and/or a <c>contentTokenCollection</c>; missing ones are defaulted.</param>
/// <returns>The collection of all tokens produced for this sentence.</returns>
public virtual contentTokenCollection setTokensFromContent<T, TS>(params object[] resources)
    where T : class, IContentToken, new()
    where TS : IContentSubSentence, new()
{
    IContentSentence sentence = this;

    // Pull optional inputs out of the resources bag; fall back to a fresh collection.
    tokenDetectionFlag detection_flags = resources.getFirstOfType<tokenDetectionFlag>();
    contentTokenCollection tokenCollections = resources.getFirstOfType<contentTokenCollection>();
    if (tokenCollections == null)
    {
        tokenCollections = new contentTokenCollection();
    }

    // Detect sub-sentence spans first, so direct tokenization can skip those regions.
    contentMatchCollection subsentenceMatches = _setSubSentences<TS>(detection_flags, null);

    try
    {
        // NOTE(review): subCount is never incremented — the increment is commented out below.
        int subCount = 0;

        // Tokenize each detected sub-sentence and attach its tokens to it.
        for (int dti = 0; dti < subsentenceMatches.Count; dti++)
        {
            contentMatch dt = subsentenceMatches[subsentenceMatches.Keys.imbGetItemAt(dti).ToString()];
            contentSubSentence ss = dt.element as contentSubSentence;
            contentTokenCollection subtkns = new contentTokenCollection();
            var cs = ss._setTokensForSentence<T>(subtkns, detection_flags);
            for (int ci = 0; ci < cs.Count; ci++)
            {
                ss.setItem(cs[ci]);
            }
            // subCount++;
        }

        // Tokenize the content that lies outside the sub-sentence matches.
        List<IContentToken> directTokens = new List<IContentToken>();
        directTokens = _setTokensForSentence<T>(subsentenceMatches, detection_flags, tokenCollections, directTokens);

        // Merge direct tokens into the output collection, unless the helper returned
        // the output collection itself (reference comparison guards against double-adding).
        if (directTokens != tokenCollections)
        {
            for (int dti = 0; dti < directTokens.Count; dti++)
            {
                IContentToken dt = directTokens[dti];
                T tkn = dt as T;
                if (tkn != null)
                {
                    tokenCollections.Add(tkn);
                }
            }
        }
    }
    catch (Exception ex)
    {
        // NOTE(review): the diagnostic message is built but never used — the devNote call
        // that consumed it is disabled; the exception is rethrown unchanged.
        var isb = new StringBuilder();
        isb.AppendLine("tokenDetection error");
        isb.AppendLine("Target is: " + sentence.toStringSafe());
        throw;
        // devNoteManager.note(sentence, ex, isb.ToString(), "tokenDetection", devNoteType.tokenization);
    }

    // Attach every produced token to this sentence.
    foreach (var tk in tokenCollections)
    {
        setItem(tk);
    }

    return (tokenCollections);
}
/// <summary>
/// PHASE 1: basic generic-type detection — runs a series of regex tests to decide whether the
/// token is a word, number (plain / ordinal / formatted), e-mail, mixed alphanumeric content,
/// or pure symbols. When the token turns out to be a word, its content is trimmed to the word
/// characters only and the removed remainder is stored as the token's spliter.
/// </summary>
/// <param name="source">Token whose <c>content</c> / <c>sourceContent</c> is examined; <c>content</c> and <c>spliter</c> may be rewritten.</param>
/// <returns>The detected generic token type.</returns>
private static nlpTokenGenericType findGenericTypeBasic(IContentToken source)
{
    // IsNullOrWhiteSpace subsumes IsNullOrEmpty — the original tested both separately.
    if (string.IsNullOrWhiteSpace(source.content))
    {
        return nlpTokenGenericType.empty;
    }

    nlpTokenGenericType output;

    if (tokenization.numericSelect.IsMatch(source.content))
    {
        // Contains digits: ordinal, formatted number, mixed alphanumeric, or plain number.
        if (tokenization.numberOrdinal.IsMatch(source.sourceContent))
        {
            output = nlpTokenGenericType.numberOrdinal;
        }
        else if (tokenization.numbersFormatedExpr.IsMatch(source.sourceContent))
        {
            output = nlpTokenGenericType.numberFormated;
        }
        else if (tokenization.lettersSelect.IsMatch(source.content))
        {
            output = nlpTokenGenericType.mixedAlfanumeric;
        }
        else
        {
            output = nlpTokenGenericType.number;
        }
    }
    else if (tokenization.lettersSelect.IsMatch(source.content))
    {
        // Contains letters but no digits.
        Match flw = tokenization.firstLetterWord.Match(source.content);
        if (flw.Success)
        {
            output = nlpTokenGenericType.unknownWord;
            // Cheap '@' pre-check before running the full e-mail regex.
            if (source.content.Contains('@') && tokenization.emailExpr.IsMatch(source.content))
            {
                output = nlpTokenGenericType.email;
            }
        }
        else if (tokenization.selectPunctation.IsMatch(source.content))
        {
            output = nlpTokenGenericType.mixedAlfasymbolic;
        }
        else
        {
            output = nlpTokenGenericType.unknownWord;
        }
    }
    else if (tokenization.selectPunctation.IsMatch(source.content))
    {
        // No digits, no letters: pure punctuation/symbols.
        output = nlpTokenGenericType.symbols;
    }
    else
    {
        output = nlpTokenGenericType.unknown;
    }

    if (genericToBaseType(output) == nlpTokenBaseType.word)
    {
        // Keep only the word characters; everything removed becomes the spliter.
        string clean = tokenization.samoRec.Match(source.content).Value;
        source.content = clean;

        // FIX: string.Replace throws ArgumentException when oldValue is empty, which
        // happened whenever the word-select regex produced no match. An empty match
        // removes nothing, so the spliter is the whole source content in that case.
        source.spliter = string.IsNullOrEmpty(clean)
            ? source.sourceContent
            : source.sourceContent.Replace(clean, "");
    }

    return output;
}