/// <summary>
/// Simple converter from a generic token type to the base token type it belongs to.
/// </summary>
/// <param name="input">Generic token type to convert.</param>
/// <returns>
/// <c>mixed</c> for e-mail and mixed alphanumeric / alphasymbolic tokens, <c>word</c> for the word-like
/// generic types, <c>number</c> for the numeric generic types, and <c>unknown</c> for every other value.
/// </returns>
public static nlpTokenBaseType genericToBaseType(nlpTokenGenericType input)
{
    nlpTokenBaseType baseType;

    switch (input)
    {
        case nlpTokenGenericType.email:
        case nlpTokenGenericType.mixedAlfanumeric:
        case nlpTokenGenericType.mixedAlfasymbolic:
            baseType = nlpTokenBaseType.mixed;
            break;

        case nlpTokenGenericType.unknownWord:
        case nlpTokenGenericType.knownWord:
        case nlpTokenGenericType.possibleName:
        case nlpTokenGenericType.possibleAcronim:
        case nlpTokenGenericType.wordAbrevation:
            baseType = nlpTokenBaseType.word;
            break;

        case nlpTokenGenericType.number:
        case nlpTokenGenericType.numberFormated:
        case nlpTokenGenericType.numberOrdinal:
            baseType = nlpTokenBaseType.number;
            break;

        default:
            // Anything not listed above has no dedicated base type.
            baseType = nlpTokenBaseType.unknown;
            break;
    }

    return baseType;
}
/// <summary>
/// PHASE 1: Basic detection of the generic token type. Runs a series of regex tests on the token to decide
/// what kind of token it is and whether it is a word. When it is a word, <c>content</c> is reduced to the
/// word itself and the remainder of <c>sourceContent</c> is stored in <c>spliter</c>.
/// </summary>
/// <param name="source">
/// Token to classify. For word tokens its <c>content</c> and <c>spliter</c> members are overwritten.
/// </param>
/// <returns>
/// The detected generic type; <c>empty</c> when <c>content</c> is null, empty or whitespace only.
/// </returns>
private static nlpTokenGenericType findGenericTypeBasic(IContentToken source)
{
    // IsNullOrWhiteSpace already covers the null and empty-string cases.
    if (string.IsNullOrWhiteSpace(source.content))
    {
        return nlpTokenGenericType.empty;
    }

    nlpTokenGenericType output;

    if (tokenization.numericSelect.IsMatch(source.content))
    {
        // Contains digits. The ordinal / formatted tests look at the raw source content.
        if (tokenization.numberOrdinal.IsMatch(source.sourceContent))
        {
            output = nlpTokenGenericType.numberOrdinal;
        }
        else if (tokenization.numbersFormatedExpr.IsMatch(source.sourceContent))
        {
            output = nlpTokenGenericType.numberFormated;
        }
        else if (tokenization.lettersSelect.IsMatch(source.content))
        {
            output = nlpTokenGenericType.mixedAlfanumeric;
        }
        else
        {
            output = nlpTokenGenericType.number;
        }
    }
    else if (tokenization.lettersSelect.IsMatch(source.content))
    {
        // Contains letters but no digits.
        if (tokenization.firstLetterWord.IsMatch(source.content))
        {
            output = nlpTokenGenericType.unknownWord;

            // Only run the e-mail expression when there is an '@' at all.
            if (source.content.Contains('@') && tokenization.emailExpr.IsMatch(source.content))
            {
                output = nlpTokenGenericType.email;
            }
        }
        else if (tokenization.selectPunctation.IsMatch(source.content))
        {
            output = nlpTokenGenericType.mixedAlfasymbolic;
        }
        else
        {
            output = nlpTokenGenericType.unknownWord;
        }
    }
    else if (tokenization.selectPunctation.IsMatch(source.content))
    {
        // Neither digits nor letters, but punctuation is present.
        output = nlpTokenGenericType.symbols;
    }
    else
    {
        output = nlpTokenGenericType.unknown;
    }

    if (genericToBaseType(output) == nlpTokenBaseType.word)
    {
        string clean = tokenization.samoRec.Match(source.content).Value;

        // A failed match yields an empty string, and string.Replace throws ArgumentException for an
        // empty search value. In that case the token is left untouched instead of being blanked.
        if (clean.Length > 0)
        {
            source.content = clean;
            source.spliter = source.sourceContent.Replace(clean, "");
        }
    }

    return output;
}
/*
 * /// <summary>
 * /// Basic language-based processing: decides whether the token is a known word or not.
 * /// </summary>
 * /// <param name="token"></param>
 * /// <param name="language"></param>
 * private static void deployTokenLanguageBasic(IContentToken token, basicLanguage language)
 * {
 *     switch (token.genericType)
 *     {
 *         case nlpTokenGenericType.unknownWord:
 *             if (language.isKnownWord(token.content))
 *             {
 *                 token.genericType = nlpTokenGenericType.knownWord;
 *             }
 *             break;
 *         case nlpTokenGenericType.number:
 *             break;
 *     }
 * }
 */

/*
 * /// <summary>
 * /// PHASE 3: Additional token processing based on language settings -- not implemented yet!!!
 * /// </summary>
 * /// <param name="token"></param>
 * /// <param name="language"></param>
 * private static void deployTokenLanguage(IContentToken token, basicLanguage language)
 * {
 *     switch (token.genericType)
 *     {
 *         case nlpTokenGenericType.unknownWord:
 *
 *             //if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
 *             //{
 *             //    token.genericType = nlpTokenGenericType.knownWord;
 *             //}
 *
 *             // token.wordVariations = languageTools.test<List<string>>(language, token.content, languageModelOperation.getVariations) as List<string>;
 *             // List<string> stems = languageTools.test<List<string>>(language, token.content, languageModelOperation.getStems) as List<string>;
 *             //token.wordRoot = imbStringOperations.longestCommonSubstring(token.wordVariations);
 *
 *             //token.wordRoot = stems[0];
 *             break;
 *         case nlpTokenGenericType.number:
 *             break;
 *     }
 * }
 */

/// <summary>
/// PHASE 2: Refines the generic type of a token flagged as a language word. A word that passes the spell
/// check becomes a known word; otherwise a word carrying an acronym flag becomes an abbreviation; otherwise
/// title / named-entity flags are added according to the token's letter-case flags and its parent's flags.
/// </summary>
/// <param name="token">Token to refine; its <c>flags</c> and <c>genericType</c> members are updated.</param>
/// <param name="language">Language whose spell check is used to recognise known words.</param>
/// <returns>The resulting generic type, which is also written back to <c>token.genericType</c>.</returns>
private static nlpTokenGenericType findGenericTypeSecond(IContentToken token, basicLanguage language)
{
    nlpTokenGenericType output = token.genericType;

    /*
     * if (token.tokenBaseType == nlpTokenBaseType.word)
     * {
     *     token.letterCase = nlpTextCase.unknown;
     *     if (tokenization.wordWithCapitalStart.IsMatch(token.content)) token.letterCase = nlpTextCase.firstUpperRestLower;
     *     if (token.letterCase == nlpTextCase.unknown) if (token.content.ToLower() == token.content) token.letterCase = nlpTextCase.lowerCase;
     *     if (token.letterCase == nlpTextCase.unknown) if (token.content.ToUpper() == token.content) token.letterCase = nlpTextCase.upperCase;
     *     if (token.letterCase == nlpTextCase.unknown) token.letterCase = nlpTextCase.mixedCase;
     * }
     */

    // The flags value is treated as a bit set everywhere below (HasFlag / Add), so membership is tested
    // with HasFlag. An exact equality test would make the acronym and letter-case branches unreachable.
    if (token.flags.HasFlag(contentTokenFlag.languageWord))
    {
        if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
        {
            token.flags = token.flags.Add(contentTokenFlag.languageKnownWord);
            output = nlpTokenGenericType.knownWord;
        }
        else if (token.flags.getEnumListFromFlags().ContainsOneOrMore(
                     contentTokenFlag.acronim,
                     contentTokenFlag.acronimDiscovered,
                     contentTokenFlag.acronimKnown))
        {
            output = nlpTokenGenericType.wordAbrevation;
        }
        else if (token.flags.HasFlag(contentTokenFlag.caseAllUpper))
        {
            contentToken parentToken = token.parent as contentToken;

            if (parentToken == null)
            {
                // All-upper word whose parent is not a contentToken.
                token.flags = token.flags.Add(contentTokenFlag.titleOneWord);
            }
            else if (parentToken.flags.HasFlag(contentTokenFlag.subsentence_title))
            {
                token.flags = token.flags.Add(contentTokenFlag.title);
            }
            else if (parentToken.flags.HasFlag(contentTokenFlag.subsentence_information))
            {
                token.flags = token.flags.Add(contentTokenFlag.namedEntity);
            }
        }
        else if (token.flags.HasFlag(contentTokenFlag.caseFirstUpper))
        {
            contentToken parentToken = token.parent as contentToken;

            if (parentToken != null)
            {
                if (parentToken.flags.HasFlag(contentTokenFlag.subsentence_title))
                {
                    token.flags = token.flags.Add(contentTokenFlag.title);
                }
                else if (parentToken.flags.HasFlag(contentTokenFlag.subsentence_information))
                {
                    token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                }
                else if (!token.isFirst)
                {
                    // Capitalised word that is not the first token (token.isFirst is false).
                    token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                }
            }
        }
    }

    token.genericType = output;
    return output;
}