/// <summary>
/// Enumerates the blocks of a page. No per-block processing is implemented,
/// so the loop body is intentionally empty.
/// </summary>
/// <param name="content">Page whose <c>items</c> collection is enumerated; nothing is done when that collection is null.</param>
/// <param name="settings">Tokenizer settings (not used by the current implementation).</param>
/// <param name="language">Language definition (not used by the current implementation).</param>
internal static void blockAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    if (content.items != null)
    {
        foreach (IContentBlock currentBlock in content.items)
        {
            // Placeholder: block-level categorization is not implemented.
        }
    }
}
/// <summary>
/// Creates the loader and initializes its configuration from
/// <c>imbWEMManager.settings</c>.
/// </summary>
/// <param name="__dataLoad">Optional data-load object stored in <c>dataLoad</c>; may be null.</param>
public spiderWebLoader(performanceDataLoad __dataLoad = null)
{
    webclientSettings = imbWEMManager.settings.loaderComponent.webclientSettings;

    // Crawler stages enabled by default: link nodes, meta nodes and content-block saving.
    crawlerAgentFlag enabledStages = crawlerAgentFlag.detectAndProcessLinkNodes
                                     | crawlerAgentFlag.detectAndProcessMetaNodes
                                     | crawlerAgentFlag.runSaveContentBlock;
    crawlerFlags = enabledStages;

    tokenSettings = new nlpTokenizatorSettings();
    // Disabled: tokenizatorEngine = new htmlSmartTokenizator(tokenSettings);

    trSetup = imbWEMManager.settings.contentProcessor.textRetrieve;
    dataLoad = __dataLoad;
}
/// <summary>
/// Creates a syllable for the given text and classifies it.
/// </summary>
/// <remarks>
/// Classification, in order:
/// numeric when <c>tokenization.isNumericStart</c> matches;
/// otherwise, when <c>tokenization.isLetterStart</c> matches, regular if
/// <c>settings.syllabLengthLimit</c> is -1 or the text is not longer than the limit,
/// irregular if it is longer;
/// otherwise symbol for non-empty text; empty text stays unknown.
/// </remarks>
/// <param name="__content">Text of the syllable.</param>
/// <param name="__parent">Token stored as the parent of this syllable.</param>
/// <param name="settings">Supplies <c>vowelLastRegex</c> and <c>syllabLengthLimit</c>.</param>
public contentSyllable(string __content, IContentToken __parent, nlpTokenizatorSettings settings)
{
    content = __content;
    parent = __parent;
    // Disabled: sourceContent = __content;

    // The splitter is whatever settings.vowelLastRegex matches in the content.
    spliter = settings.vowelLastRegex.Match(content).Value;

    type = nlpSyllableType.unknown;

    if (tokenization.isNumericStart.IsMatch(content))
    {
        type = nlpSyllableType.numeric;
        return;
    }

    if (tokenization.isLetterStart.IsMatch(content))
    {
        // A limit of -1 disables the length check.
        bool withinLimit = settings.syllabLengthLimit == -1
                           || content.Length <= settings.syllabLengthLimit;
        type = withinLimit ? nlpSyllableType.regular : nlpSyllableType.irregular;
        return;
    }

    if (content.Length > 0)
    {
        type = nlpSyllableType.symbol;
    }
}
/// <summary>
/// Basic token analysis. Originally described as running phases 1 through 3 and
/// calling the nlpBase tools for each token; the whole implementation is currently
/// disabled, so this method does nothing.
/// </summary>
/// <param name="content">Page whose tokens were analysed by the disabled implementation.</param>
/// <param name="settings">Tokenizer settings (unused while the implementation is disabled).</param>
/// <param name="language">Language definition (unused while the implementation is disabled).</param>
internal static void tokenAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    // Disabled implementation, kept for reference:
    //
    // if (content.tokens == null)
    // {
    //     return;
    // }
    //
    // // Phase 1
    // if (settings.doTokenTypeDetection_basic)
    //     foreach (IContentToken tk in content.tokens)
    //     {
    //         tk.genericType = tokenCategorization.findGenericTypeBasic(tk);
    //     }
    //
    // // Phase 2
    // if (settings.doTokenTypeDetection_second)
    //     foreach (IContentToken tk in content.tokens)
    //     {
    //         // building the syllables
    //         // tk.syllablesDetection(settings);
    //
    //         tk.genericType = tokenCategorization.findGenericTypeSecond(tk, language);
    //     }
    //
    // // Phase 3
    // if (settings.doTokenTypeDetection_languageBasic)
    //     foreach (IContentToken tk in content.tokens)
    //     {
    //         deployTokenLanguageBasic(tk, language);
    //     }
    //
    // // Phase 4
    // if (settings.doTokenTypeDetection_languageAdvanced)
    //     foreach (IContentToken tk in content.tokens)
    //     {
    //         deployTokenLanguage(tk, language);
    //     }
}
/// <summary>
/// Runs generic type detection for sentences, paragraphs and blocks, each one only
/// when the matching flag in <paramref name="settings"/> is set. Token analysis and
/// the unfreezing of the content collections are currently disabled.
/// </summary>
/// <param name="tokenizedContent">Page to analyse; when null the method returns an empty string.</param>
/// <param name="settings">Switches for the individual detection steps; when null the method returns an empty string.</param>
/// <param name="language">Language definition passed on to the analysis steps.</param>
/// <returns>The text of an internal report builder; nothing is appended to it in the current code, so the result is an empty string.</returns>
public static string detectGenericTypes(IContentPage tokenizedContent, nlpTokenizatorSettings settings, basicLanguage language)
{
    if (settings == null || tokenizedContent == null)
    {
        return "";
    }

    StringBuilder report = new StringBuilder();

    // Disabled:
    // if (settings.doTokenTypeDetection_basic)
    // {
    //     tokenCategorization.tokenAnalysis(tokenizedContent, settings, language);
    // }

    if (settings.doSentenceDetection)
    {
        blokCategorization.sentenceAnalysis(tokenizedContent, settings, language);
    }

    if (settings.doParagraphDetection)
    {
        blokCategorization.paragraphAnalysis(tokenizedContent, settings, language);
    }

    if (settings.doBlockDetection)
    {
        blokCategorization.blockAnalysis(tokenizedContent, settings, language);
    }

    // Disabled:
    // tokenizedContent.tokens.unfreeze();
    // tokenizedContent.paragraphs.unfreeze();
    // tokenizedContent.sentences.unfreeze();
    // tokenizedContent.items.unfreeze();

    return report.ToString();
}
// Disabled helper, kept for reference:
//
// public static nlpSentenceBasicType getBasicType(nlpSentenceGenericType input)
// {
//     String name = input.ToString();
//     if (name.StartsWith("normal")) return nlpSentenceBasicType.normal;
//     if (name.StartsWith("open")) return nlpSentenceBasicType.open;
//     if (name.StartsWith("role")) return nlpSentenceBasicType.role;
//     return nlpSentenceBasicType.unknown;
// }

/// <summary>
/// Sentence categorization step. The implementation is disabled, so the method
/// returns immediately without touching <paramref name="content"/>.
/// </summary>
/// <param name="content">Page whose sentences were categorized by the disabled implementation.</param>
/// <param name="settings">Tokenizer settings (unused while the implementation is disabled).</param>
/// <param name="language">Language definition (unused while the implementation is disabled).</param>
internal static void sentenceAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    // Disabled implementation, kept for reference:
    //
    // foreach (IContentSentence sentence in content.sentences)
    // {
    //     if (sentence.genericType == nlpSentenceGenericType.unknown)
    //     {
    //         String spliter = sentence.spliter.Trim();
    //
    //         Boolean firstCaseOk = (sentence.items.First().letterCase == nlpTextCase.firstUpperRestLower);
    //
    //         var prevSentence = sentence.prev as IContentSentence;
    //         if (prevSentence == null) prevSentence = sentence;
    //
    //         if (prevSentence.genericType == nlpSentenceGenericType.list_startSentence)
    //         {
    //             sentence.genericType = nlpSentenceGenericType.list_item;
    //         }
    //         else
    //         {
    //             switch (spliter)
    //             {
    //                 case tokenization.sentenceEnd_arrowRight:
    //                 case tokenization.sentenceEnd_arrowLeft:
    //                     sentence.genericType = nlpSentenceGenericType.role_title;
    //                     break;
    //                 case tokenization.sentenceEnd_notFinished2:
    //                 case tokenization.sentenceEnd_notFinished:
    //                     if (firstCaseOk)
    //                     {
    //                         sentence.genericType = nlpSentenceGenericType.normal_unfinished;
    //                     }
    //                     break;
    //                 case tokenization.sentenceEnd_question:
    //                     if (firstCaseOk)
    //                     {
    //                         sentence.genericType = nlpSentenceGenericType.normal_question;
    //                     }
    //                     break;
    //                 case tokenization.sentenceEnd_normal:
    //                     if (firstCaseOk)
    //                     {
    //                         sentence.genericType = nlpSentenceGenericType.normal;
    //                     }
    //                     break;
    //                 case tokenization.sentenceEnd_listStart2:
    //                 case tokenization.sentenceEnd_listStart:
    //                     sentence.genericType = nlpSentenceGenericType.list_startSentence;
    //                     break;
    //                 case tokenization.sentenceEnd_listItemEnd_listEnd:
    //                 case tokenization.sentenceEnd_listItemEnd:
    //                     sentence.genericType = nlpSentenceGenericType.list_item;
    //                     break;
    //                 case tokenization.sentenceEnd_exclamation:
    //                     if (firstCaseOk)
    //                     {
    //                         sentence.genericType = nlpSentenceGenericType.normal_exclamation;
    //                     }
    //                     break;
    //                 default:
    //                     if (prevSentence.genericType == nlpSentenceGenericType.list_item)
    //                     {
    //                         sentence.genericType = nlpSentenceGenericType.list_item;
    //                     }
    //                     else
    //                     {
    //                         if (!String.IsNullOrEmpty(spliter))
    //                         {
    //                             content.note(devNoteType.nlp,
    //                                 "Unknown spliter for sentence: [" + spliter +
    //                                 "] - add support for it in> tokenization.cs constants and sentenceAnalysis()",
    //                                 "blokCategorization");
    //                         }
    //                     }
    //                     break;
    //             }
    //         }
    //
    //         // The fallback below was already disabled inside the draft itself:
    //         //
    //         // if (sentence.genericType == nlpSentenceGenericType.unknown)
    //         // {
    //         //     if (firstCaseOk)
    //         //     {
    //         //         sentence.genericType = nlpSentenceGenericType.normal_unknown;
    //         //     }
    //         //     else
    //         //     {
    //         //         if (sentence.items.All(x => x.letterCase == nlpTextCase.upperCase))
    //         //         {
    //         //             sentence.genericType = nlpSentenceGenericType.role_title;
    //         //         }
    //         //         else
    //         //         {
    //         //             sentence.genericType = nlpSentenceGenericType.role_simpleText;
    //         //         }
    //         //     }
    //         // }
    //     }
    // }
}
/// <summary>
/// Paragraph analysis step. Per the original note it must be run after sentence
/// categorization. The categorization logic itself is disabled; the method only
/// reads <c>content.paragraphs</c> and then returns.
/// </summary>
/// <param name="content">Page whose <c>paragraphs</c> collection is checked for null.</param>
/// <param name="settings">Tokenizer settings (unused while the implementation is disabled).</param>
/// <param name="language">Language definition (unused while the implementation is disabled).</param>
internal static void paragraphAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    if (content.paragraphs != null)
    {
        // Disabled implementation, kept for reference:
        //
        // foreach (IContentParagraph paragraph in content.paragraphs)
        // {
        //     //paragraph.items.unfreeze();
        //     var firstSentence = paragraph.items.First();
        //
        //     if (paragraph.items.Count == 1)
        //     {
        //         switch (firstSentence.genericType)
        //         {
        //             case nlpSentenceGenericType.normal:
        //             case nlpSentenceGenericType.normal_exclamation:
        //             case nlpSentenceGenericType.normal_question:
        //             case nlpSentenceGenericType.normal_unfinished:
        //             case nlpSentenceGenericType.normal_unknown:
        //                 paragraph.genericType = nlpParagraphGenericType.textual_single;
        //                 break;
        //             case nlpSentenceGenericType.role_title:
        //                 paragraph.genericType = nlpParagraphGenericType.textual_title;
        //                 break;
        //             case nlpSentenceGenericType.role_simpleText:
        //                 paragraph.genericType = nlpParagraphGenericType.data_single;
        //                 break;
        //             default:
        //                 paragraph.genericType = nlpParagraphGenericType.data_single;
        //                 break;
        //         }
        //     }
        //     else
        //     {
        //         //var stats = paragraph.items.getRankedStats(false);
        //
        //         //nlpSentenceBasicType first = stats.First().Key.convertToBasicEnum<nlpSentenceBasicType>();
        //
        //         nlpSentenceBasicType first = firstSentence.basicType;
        //
        //         switch (first)
        //         {
        //             case nlpSentenceBasicType.normal:
        //                 switch (firstSentence.genericType)
        //                 {
        //                     case nlpSentenceGenericType.role_simpleText:
        //                     case nlpSentenceGenericType.normal_unknown:
        //                     case nlpSentenceGenericType.role_title:
        //                         paragraph.genericType = nlpParagraphGenericType.textual_article;
        //                         break;
        //                     default:
        //                         paragraph.genericType = nlpParagraphGenericType.textual;
        //                         break;
        //                 }
        //                 break;
        //             case nlpSentenceBasicType.role:
        //                 paragraph.genericType = nlpParagraphGenericType.data_simple;
        //                 break;
        //             default:
        //             case nlpSentenceBasicType.unknown:
        //                 paragraph.genericType = nlpParagraphGenericType.unknown;
        //                 break;
        //             case nlpSentenceBasicType.list:
        //                 paragraph.genericType = nlpParagraphGenericType.data_listed;
        //                 break;
        //         }
        //     }
        // }
    }
}
/// <summary>
/// Sub-phase 2.a: syllable detection. Per the original note it is invoked by phase 2
/// and does not need to be called separately, and it "does not work well". The
/// implementation is disabled, so the method returns immediately.
/// </summary>
/// <param name="settings">Tokenizer settings (unused while the implementation is disabled).</param>
public void syllablesDetection(nlpTokenizatorSettings settings)
{
    // Disabled implementation, kept for reference:
    //
    // //token.i = new List<nlpSyllable>();
    // MatchCollection coll = null;
    //
    // switch (tokenBaseType)
    // {
    //     case nlpTokenBaseType.word:
    //         if (genericType == nlpTokenGenericType.wordAbrevation)
    //         {
    //             coll = tokenization.samoRec.Matches(sourceContent);
    //         }
    //         else
    //         {
    //             if (settings.vowelRegex.IsMatch(content))
    //             {
    //                 coll = settings.vowelRegex.Matches(content);
    //             }
    //         }
    //         break;
    //
    //     case nlpTokenBaseType.number:
    //         //if (genericType == nlpTokenGenericType.numberFormated)
    //         //{
    //         //    String[] npt = nlpTokenizator.numberFormatSymbols.Split(sourceContent);
    //         //    foreach (String smc in npt)
    //         //    {
    //         //        setItem(new nlpSyllable(smc, this, language));
    //         //    }
    //         //}
    //         coll = tokenization.numericSelect.Matches(sourceContent);
    //         break;
    //
    //     case nlpTokenBaseType.mixed:
    //         coll = tokenization.samoRec.Matches(sourceContent);
    //         //String[] prts = nlpTokenizator.selectLetterToOtherChanges.Split(sourceContent);
    //         //foreach (String smc in prts)
    //         //{
    //         //    setItem(new nlpSyllable(smc, this, language));
    //         //}
    //         break;
    //
    //     default:
    //         return;
    //         break;
    // }
    //
    // Int32 lastIndex = 0;
    // String start = "";
    // String ende = "";
    // contentSyllable last = null;
    // if (coll == null)
    // {
    // }
    // else
    // {
    //     foreach (Match mc in coll)
    //     {
    //         last = setItem(new contentSyllable(mc.Value, this, settings)) as contentSyllable;
    //
    //         if ((lastIndex == 0) && (mc.Index > 0))
    //         {
    //             start = content.Substring(0, mc.Index);
    //             setItem(new contentSyllable(start, this, settings));
    //         }
    //         lastIndex = mc.Index + mc.Length;
    //
    //         start = "";
    //     }
    //     if (last != null)
    //     {
    //         if (lastIndex < content.Length)
    //         {
    //             ende = content.Substring(lastIndex);
    //             setItem(new contentSyllable(ende, this, settings));
    //         }
    //     }
    //
    //     if (this.items.Count == 0)
    //     {
    //         last = setItem(new contentSyllable(content, this, settings)) as contentSyllable;
    //     }
    // }
    // //syllablesLine = rebuildSyllLine();
}
// XPath query cache; not initialized here (the former "= new xPathQueryCache()" initializer is disabled).
public xPathQueryCache _xpath_allNodesWithText;

/// <summary>
/// Initializes a new instance of the <see cref="htmlSmartTokenizator"/> class,
/// passing the settings straight to the base constructor.
/// </summary>
/// <param name="__settings">Tokenizer settings forwarded to the base class.</param>
public htmlSmartTokenizator(nlpTokenizatorSettings __settings)
    : base(__settings)
{
}