/// <summary>
/// Initializes the language evaluation module with the two languages whose tokens are tested
/// against the Target by the Passive and Active rules.
/// </summary>
/// <param name="__parent">Parent spider evaluator hosting this module.</param>
/// <param name="__langA">First language used by the layer rules.</param>
/// <param name="__langB">Second language used by the layer rules.</param>
public languageModule(ISpiderEvaluatorBase __parent, basicLanguage __langA, basicLanguage __langB)
    : base("Language Module", "The Targets are distributed into layers by the Passive rules and Active rules testing the tokens of the Target.", __parent)
{
    languageA = __langA;
    languageB = __langB;
    // setup();
}
/// <summary>
/// Initializes the TF-IDF language-test rule: scores Target tokens against <paramref name="__language"/>
/// and assigns <paramref name="__layerID"/> (score &gt; 0) or <paramref name="__layerID2"/> (score &lt; 0).
/// </summary>
/// <param name="__language">Language whose dictionary the Target tokens are tested against.</param>
/// <param name="__layerID">Layer assigned when the weight score is positive.</param>
/// <param name="__parent">Parent spider evaluator.</param>
/// <param name="__layerID2">Layer assigned when the weight score is negative (-1 = none).</param>
public layerLanguageTFIDF_ARule(basicLanguage __language, int __layerID, ISpiderEvaluatorBase __parent, int __layerID2 = -1)
    : base("Language TF-IDF Test ({0})",
           "Tests Target tokens against the specified language [{0}], sets layerID [{1}] and calculates layer weight score as sum of matched Target token TF-IDFs minus sum of unmatched." +
           "If resulting weight score is more than 0 the layerID [{1}] is assigned, if it's less than 0 then the layer2ID [{2}] is assigned",
           __layerID, __parent, __layerID2)
{
    language = __language;
    // fill the {0}/{1}/{2} placeholders of the name/description templates passed to base()
    name = string.Format(name, language.languageEnglishName);
    description = string.Format(description, language.languageEnglishName, layerID, layer2ID);
}
/// <summary>
/// Returns a basic language object with loaded dictionary file, or null when the
/// language is not present in the registry.
/// </summary>
/// <param name="languageID">The language identifier.</param>
/// <returns>Registered <c>basicLanguage</c> with Hunspell checked, or null if not registered.</returns>
public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
{
    // single lookup instead of ContainsKey + indexer (avoids double dictionary access)
    if (basicLanguageRegistry.TryGetValue(languageID, out basicLanguage output))
    {
        output.checkHuspell();
        return output;
    }
    return null;
}
/// <summary>
/// MAIN TOKENIZATION COMMAND - builds sentences, sub-sentences and tokens for this paragraph.
/// Registers tokens and sentences into the IContentPage output if one is supplied in <paramref name="resources"/>.
/// </summary>
/// <typeparam name="TSentence">Concrete type used for sentences</typeparam>
/// <typeparam name="TSubSentence">Concrete type used for sub-sentences</typeparam>
/// <typeparam name="TToken">Concrete type used for tokens</typeparam>
/// <param name="resources">IContentPage for content registration; paragraphDetectionFlags; sentenceDetectionFlags; contentPreprocessFlags; tokenDetectionFlags</param>
public virtual void setParagraphFromContent<TSentence, TSubSentence, TToken>(params object[] resources)
    where TSentence : IContentSentence, new()
    where TSubSentence : IContentSubSentence, new()
    where TToken : class, IContentToken, new()
{
    // NOTE(review): `output` is extracted but never used below — page registration appears disabled
    IContentPage output = resources.getFirstOfType<IContentPage>();
    basicLanguage basicLanguages = resources.getFirstOfType<basicLanguage>();
    if (basicLanguages == null)
    {
        // fall back to a default language when none was supplied
        basicLanguages = new basicLanguage();
    }
    // pick each flag set out of the loosely-typed resources array
    paragraphDetectionFlag flags = resources.getFirstOfType<paragraphDetectionFlag>();
    sentenceDetectionFlag sentenceFlags = resources.getFirstOfType<sentenceDetectionFlag>();
    contentPreprocessFlag preprocessFlags = resources.getFirstOfType<contentPreprocessFlag>();
    tokenDetectionFlag tokenFlags = resources.getFirstOfType<tokenDetectionFlag>();

    // detect sentences for this paragraph first, then tokenize each sentence
    contentSentenceCollection snt = _setSentencesFromContent<TSentence>(sentenceFlags, preprocessFlags);
    foreach (TSentence sn in snt)
    {
        // tokenization has side effects on `sn`; the returned collection itself is not used here
        var tkns = sn.setTokensFromContent<TToken, TSubSentence>(flags, sentenceFlags, preprocessFlags, tokenFlags, resources, basicLanguages);
        if (flags.HasFlag(paragraphDetectionFlag.dropSentenceWithNoToken))
        {
            // skip sentences that produced no tokens at all
            if (sn.items.Count == 0)
            {
                continue;
            }
        }
        if (sentenceFlags.HasFlag(sentenceDetectionFlag.setSentenceToParagraph))
        {
            // attach the sentence to this paragraph
            setItem(sn);
        }
    }
}
/// <summary>
/// Builds a basic language object from the static language data set and loads its
/// Hunspell dictionary files.
/// </summary>
/// <param name="languageID">The language identifier.</param>
/// <returns>Configured <c>basicLanguage</c> with Hunspell engine initialized.</returns>
public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
{
    // hoist the per-language parameter set — the original performed six separate
    // languageDataSet[languageID] lookups for the same entry
    var parameters = languageDataSet[languageID];

    basicLanguage language = new basicLanguage();
    language.affixFilePath = parameters[basicLanguageParameterEnum.affixPath];
    language.dictFilePath = parameters[basicLanguageParameterEnum.dictPath];
    language.languageNativeName = parameters[basicLanguageParameterEnum.nativeName];
    language.languageEnglishName = parameters[basicLanguageParameterEnum.englishName];
    language.iso2Code = parameters[basicLanguageParameterEnum.iso2code];
    // needles are stored as a comma-separated list
    language.langIDNeedles.AddRange(parameters[basicLanguageParameterEnum.needles].Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries));
    // force (re)initialization of the Hunspell engine
    language.checkHuspell(true);
    return language;
}
/// <summary>
/// Runs the enabled generic-type detection passes (sentence, paragraph, block) over an
/// already tokenized page.
/// </summary>
/// <param name="tokenizedContent">Tokenized page to analyze; null aborts.</param>
/// <param name="settings">Settings selecting which passes run; null aborts.</param>
/// <param name="language">Language used by the analysis passes.</param>
/// <returns>Report text (currently always empty).</returns>
public static string detectGenericTypes(IContentPage tokenizedContent, nlpTokenizatorSettings settings, basicLanguage language)
{
    // guard clauses: nothing to do without settings or content
    if (settings == null)
    {
        return "";
    }
    if (tokenizedContent == null)
    {
        return "";
    }

    StringBuilder report = new StringBuilder();

    if (settings.doSentenceDetection)
    {
        blokCategorization.sentenceAnalysis(tokenizedContent, settings, language);
    }

    if (settings.doParagraphDetection)
    {
        blokCategorization.paragraphAnalysis(tokenizedContent, settings, language);
    }

    if (settings.doBlockDetection)
    {
        blokCategorization.blockAnalysis(tokenizedContent, settings, language);
    }

    // the report builder is never written to, so the result is always empty
    return report.ToString();
}
/// <summary>
/// 2013C: This must be called for this library to be referenced successfully!
/// Should be invoked by manager.onApplicationReady().
/// Loads the basic language definitions (Hunspell dictionary list) into the registry.
/// </summary>
public static void Prepare()
{
    aceLog.consoleControl.setAsOutput(log, "lang_mng");
    if (imbNLPDataConfig.settings.DoLoadBasicLanguageDefinitions)
    {
        // locate the language definition spreadsheet anywhere under the resources folder
        String hunListPath = appManager.Application.folder_resources.findFile(imbNLPDataConfig.settings.BasicLanguageDefinitionsList, SearchOption.AllDirectories);
        DataTable dt = hunListPath.deserializeDataTable(imbSCI.Data.enums.reporting.dataTableExportEnum.excel);
        // one row per language definition; deployed in parallel
        // NOTE(review): basicLanguageRegistry is written from multiple threads here —
        // confirm the registry type is thread-safe (e.g. ConcurrentDictionary)
        Parallel.ForEach<DataRow>(dt.Rows.ToList(), (rw) =>
        {
            basicLanguage bl = new basicLanguage();
            bl.deploy(rw);
            if (bl.languageEnum != basicLanguageEnum.unknown)
            {
                basicLanguageRegistry[bl.languageEnum] = bl;
                log.log("Hunspell dictionary entry for [" + bl.languageEnum + "] found");
            }
            else
            {
                log.log("Hunspell dictionary entry failed [" + bl.languageEnum + "] found");
            }
        });
    }
    // (sequential row-loading variant and the other language-manager prepare() calls
    //  were removed/disabled in this build)
    aceLog.consoleControl.removeFromOutput(log);
}
/// <summary>
/// Tokenizes String/PlainText content into a structured content page
/// (blocks, paragraphs, sentences, tokens) and runs the semantic flagging passes.
/// </summary>
/// <param name="resources">Recommended resources: String content, basicLanguage language, node page</param>
/// <returns>Tokenized content page; on failure an incompletely populated page is returned.</returns>
public IContentPage tokenizeContent(params object[] resources)
{
    string content = resources.getFirstOfType<string>();
    basicLanguage language = resources.getFirstOfType<basicLanguage>();
    node page = resources.getFirstOfType<node>();

    contentPage output = new contentPage();
    output.acceptSourcePage(page);

    try
    {
        // preprocess: collapse repeated newlines before splitting into blocks
        string source = compressNewLines(content);
        output.sourceContent = source;
        output.content = source;

        // blocks are separated by an empty line (two consecutive newlines)
        string[] blocks = source.Split(new string[] { Environment.NewLine + Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
        if (blocks.Length == 0)
        {
            // FIX: the original wrote into blocks[0] of a zero-length array, which
            // throws IndexOutOfRangeException; treat the whole source as one block instead
            blocks = new string[] { source };
        }

        foreach (string bl in blocks)
        {
            string blc = bl.StripHTML();
            // escape remaining markup-significant characters
            blc = SecurityElement.Escape(blc);
            contentBlock tmpBlock = new contentBlock();
            tmpBlock.sourceContent = blc;
            tmpBlock.content = blc;
            output.items.Add(tmpBlock);
        }

        foreach (IContentBlock bl in output.items)
        {
            // paragraphs within a block are separated by single newlines
            string[] paragraphs = bl.sourceContent.Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string par in paragraphs)
            {
                if (string.IsNullOrEmpty(par))
                {
                    continue;
                }
                contentParagraph po = new contentParagraph(par, output);
                po.setParagraphFromContent<contentSentence, contentSubSentence, contentToken>(
                    output,
                    paragraphDetectionFlag.dropSentenceWithNoToken,
                    sentenceDetectionFlag.setSentenceToParagraph,
                    sentenceDetectionFlag.preprocessParagraphContent,
                    tokenDetectionFlag.standardDetection,
                    contentPreprocessFlag.standard);
                if (po.items.Any())
                {
                    // register paragraph, its sentences and their tokens on the page
                    output.paragraphs.Add(po);
                    foreach (IContentSentence sn in po.items)
                    {
                        output.sentences.Add(sn);
                        foreach (IContentToken tk in sn.items)
                        {
                            output.tokens.Add(tk);
                        }
                    }
                    bl.setItem(po);
                }
            }
        }

        // semantic flagging passes over the assembled page
        output.primaryFlaging(resources);
        output.secondaryFlaging(resources);
        output.generalSemanticsFlaging(resources);
        output.specialSematicsFlaging(resources);
    }
    catch (Exception ex)
    {
        // best-effort: tokenization failure yields a partial page; dev-note reporting is disabled
        var isb = new StringBuilder();
        isb.AppendLine("plainTextTokenizator error");
        isb.AppendLine("Language: " + language.toStringSafe());
        // devNoteManager.note(this, ex, isb.ToString(), "plainTextTokenizator", devNoteType.tokenization);
    }
    return output;
}
/// <summary>
/// Minimal overload: returns an empty content page without performing any tokenization.
/// </summary>
/// <param name="content">Plain-text content (currently unused).</param>
/// <param name="language">Language definition (currently unused).</param>
/// <returns>A freshly created, empty content page.</returns>
public IContentPage tokenizeContent(string content, basicLanguage language)
{
    contentPage emptyPage = new contentPage();
    return emptyPage;
}
// (disabled earlier-phase helpers deployTokenLanguageBasic / deployTokenLanguage
//  were commented out here; PHASE 3/4 language deployment is not implemented)

/// <summary>
/// PHASE 2: adjusts letter case handling, checks the word against the language dictionary,
/// and checks whether the token may be an acronym - works only when syllables were detected.
/// </summary>
/// <param name="token">Token whose flags and generic type are updated in place.</param>
/// <param name="language">Language used for the spell check.</param>
/// <returns>The (possibly updated) generic type, also written back to <c>token.genericType</c>.</returns>
private static nlpTokenGenericType findGenericTypeSecond(IContentToken token, basicLanguage language)
{
    nlpTokenGenericType output = token.genericType;
    object testOut;  // NOTE(review): unused local, kept from an earlier implementation

    // only tokens flagged EXACTLY as languageWord (no other flag bits) are processed here
    if (token.flags == contentTokenFlag.languageWord)
    {
        if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
        {
            // dictionary hit: mark as a known word of the language
            token.flags = token.flags.Add(contentTokenFlag.languageKnownWord);
            output = nlpTokenGenericType.knownWord;
        }
        else
        {
            if (token.flags.getEnumListFromFlags().ContainsOneOrMore(contentTokenFlag.acronim, contentTokenFlag.acronimDiscovered, contentTokenFlag.acronimKnown))
            {
                // any acronym flag wins over case-based heuristics
                output = nlpTokenGenericType.wordAbrevation;
            }
            else
            {
                if (token.flags.HasFlag(contentTokenFlag.caseAllUpper))
                {
                    // ALL-CAPS unknown word: classify by the parent sub-sentence role
                    contentToken pt = token.parent as contentToken;
                    if (pt != null)
                    {
                        if (pt.flags.HasFlag(contentTokenFlag.subsentence_title))
                        {
                            token.flags = token.flags.Add(contentTokenFlag.title);
                        }
                        else if (pt.flags.HasFlag(contentTokenFlag.subsentence_information))
                        {
                            token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                        }
                    }
                    else
                    {
                        // no parent context: a standalone ALL-CAPS word is a one-word title
                        token.flags = token.flags.Add(contentTokenFlag.titleOneWord);
                    }
                }
                else if (token.flags.HasFlag(contentTokenFlag.caseFirstUpper))
                {
                    // Capitalized unknown word: title/named-entity heuristics
                    contentToken pt = token.parent as contentToken;
                    if (pt != null)
                    {
                        if (pt.flags.HasFlag(contentTokenFlag.subsentence_title))
                        {
                            token.flags = token.flags.Add(contentTokenFlag.title);
                        }
                        else if (pt.flags.HasFlag(contentTokenFlag.subsentence_information))
                        {
                            token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                        }
                        else
                        {
                            // capitalized mid-sentence (not first token) suggests a named entity
                            if (!token.isFirst)
                            {
                                token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                            }
                        }
                    }
                }
            }
        }
    }
    token.genericType = output;
    return (output);
}
/// <summary>
/// BASIC TOKEN ANALYSIS: was intended to run phases 1 through 4 (generic type detection,
/// spell check, language deployment) over every token of the page.
/// All phases are currently disabled, so this method is a deliberate no-op.
/// </summary>
/// <param name="content">Tokenized page (unused while disabled).</param>
/// <param name="settings">Settings selecting phases (unused while disabled).</param>
/// <param name="language">Language for the checks (unused while disabled).</param>
internal static void tokenAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    // intentionally empty: phases 1-4 (findGenericTypeBasic, findGenericTypeSecond,
    // deployTokenLanguageBasic, deployTokenLanguage) are disabled in this build
}
/// <summary>
/// Initializes a new instance of the <see cref="ruleHasLanguageName"/> class.
/// </summary>
/// <param name="__parent">The parent evaluator.</param>
/// <param name="__language">Language whose names/iso code are searched for in url or caption.</param>
public ruleHasLanguageName(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
    : base("Language id/name in url/caption", "If language name (native or english) or iso-2-code found in url or caption", 5, 0, __parent)
{
    language = __language;
}
/// <summary>
/// 2013C: This must be called for this library to be referenced successfully!
/// Should be invoked by manager.onApplicationReady().
/// Lazily builds the Serbian and English Hunspell language definitions, prepares the
/// language manager singletons and smoke-tests both spell checkers.
/// </summary>
public static void Prepare()
{
    aceLog.consoleControl.setAsOutput(log, "lang_mng");

    if (_serbian == null)
    {
        _serbian = new extendedLanguage();
        // both Hunspell files are mandatory — fail fast when missing
        if (!File.Exists(PATH_hunspell_aff))
        {
            throw new aceGeneralException("Hunspell AFF file missing!");
        }
        if (!File.Exists(PATH_hunspell_dict))
        {
            throw new aceGeneralException("Hunspell DICT file missing!");
        }
        basicLanguage language = new basicLanguage();
        language.affixFilePath = PATH_hunspell_aff;
        language.dictFilePath = PATH_hunspell_dict;
        language.languageNativeName = "Srpski";
        language.languageEnglishName = "Serbian";
        language.iso2Code = "sr";
        _serbian.basic = language;
        if (language.checkHuspell(true))
        {
            log.log("Hunspell language module: " + language.languageEnglishName + " ready");
        }
        else
        {
            aceGeneralException axe = new aceGeneralException("Serbian language Hunspell module failed", null, english, "Serbian Hunspell failed");
            throw axe;
        }
        // NOTE(review): duplicate assignment — _serbian.basic was already set above
        _serbian.basic = language;
        _serbian.loadAlfabet("extended\\alfabet.txt");
    }

    if (_english == null)
    {
        _english = new extendedLanguage { basic = new basicLanguage("en") };
        _english.basic.affixFilePath = PATH_hunspell_aff_en;
        _english.basic.dictFilePath = PATH_hunspell_dict_en;
        _english.basic.languageNativeName = "Engleski";
        _english.basic.languageEnglishName = "English";
        _english.basic.iso2Code = "en";
        if (_english.basic.checkHuspell(true))
        {
            log.log("Hunspell language module: " + _english.basic.languageEnglishName + " ready");
        }
        else
        {
            aceGeneralException axe = new aceGeneralException("English language Hunspell module failed", null, english, "English Hunspell failed");
            throw axe;
        }
    }

    // prepare the active language manager singletons (others are disabled in this build)
    languageManagerApertium.manager.prepare();
    languageManagerAlphabet.manager.prepare();
    languageManagerHunspell.manager.prepare();
    languageManagerUnitex.manager.prepare();
    languageManagerWordnet.manager.prepare();
    languageManagerDBNamedEntities.manager.prepare();
    semanticLexicon.semanticLexiconManager.manager.prepare();

    // smoke-test both spell check engines with a known word
    if (serbian.basic.hunspellEngine.Spell("Proba"))
    {
        log.log("Hunspell language module: " + serbian.basic.languageEnglishName + " Spell check working");
    }
    if (english.basic.hunspellEngine.Spell("Test"))
    {
        log.log("Hunspell language module: " + english.basic.languageEnglishName + " Spell check working");
    }

    aceLog.consoleControl.removeFromOutput(log);
}
/// <summary>
/// Sentence-level categorization pass: was intended to classify each sentence's
/// genericType from its spliter character and letter casing.
/// The classification logic is currently disabled, so this method returns immediately.
/// </summary>
/// <param name="content">Tokenized page (unused while disabled).</param>
/// <param name="settings">Tokenizator settings (unused while disabled).</param>
/// <param name="language">Language definition (unused while disabled).</param>
internal static void sentenceAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    // intentionally no-op: spliter-based genericType classification is disabled in this build
    return;
}
/// <summary>
/// Block-level categorization pass. Currently a placeholder: it only validates input
/// and enumerates the blocks without doing any work.
/// </summary>
/// <param name="content">Tokenized page whose blocks would be analyzed.</param>
/// <param name="settings">Tokenizator settings (currently unused).</param>
/// <param name="language">Language definition (currently unused).</param>
internal static void blockAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    if (content.items == null)
    {
        return;
    }
    // NOTE(review): loop body is empty — block categorization is not yet implemented
    foreach (IContentBlock block in content.items)
    {
    }
}
/// <summary>
/// Tokenizes an HTML web document: builds the content tree, breaks it into blocks and
/// tokenizes each block into the resulting htmlContentPage.
/// Expected entries in <paramref name="resources"/>: paragraphDetectionFlags, sentenceDetectionFlags,
/// contentPreprocessFlags, tokenDetectionFlags, String content, node page, basicLanguage language.
/// </summary>
/// <param name="pRecordLog">Log builder for progress reporting.</param>
/// <param name="treeGlobalRegistry">Global registry of already-built content trees, keyed by url.</param>
/// <param name="doc">Web document providing the HTML DOM and navigator.</param>
/// <param name="resources">Loosely-typed resource bag (see summary).</param>
/// <returns>Tokenized and semantically flagged HTML content page.</returns>
public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, contentTreeGlobalCollection treeGlobalRegistry, webDocument doc, params object[] resources)
{
    var starttime = DateTime.Now;

    // pull the typed resources out of the loosely-typed bag
    string content = resources.getFirstOfType<string>();
    basicLanguage language = resources.getFirstOfType<basicLanguage>();
    node page = resources.getFirstOfType<node>();

    // <------------------------ prepare
    htmlContentPage contentPage = new htmlContentPage();
    contentPage.acceptSourcePage(page);
    string domain = page.domain; // page.url.getDomainNameFromUrl(true); // <---------- previous implementation

    XPathNavigator navigator = doc.getDocumentNavigator();
    // NOTE(review): hapDocument is extracted but never used below
    HtmlDocument hapDocument = doc.document as HtmlDocument;

    // <--------------- tree building
    // NOTE(review): the cached tree (ctb_old) and buildTree flag are computed but the
    // cache result is then unconditionally discarded by the getInstance() call below —
    // confirm whether the global-registry cache was meant to be reused
    contentTreeBuilder ctb_old = treeGlobalRegistry.GetTreeBuilder(page.url);
    contentTreeBuilder ctb = null;
    bool buildTree = false;
    if (ctb_old != null)
    {
    }
    else
    {
        buildTree = true;
    }
    ctb = ctb_old;
    ctb = contentTreeBuilder.getInstance(navigator, domain, page);

    contentPage.treeBuilder = ctb;
    // <-------------------- tree building end

    // break the content tree into linear blocks for tokenization
    imbTreeNodeBlockCollection blocks = ctb.tree.breakToBlocks();

    // tokenize each block; `b` counts processed blocks
    int b = 0;
    for (int bi = 0; bi < blocks.Count; bi++)
    {
        imbTreeNodeBlock bl = blocks[bi];
        b++;
        makeBlock(bl, contentPage, language, resources);
        // pRecordLog.close();
    }

    contentPage.recountItems();

    // semantic flagging passes over the assembled page
    contentPage.primaryFlaging(resources);
    contentPage.secondaryFlaging(resources); // <---------------
    pRecordLog.log("SKIP: complete exploration of all tokens is turned off.");
    // contentPage.saveCache();
    pRecordLog.log("Basic semantic analysis done. Closing the process.");
    var time = DateTime.Now.Subtract(starttime);
    // imbSemanticEngineManager.log.log("Tree-building and tokenization [" + page.url + "] done in: " + time.TotalMilliseconds.getSeconds(4)+"s");
    return (contentPage);
}
/// <summary>
/// Tokenizes an already-parsed HTML document into an htmlContentPage, using this
/// instance's detection flag settings.
/// </summary>
/// <param name="pRecordLog">Log builder for progress reporting.</param>
/// <param name="htmlDoc">Parsed HTML document to tokenize.</param>
/// <param name="language">Language used by the tokenization passes.</param>
/// <param name="page">Source page node (supplies domain and url).</param>
/// <returns>Tokenized and semantically flagged HTML content page.</returns>
public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, HtmlDocument htmlDoc, basicLanguage language, node page)
{
    DateTime startedAt = DateTime.Now;

    htmlContentPage contentPage = new htmlContentPage();
    contentPage.acceptSourcePage(page);

    string domain = page.domain;
    // resource bag consumed by the block tokenization pipeline
    object[] resources = new object[] { language, page, flags, sentenceFlags, tokenFlags, preprocessFlags };

    // build the content tree and attach it to the page
    contentTreeBuilder treeBuilder = contentTreeBuilder.getInstance(htmlDoc.CreateNavigator(), domain, page);
    contentPage.treeBuilder = treeBuilder;

    // break the tree into linear blocks and tokenize each one
    var blocks = treeBuilder.tree.breakToBlocks();
    int processed = 0;
    for (int index = 0; index < blocks.Count; index++)
    {
        imbTreeNodeBlock currentBlock = blocks[index];
        processed++;
        makeBlock(currentBlock, contentPage, language, resources);
    }

    contentPage.recountItems();
    contentPage.primaryFlaging(resources);
    contentPage.secondaryFlaging(resources);

    pRecordLog.log("Basic semantic analysis done. Closing the process.");

    TimeSpan elapsed = DateTime.Now.Subtract(startedAt);
    pRecordLog.log("Tree-building and tokenization [" + page.url + "] done in: " + elapsed.TotalMilliseconds.getSeconds(4) + "s");

    return contentPage;
}
/// <summary>
/// Initializes a new instance of the <see cref="ruleUrlHasKnownWords"/> class.
/// </summary>
/// <param name="__parent">The parent evaluator.</param>
/// <param name="__language">Language whose Hunspell dictionary is used to recognize url words.</param>
public ruleUrlHasKnownWords(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
    : base("Url language words", "If url has words (3+ chars) recognized by Hunspell dictionary", 3, -1, __parent)
{
    language = __language;
}
/// <summary>
/// Paragraph-level categorization pass: was intended to derive each paragraph's
/// genericType from its sentences (run after sentence categorization).
/// The classification logic is currently disabled, so only the input guard runs.
/// </summary>
/// <param name="content">Tokenized page whose paragraphs would be analyzed.</param>
/// <param name="settings">Tokenizator settings (unused while disabled).</param>
/// <param name="language">Language definition (unused while disabled).</param>
internal static void paragraphAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
{
    if (content.paragraphs == null)
    {
        return;
    }
    // intentionally no further work: paragraph genericType classification is disabled in this build
}
/// <summary>
/// Returns the default basic language (Serbian), lazily initializing it from the
/// Hunspell affix/dictionary files on first call.
/// </summary>
/// <param name="iso2code">ISO-2 language code.
/// NOTE(review): currently ignored — Serbian is always returned; the database-backed
/// lookup was disabled. Confirm this is intended.</param>
/// <returns>The Serbian basic language definition.</returns>
/// <exception cref="aceGeneralException">Thrown when a required Hunspell file is missing.</exception>
public static basicLanguage getLanguage(string iso2code)
{
    if (serbian == null)
    {
        serbian = new extendedLanguage();
        // both Hunspell files are mandatory — fail fast when missing
        if (!File.Exists(PATH_hunspell_aff))
        {
            throw new aceGeneralException("Hunspell AFF file missing!");
        }
        if (!File.Exists(PATH_hunspell_dict))
        {
            throw new aceGeneralException("Hunspell DICT file missing!");
        }
        basicLanguage language = new basicLanguage();
        language.affixFilePath = PATH_hunspell_aff;
        language.dictFilePath = PATH_hunspell_dict;
        language.languageNativeName = "Srpski";
        language.languageEnglishName = "Serbian";
        language.iso2Code = "sr";
        serbian.basic = language;
        // called for its side effect (loads the Hunspell engine); the original ignored
        // the return value via an empty if/else — the result is deliberately unused
        language.checkHuspell(true);
    }
    return serbian.basic;
}
/// <summary>
/// Initializes a new instance of the <see cref="pageruleTitleKnownUniqueWords"/> class.
/// </summary>
/// <param name="__parent">The parent evaluator.</param>
/// <param name="__language">Language whose Hunspell dictionary is used to recognize title words.</param>
public pageruleTitleKnownUniqueWords(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
    : base("Title known unique word", "If the page title contains an unique word (3+ chars long) recognized by Hunspell dictionary", 10, -5, __parent)
{
    language = __language;
}
/// <summary>
/// Initializes a new instance of the <see cref="ruleKnownWordInCaption"/> class.
/// </summary>
/// <param name="__parent">The parent evaluator.</param>
/// <param name="__language">Language whose Hunspell dictionary is used to recognize caption words.</param>
public ruleKnownWordInCaption(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
    : base("Caption known words", "If all words in link caption were recognized by Hunspell dictionary", 4, -1, __parent)
{
    language = __language;
}