/// <summary>
/// Builds the content tree for the given HTML document, breaks it into blocks and tokenizes each block into the resulting <see cref="htmlContentPage"/>.
/// </summary>
/// <param name="pRecordLog">Log builder receiving progress messages</param>
/// <param name="htmlDoc">HTML Agility Pack document to tokenize</param>
/// <param name="language">Language used for tokenization</param>
/// <param name="page">Crawled page node the content belongs to</param>
/// <returns>Tokenized and flagged content page</returns>
public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, HtmlDocument htmlDoc, basicLanguage language, node page)
{
    var starttime = DateTime.Now;

    htmlContentPage contentPage = new htmlContentPage();
    contentPage.acceptSourcePage(page);

    string domain = page.domain;

    // detection flags configured on this engine instance travel together with the language and page
    object[] resources = new object[] { language, page, flags, sentenceFlags, tokenFlags, preprocessFlags };

    // build the content tree directly from the document navigator
    var ctb = contentTreeBuilder.getInstance(htmlDoc.CreateNavigator(), domain, page);

    contentPage.treeBuilder = ctb;

    var blocks = ctb.tree.breakToBlocks();

    int b = 0;
    for (int bi = 0; bi < blocks.Count; bi++)
    {
        imbTreeNodeBlock bl = blocks[bi];
        b++;
        makeBlock(bl, contentPage, language, resources);
        // pRecordLog.close();
    }

    contentPage.recountItems();

    contentPage.primaryFlaging(resources);
    contentPage.secondaryFlaging(resources); // <---------------

    // pRecordLog.log("SKIP: complete exploration of all tokens is turned off.");
    // contentPage.saveCache();

    pRecordLog.log("Basic semantic analysis done. Closing the process.");

    var time = DateTime.Now.Subtract(starttime);

    pRecordLog.log("Tree-building and tokenization [" + page.url + "] done in: " + time.TotalMilliseconds.getSeconds(4) + "s");

    return contentPage;
}
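
// Minimal usage sketch for the overload above (illustration only, not part of the engine).
// It assumes the caller already holds an HtmlAgilityPack HtmlDocument, a crawled page node and a
// basicLanguage instance; "engine", "log", "htmlSource", "language" and "page" are placeholder
// names introduced by this sketch, not identifiers defined elsewhere in the project.
//
// HtmlDocument htmlDoc = new HtmlDocument();
// htmlDoc.LoadHtml(htmlSource);                                   // htmlSource: raw HTML string
// htmlContentPage tokenized = engine.tokenizeContent(log, htmlDoc, language, page);
// // tokenized.treeBuilder now holds the content tree; each block was tokenized via makeBlock()
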
/// <summary>
/// Tokenizes the content of the given web document into an <see cref="htmlContentPage"/>, using the content tree builder and block-level tokenization.
/// </summary>
/// <param name="pRecordLog">Log builder receiving progress messages</param>
/// <param name="treeGlobalRegistry">Global registry of content tree builders, queried by page URL</param>
/// <param name="doc">Web document whose content is tokenized</param>
/// <param name="resources">Resource objects picked up by type: paragraphDetectionFlags flags, sentenceDetectionFlags sentenceFlags, contentPreprocessFlags preprocessFlags, tokenDetectionFlags tokenFlags, String content, node page, basicLanguage language</param>
/// <returns>Tokenized and flagged content page</returns>
public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, contentTreeGlobalCollection treeGlobalRegistry, webDocument doc, params object[] resources)
{
    var starttime = DateTime.Now;

    //paragraphDetectionFlags flags = new paragraphDetectionFlags(resources);
    //sentenceDetectionFlags sentenceFlags = new sentenceDetectionFlags(resources);
    //contentPreprocessFlags preprocessFlags = new contentPreprocessFlags(resources);
    //tokenDetectionFlags tokenFlags = new tokenDetectionFlags(resources);

    string content = resources.getFirstOfType<string>();
    basicLanguage language = resources.getFirstOfType<basicLanguage>();
    node page = resources.getFirstOfType<node>();

    // <------------------------ prepare
    htmlContentPage contentPage = new htmlContentPage();

    //if (!imbSemanticEngineManager.settings.doEnablePageContentTokenization)
    //{
    //    return contentPage;
    //}

    contentPage.acceptSourcePage(page);

    string domain = page.domain; // page.url.getDomainNameFromUrl(true); // <---------- previous implementation

    XPathNavigator navigator = doc.getDocumentNavigator(); // resources.getOfType<XPathNavigator>();

    HtmlDocument hapDocument = doc.document as HtmlDocument;

    //List<IEnumerable<HtmlNode>> nodes = hapDocument.DocumentNode.Descendants("input").Select(y => y.Descendants().Where(x => x.InnerText != "")).ToList();

    // <--------------- tree building

    // contentTreeGlobalCollection treeGlobalRegistry = resources.getFirstOfType<contentTreeGlobalCollection>(false, false);

    contentTreeBuilder ctb_old = treeGlobalRegistry.GetTreeBuilder(page.url);
    contentTreeBuilder ctb = null;

    bool buildTree = false;
    if (ctb_old != null)
    {
    }
    else
    {
        buildTree = true;
    }

    // NOTE: the cached tree builder (ctb_old) is currently ignored - the tree is rebuilt for every call
    ctb = ctb_old;
    ctb = contentTreeBuilder.getInstance(navigator, domain, page);

    //ctb.saveCache();

    //if (buildTree)
    //{
    //    // pRecordLog.log("Tree structure not found at global registry (activityJobRecord) - building new. ");
    //}

    contentPage.treeBuilder = ctb;

    // pRecordLog.log("Tree structure done. ");

    // <-------------------- tree building end

    imbTreeNodeBlockCollection blocks = ctb.tree.breakToBlocks();

    //pRecordLog.log("Blocks extracted from tree structure: " + blocks.Count());

    //flags = paragraphDetectionFlags.getDefaultFlags();
    //sentenceFlags.Add(sentenceDetectionFlag.setSentenceToParagraph,
    //                  sentenceDetectionFlag.preprocessParagraphContent);
    //tokenFlags = tokenDetectionFlags.getDefaultFlags();
    //preprocessFlags = contentPreprocessFlags.getDefaultFlags();

    //pRecordLog.log(nameof(flags) + " => " + flags.toCsvInLine(";"));
    //pRecordLog.log(nameof(sentenceFlags) + " => " + sentenceFlags.toCsvInLine(";"));
    //pRecordLog.log(nameof(tokenFlags) + " => " + tokenFlags.toCsvInLine(";"));
    //pRecordLog.log(nameof(preprocessFlags) + " => " + preprocessFlags.toCsvInLine(";"));

    // pRecordLog.open(bootstrap_containers.well.ToString(), "Block structure analysis", "NLP tokenization using hybrid [" + this.GetType().Name + "] tokenization engine");

    int b = 0;
    for (int bi = 0; bi < blocks.Count; bi++)
    {
        imbTreeNodeBlock bl = blocks[bi];
        b++;
        makeBlock(bl, contentPage, language, resources);
        // pRecordLog.close();
    }

    //pRecordLog.close();

    // pRecordLog.log("Tokenized content structure done. ");

    contentPage.recountItems();

    //pRecordLog.log("Total token counts:");
    //var data = contentPage.AppendDataFields(null);
    //var dt = data.buildDataTable("Token statistics");
    //pRecordLog.AppendTable(dt);

    contentPage.primaryFlaging(resources);
    contentPage.secondaryFlaging(resources); // <---------------

    pRecordLog.log("SKIP: complete exploration of all tokens is turned off.");

    // contentPage.saveCache();

    pRecordLog.log("Basic semantic analysis done. Closing the process.");

    var time = DateTime.Now.Subtract(starttime);

    // imbSemanticEngineManager.log.log("Tree-building and tokenization [" + page.url + "] done in: " + time.TotalMilliseconds.getSeconds(4) + "s");

    return contentPage;
}