Example #1
        /// <summary>
        /// Builds the content tree for <c>page</c> from the parsed HTML document and tokenizes it
        /// into an <see cref="htmlContentPage"/> (block extraction, tokenization, flagging passes).
        /// </summary>
        /// <param name="pRecordLog">Log builder receiving progress messages.</param>
        /// <param name="htmlDoc">Parsed HTML document (HtmlAgilityPack) to navigate.</param>
        /// <param name="language">Language used for tokenization.</param>
        /// <param name="page">Source page node; supplies domain and url.</param>
        /// <returns>The tokenized content page.</returns>
        public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, HtmlDocument htmlDoc, basicLanguage language, node page)
        {
            var starttime = DateTime.Now;

            htmlContentPage contentPage = new htmlContentPage();
            contentPage.acceptSourcePage(page);

            string domain = page.domain;

            // Loose resource bag consumed by makeBlock and the flagging passes;
            // flags/sentenceFlags/tokenFlags/preprocessFlags are fields of the enclosing class.
            object[] resources = new object[] { language, page, flags, sentenceFlags, tokenFlags, preprocessFlags };

            var ctb = contentTreeBuilder.getInstance(htmlDoc.CreateNavigator(), domain, page);
            contentPage.treeBuilder = ctb;

            // Tokenize each block extracted from the content tree.
            foreach (imbTreeNodeBlock bl in ctb.tree.breakToBlocks())
            {
                makeBlock(bl, contentPage, language, resources);
            }

            contentPage.recountItems();

            contentPage.primaryFlaging(resources);
            contentPage.secondaryFlaging(resources);

            pRecordLog.log("Basic semantic analysis done. Closing the process.");

            var time = DateTime.Now.Subtract(starttime);
            pRecordLog.log("Tree-building and tokenization [" + page.url + "] done in: " + time.TotalMilliseconds.getSeconds(4) + "s");

            return contentPage;
        }
Example #2
        /// <summary>
        /// Processes the specified page.
        /// </summary>
        /// <param name="page">The page.</param>
        /// <returns></returns>
        /// <summary>
        /// Processes the specified page: harvests its link nodes, registers them via <c>Add</c>,
        /// and returns the links accepted for further crawling.
        /// </summary>
        /// <param name="page">The crawled page; <c>null</c> marks link collecting as done.</param>
        /// <param name="isLinkStackEmpty">When true and no primary links were found,
        /// secondary links are promoted into the output as a fallback.</param>
        /// <returns>Accepted links, or <c>null</c> once collecting is finished
        /// (no page, target count reached, or iteration budget exhausted).</returns>
        public linkList process(crawledPage page, Boolean isLinkStackEmpty)
        {
            linkList output = new linkList();

            // Stop conditions: no page, target count already reached, or iteration budget spent.
            if (page == null || CountToTarget < 1 || iLimit < 0)
            {
                isLinkCollectingDone = true;
                return null;
            }

            htmlContentPage hContent = page.tokenizedContent as htmlContentPage;

            if (hContent != null)
            {
                htmlLinkNodeCollection linkNodes = new htmlLinkNodeCollection(hContent.tokens);

                foreach (htmlLinkNode ln in linkNodes.getSorted())
                {
                    link crawledLink = null;
                    if (page.links.byUrl.ContainsKey(ln.url))
                    {
                        crawledLink = page.links.byUrl[ln.url];
                    }

                    var cwl = Add(crawledLink);
                    if (cwl != null)
                    {
                        if (ln.isPrimary)
                        {
                            primary.Add(crawledLink);
                            output.Add(cwl);
                        }
                        else
                        {
                            // Secondary links are kept aside; they only enter the output
                            // via the fallback promotion below.
                            secondary.Add(crawledLink);
                        }
                    }
                }
            }

            // No primary links and the crawl frontier is empty: fall back to secondary links.
            if (!output.Any() && isLinkStackEmpty)
            {
                collectionExtensions.AddMulti(output, secondary);
            }

            iLimit--;

            return output;
        }
Example #3
        /// <summary>
        /// paragraphDetectionFlags flags, sentenceDetectionFlags sentenceFlags, contentPreprocessFlags preprocessFlags,  tokenDetectionFlags tokenFlags,  String content,  node page, basicLanguage language
        /// </summary>
        /// <param name="resources"></param>
        /// <returns></returns>
        /// <summary>
        /// Tokenizes a web document into an <see cref="htmlContentPage"/>: builds the content tree,
        /// breaks it into blocks, tokenizes each block and runs the primary/secondary flagging passes.
        /// </summary>
        /// <param name="pRecordLog">Log builder receiving progress messages.</param>
        /// <param name="treeGlobalRegistry">Global registry of previously built content trees.</param>
        /// <param name="doc">The web document to tokenize.</param>
        /// <param name="resources">Loose resource bag; must contain a <c>basicLanguage</c> and a
        /// <c>node</c> (the page). Flag objects in the bag are consumed downstream by
        /// <c>makeBlock</c> and the flagging passes.</param>
        /// <returns>The tokenized content page.</returns>
        public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, contentTreeGlobalCollection treeGlobalRegistry, webDocument doc, params object[] resources)
        {
            basicLanguage language = resources.getFirstOfType<basicLanguage>();
            node          page     = resources.getFirstOfType<node>();

            htmlContentPage contentPage = new htmlContentPage();
            contentPage.acceptSourcePage(page);

            string domain = page.domain;

            XPathNavigator navigator = doc.getDocumentNavigator();

            // NOTE(review): a cached tree is looked up here but never reused — the builder below
            // always constructs a fresh instance (the original assigned ctb = ctb_old and then
            // immediately overwrote it). Confirm whether cache reuse was intentionally disabled.
            contentTreeBuilder ctb_old = treeGlobalRegistry.GetTreeBuilder(page.url);

            contentTreeBuilder ctb = contentTreeBuilder.getInstance(navigator, domain, page);
            contentPage.treeBuilder = ctb;

            imbTreeNodeBlockCollection blocks = ctb.tree.breakToBlocks();

            // Tokenize each block extracted from the content tree.
            for (int bi = 0; bi < blocks.Count; bi++)
            {
                makeBlock(blocks[bi], contentPage, language, resources);
            }

            contentPage.recountItems();

            contentPage.primaryFlaging(resources);
            contentPage.secondaryFlaging(resources);

            pRecordLog.log("SKIP: complete exploration of all tokens is turned off.");

            pRecordLog.log("Basic semantic analysis done. Closing the process.");

            return contentPage;
        }