예제 #1
0
        /// <summary>
        /// Collects the pages of this collection whose assertion state, as reported by
        /// <see cref="GetPageAssertion(string)"/>, contains <paramref name="acceptableState"/>, and gathers the
        /// domain entries of those pages.
        /// </summary>
        /// <param name="acceptableState">Flag(s) the page assertion state must contain for the page to be accepted.</param>
        /// <param name="domains">
        /// Receives the domain entries of the accepted pages; an entry is added only when no entry already in the
        /// list has the same <c>domain</c> value as the page.
        /// </param>
        /// <returns>The accepted pages, in enumeration order.</returns>
        public List <indexPage> GetPagesAndDomains(indexPageEvaluationEntryState acceptableState, out List <indexDomain> domains)
        {
            domains = new List <indexDomain>();

            var acceptedPages = new List <indexPage>();

            foreach (indexPage candidate in this)
            {
                if (!GetPageAssertion(candidate.url).HasFlag(acceptableState))
                {
                    continue;
                }

                var domainEntry = imbWEMManager.index.domainIndexTable[candidate.domain];

                bool domainAlreadyListed = Enumerable.Any(domains, known => known.domain == candidate.domain);
                if (!domainAlreadyListed)
                {
                    domains.Add(domainEntry);
                }

                // Every accepted page is registered with its domain entry, whether or not the entry was just listed.
                domainEntry.addToPageSet(candidate);

                acceptedPages.Add(candidate);
            }

            return acceptedPages;
        }
예제 #2
0
        /// <summary>
        /// Builds the flags describing what the index knows about the page with the given URL.
        /// </summary>
        /// <param name="url">URL of the page; its MD5 hash is used as the lookup key.</param>
        /// <returns>
        /// <c>notInTheIndex</c> when the index has no entry for the URL; otherwise <c>inTheIndex</c> combined with
        /// either <c>haveEvaluationEntry</c> and <c>isRelevant</c> / <c>notRelevant</c>, or with
        /// <c>haveNoEvaluationEntry</c> when the entry's relevancy text is neither of the two known values.
        /// </returns>
        public indexPageEvaluationEntryState GetPageAssertion(string url)
        {
            string hashKey = md5.GetMd5Hash(url);

            if (!ContainsKey(hashKey))
            {
                // URLs without an entry are recorded in urlsNotInIndex.
                urlsNotInIndex.AddUnique(url);
                return indexPageEvaluationEntryState.notInTheIndex;
            }

            var entry = GetOrCreate(hashKey);

            // Read the relevancy text once and classify it.
            string relevancy = entry.relevancyText;
            bool markedRelevant    = relevancy == "isRelevant";
            bool markedNotRelevant = relevancy == "notRelevant";

            if (!markedRelevant && !markedNotRelevant)
            {
                makeStat(false);
                return indexPageEvaluationEntryState.inTheIndex | indexPageEvaluationEntryState.haveNoEvaluationEntry;
            }

            makeStat(true);

            indexPageEvaluationEntryState verdict = markedRelevant
                ? indexPageEvaluationEntryState.isRelevant
                : indexPageEvaluationEntryState.notRelevant;

            return indexPageEvaluationEntryState.inTheIndex | indexPageEvaluationEntryState.haveEvaluationEntry | verdict;
        }
예제 #3
0
        /// <summary>
        /// Evaluates the page assertion of every spider link and collects the outcomes in one result object.
        /// </summary>
        /// <param name="urls">The spider links whose <c>url</c> values are checked against the index.</param>
        /// <returns>A new result to which each link URL was added together with its assertion state.</returns>
        public indexURLAssertionResult GetUrlAssertion(IEnumerable <spiderLink> urls)
        {
            var result = new indexURLAssertionResult();

            foreach (spiderLink link in urls)
            {
                result.Add(GetPageAssertion(link.url), link.url);
            }

            return result;
        }
예제 #4
0
        /// <summary>
        /// Evaluates the page assertion of every URL and adds the outcomes to a result object.
        /// </summary>
        /// <param name="urls">The URLs to check against the index.</param>
        /// <param name="output">
        /// Optional existing result to add to; when <c>null</c>, a new result is created.
        /// </param>
        /// <returns>
        /// The result (either <paramref name="output"/> or the newly created one) to which each URL was added
        /// together with its assertion state.
        /// </returns>
        public indexURLAssertionResult GetUrlAssertion(IEnumerable <string> urls, indexURLAssertionResult output = null)
        {
            indexURLAssertionResult result = output ?? new indexURLAssertionResult();

            foreach (string candidateUrl in urls)
            {
                result.Add(GetPageAssertion(candidateUrl), candidateUrl);
            }

            return result;
        }
예제 #5
0
        /// <summary>
        /// Attaches the given page to this target. When the HTML document of the page is available, the page text
        /// is extracted and hashed, the optional block tree and TF-IDF document are built, the page is evaluated
        /// (or its stored index assertion is trusted), its tokens are published to the crawl context and the
        /// <c>OnTargetPageAttached</c> handler is invoked.
        /// </summary>
        /// <param name="__page">The page to attach.</param>
        /// <param name="response">Log builder handed to the token import of the TF-IDF document.</param>
        /// <param name="targetBlockCount">Block count requested from the content tree when the block tree is enabled.</param>
        /// <returns>
        /// <c>true</c> if the page was attached by this call (even when no HTML document was available);
        /// <c>false</c> if the same page was already attached.
        /// </returns>
        public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
        {
            bool isNewPage = page != __page;
            if (!isNewPage)
            {
                return false;
            }

            page = __page;

            HtmlDocument htmlDoc = GetHtmlDocument();

            iterationLoaded = parent.wRecord.iteration;

            if (htmlDoc == null)
            {
                // Nothing to analyze, but the page still counts as attached.
                return true;
            }

            XPathNavigator xnav = htmlDoc.DocumentNode.CreateNavigator();

            pageText = xnav.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);
            pageText = WebUtility.HtmlDecode(pageText);
            pageHash = md5.GetMd5Hash(pageText);

            if (parent.wRecord.tRecord.instance.settings.doEnableDLC_BlockTree)
            {
                contentTree   = htmlDoc.buildTree(page.webpage.domain);
                contentBlocks = contentTree.getBlocks(targetBlockCount);
                contentBlocks.CalculateScores();
            }

            var ignoreTokens       = parent.wRecord.domainInfo.domainWords;
            var preprocessedTokens = parent.wRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);

            if (parent.wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
            {
                content           = parent.dlTargetPageTokens.AddTable(key) as termDocument;
                content.expansion = parent.wRecord.tRecord.instance.settings.TermExpansionForContent;
                content.AddTokens(preprocessedTokens.ToList(), response);
            }

            // The stored index assertion is consulted only in full-trust mode; otherwise the page is treated
            // as unknown to the index and is always evaluated below.
            indexPageEvaluationEntryState pageState;
            if (imbWEMManager.settings.indexEngine.doIndexFullTrustMode)
            {
                pageState = imbWEMManager.index.pageIndexTable.GetPageAssertion(url);
            }
            else
            {
                pageState = indexPageEvaluationEntryState.notInTheIndex;
            }

            if (pageState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
            {
                evaluation = new multiLanguageEvaluation();

                // NOTE(review): result_language receives the value evaluatedLanguage holds at this point, because
                // evaluatedLanguage is only set to serbian on the following line, and the stored verdict
                // (isRelevant / notRelevant) is not consulted here. Confirm that this order is intended.
                evaluation.result_language = evaluatedLanguage;
                evaluatedLanguage          = basicLanguageEnum.serbian;
            }
            else
            {
                evaluation        = parent.wRecord.tRecord.evaluator.evaluate(pageText, ignoreTokens, preprocessedTokens.ToList());
                evaluatedLanguage = evaluation.result_language;
            }

            lock (RelevantPageLock)
            {
                if (IsRelevant)
                {
                    parent.wRecord.context.targets.termSerbian.AddRange(preprocessedTokens);

                    parent.wRecord.relevantPages.AddUnique(__page.url);

                    parent.wRecord.tRecord.relevantPages.AddUnique(__page.url);
                }
                else
                {
                    parent.wRecord.context.targets.termOther.AddRange(preprocessedTokens);
                }

                parent.wRecord.context.targets.termsAll.AddRange(preprocessedTokens);
            }

            // Read the delegate once so that the null check and the invocation see the same instance.
            var attachedHandler = parent.wRecord.context.OnTargetPageAttached;
            if (attachedHandler != null)
            {
                var targs = new modelSpiderSiteRecordEventArgs(this);

                attachedHandler(parent.wRecord, targs);
            }

            return true;
        }
예제 #6
0
        /// <summary>
        /// Central sample-set operation: fills the sample list of the current console state from the first source
        /// that yields entries and, when a file path was supplied, exports the resulting list.
        /// </summary>
        /// <remarks>
        /// Sources are tried in this order, each only while the sample list is still empty: the page index, the
        /// domain index, an external file located under <c>folder</c>, and the internal sample file. The steps
        /// taken are accumulated as flags in a <c>sampleAcceptAndPrepareStates</c> value that is written to the
        /// console output at the end and included in the exception message on failure.
        /// </remarks>
        /// <param name="filepath">
        /// Name or path of the sample file. It is searched for under <c>folder</c> as an external list, used as the
        /// name of the internal sample to load, and used as the export target.
        /// </param>
        /// <param name="fileHasPriority">If <c>true</c> and an internal sample with that name exists, it is loaded.</param>
        /// <param name="group_tags">Group tags stored in the console state and written to the log.</param>
        /// <param name="limit">Limit passed on when entries are added from the external file.</param>
        /// <param name="skip">Skip count passed on when entries are added from the external file.</param>
        /// <param name="fromDomainIndex">When not <c>none</c>, selects the domain URLs to take from the domain index.</param>
        /// <param name="fromPageIndex">When not <c>none</c>, selects the page state used to take domains from the page index.</param>
        /// <param name="samplefilename">Used only in the log message about the external file.</param>
        /// <exception cref="aceGeneralException">Thrown when the sample list is still empty after all sources were tried.</exception>
        public void sampleAcceptAndPrepare(string filepath       = "", bool fileHasPriority                  = false, string group_tags = "", int limit = 0,
                                           int skip              = 0, indexDomainContentEnum fromDomainIndex = indexDomainContentEnum.none, indexPageEvaluationEntryState fromPageIndex = indexPageEvaluationEntryState.none,
                                           string samplefilename = "")
        {
            sampleAcceptAndPrepareStates processState = sampleAcceptAndPrepareStates.started;

            // NOTE(review): the "as" cast yields null when console.state is of another type, in which case the
            // state.aRecord access below throws - confirm that console.state is always an analyticConsoleState here.
            analyticConsoleState state = console.state as analyticConsoleState;

            string sourcePath = filepath;

            if (!filepath.isNullOrEmpty())
            {
                processState |= sampleAcceptAndPrepareStates.filepathArgumentSupplied;
                sourcePath    = folder.findFile(filepath, SearchOption.AllDirectories, false);

                // An empty search result is treated as "not found" (the external-file step below does the same).
                if (sourcePath.isNullOrEmpty())
                {
                    processState |= sampleAcceptAndPrepareStates.sourcePathNOTFOUND;
                }
                else
                {
                    processState |= sampleAcceptAndPrepareStates.sourcePathDiscovered;
                }
            }

            if (state.aRecord == null)
            {
                console.log("You should define job before calling this command!", true);
                return;
            }

            int startSampleCount = 0;

            if (state.sampleList != null)
            {
                startSampleCount = state.sampleList.Count();
                processState    |= sampleAcceptAndPrepareStates.existingSampleDetected;
            }
            else
            {
                state.sampleList = new webSiteSimpleSample();
            }

            state.sampleTags = group_tags;
            state.sampleFile = filepath;
            console.response.log("Sample with group_tags=" + group_tags + ", samplename=" + filepath + ", fileHasPriority=" + fileHasPriority + ".");

            // -------------------------- LOADING FROM THE PAGE INDEX
            if (!state.sampleList.Any() && fromPageIndex != indexPageEvaluationEntryState.none)
            {
                processState |= sampleAcceptAndPrepareStates.fromPageIndexImportCalled;

                // Only the domains are used here; the returned page list is not needed.
                List <indexDomain> dSample;
                imbWEMManager.index.pageIndexTable.GetPagesAndDomains(fromPageIndex, out dSample);

                if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
                {
                    dSample.Randomize();
                }
                state.sampleList.Add(dSample);
            }

            // -------------------------- LOADING FROM THE DOMAIN INDEX
            if (!state.sampleList.Any() && fromDomainIndex != indexDomainContentEnum.none)
            {
                processState |= sampleAcceptAndPrepareStates.fromDomainIndexImportCalled;
                var list = imbWEMManager.index.domainIndexTable.GetDomainUrls(fromDomainIndex);
                if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
                {
                    list.Randomize();
                }
                foreach (string str in list)
                {
                    state.sampleList.Add(str);
                }
            }

            // -------------------------- LOADING THE EXTERNAL SAMPLE FILE
            if (!state.sampleList.Any())
            {
                if (!sourcePath.isNullOrEmpty())
                {
                    processState |= sampleAcceptAndPrepareStates.filesourceFoundForImportFromFile;
                    var domainList = sourcePath.openFileToList(true);
                    if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
                    {
                        domainList.Randomize();
                    }
                    state.sampleList.Add(domainList, skip, limit);
                    console.response.log("Sample external file list [" + samplefilename + "] found at [" + sourcePath + "] containing [" + domainList.Count() + "] domains.");
                }
                else
                {
                    processState |= sampleAcceptAndPrepareStates.filesourceNOTFOUND_ForImportFromFile;
                }
            }

            // -------------------------- LOADING THE INTERNAL SAMPLE FILE
            if (!state.sampleList.Any() && !filepath.isNullOrEmptyString())
            {
                if (fileHasPriority && sampleExist(filepath))
                {
                    processState    |= sampleAcceptAndPrepareStates.internalSampleFilesourceLoaded;
                    state.sampleList = loadSample(filepath, imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake);
                }
                else if (state.sciProject == null)
                {
                    processState |= sampleAcceptAndPrepareStates.noSciProjectFound;
                }
                else if (!group_tags.isNullOrEmpty())
                {
                    // -------------------------- LOADING FROM THE DATABASE
                    // NOTE(review): the database import itself is disabled (see the commented-out call), so only
                    // the flag is recorded and the sample list is not changed by this branch.
                    processState |= sampleAcceptAndPrepareStates.groupTagsSpecified_databaseImportCalled;
                    //  state.sampleList = state.sciProject.getSamples(group_tags.getTokens(), limit, "stamp", 0, imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake);
                }
            }

            if (!state.sampleList.Any())
            {
                processState |= sampleAcceptAndPrepareStates.sampleListStillEmpty;
                throw new aceGeneralException("Sample creation failed: " + processState.ToString(), null, this, "Sample import failed :: ");
            }

            int AddedSampleCount = state.sampleList.Count - startSampleCount;

            console.log("Added to the sample list [" + AddedSampleCount + "] at current job record [" + state.aRecord.job.name + "]", true);

            // -------------------------- EXPORTING INTO LOCAL XML FILE
            if (!filepath.isNullOrEmpty())
            {
                processState |= sampleAcceptAndPrepareStates.sampleListExported;
                var fi = saveSample(filepath, state.sampleList);
                console.output.log("Sample list exported to: " + fi.Name);
                state.sampleFile = fi.Name;
            }

            console.output.AppendLine("--- loged sample import procedure states: [" + processState.ToString() + "]");
        }