/// <summary>
/// Collects the pages of this table whose assertion state, as reported by <see cref="GetPageAssertion(string)"/>,
/// contains every flag of <paramref name="acceptableState"/>, and gathers the domain entries of those pages.
/// </summary>
/// <param name="acceptableState">Flags a page state must contain (tested with <c>HasFlag</c>) for the page to be accepted.</param>
/// <param name="domains">Receives the domain entries of the accepted pages: one entry per distinct page domain, in order of first appearance.</param>
/// <returns>The accepted pages, in the enumeration order of this table.</returns>
public List<indexPage> GetPagesAndDomains(indexPageEvaluationEntryState acceptableState, out List<indexDomain> domains)
{
    List<indexPage> output = new List<indexPage>();
    domains = new List<indexDomain>();

    // Domain names already represented in `domains`. A hash set keeps the duplicate check constant-time,
    // instead of re-scanning the whole `domains` list for every accepted page.
    HashSet<string> seenDomains = new HashSet<string>();

    foreach (indexPage page in this)
    {
        indexPageEvaluationEntryState state = GetPageAssertion(page.url);
        if (!state.HasFlag(acceptableState))
        {
            continue;
        }

        var dom = imbWEMManager.index.domainIndexTable[page.domain];

        // HashSet.Add returns true only the first time a given domain name is seen
        if (seenDomains.Add(page.domain))
        {
            domains.Add(dom);
        }

        // Every accepted page is registered with its domain entry, whether or not the domain was new
        dom.addToPageSet(page);
        output.Add(page);
    }

    return output;
}
/// <summary>
/// Reports, as a combination of state flags, what this index knows about the page at the given URL.
/// </summary>
/// <param name="url">URL of the page; it is hashed with <c>md5.GetMd5Hash</c> to obtain the lookup key.</param>
/// <returns>
/// <c>notInTheIndex</c> when the table holds no entry for the hashed URL (the URL is then also recorded in
/// <c>urlsNotInIndex</c>); otherwise <c>inTheIndex</c> combined with the evaluation flags that match the
/// entry's <c>relevancyText</c>.
/// </returns>
public indexPageEvaluationEntryState GetPageAssertion(string url)
{
    string hashKey = md5.GetMd5Hash(url);

    if (!ContainsKey(hashKey))
    {
        urlsNotInIndex.AddUnique(url);
        return indexPageEvaluationEntryState.notInTheIndex;
    }

    var entry = GetOrCreate(hashKey);
    string relevancy = entry.relevancyText;

    bool hasEvaluation;
    indexPageEvaluationEntryState evaluationFlags;

    if (relevancy == "isRelevant")
    {
        hasEvaluation = true;
        evaluationFlags = indexPageEvaluationEntryState.haveEvaluationEntry | indexPageEvaluationEntryState.isRelevant;
    }
    else if (relevancy == "notRelevant")
    {
        hasEvaluation = true;
        evaluationFlags = indexPageEvaluationEntryState.haveEvaluationEntry | indexPageEvaluationEntryState.notRelevant;
    }
    else
    {
        // Any other relevancy text (including none) counts as "no evaluation available"
        hasEvaluation = false;
        evaluationFlags = indexPageEvaluationEntryState.haveNoEvaluationEntry;
    }

    makeStat(hasEvaluation);

    return indexPageEvaluationEntryState.inTheIndex | evaluationFlags;
}
/// <summary>
/// Evaluates every spider link against the index and groups the link URLs by the resulting assertion state.
/// </summary>
/// <param name="urls">The links whose <c>url</c> values are to be checked with <see cref="GetPageAssertion(string)"/>.</param>
/// <returns>A new result object holding each URL registered under the state the index reported for it.</returns>
public indexURLAssertionResult GetUrlAssertion(IEnumerable<spiderLink> urls)
{
    var assertion = new indexURLAssertionResult();

    foreach (var link in urls)
    {
        assertion.Add(GetPageAssertion(link.url), link.url);
    }

    return assertion;
}
/// <summary>
/// Evaluates every URL against the index and registers it, together with its assertion state, in a result object.
/// </summary>
/// <param name="urls">The URLs to be checked with <see cref="GetPageAssertion(string)"/>.</param>
/// <param name="output">Existing result object to append to; when <c>null</c>, a new one is created.</param>
/// <returns>The result object that received the entries: <paramref name="output"/> if it was supplied, otherwise the newly created instance.</returns>
public indexURLAssertionResult GetUrlAssertion(IEnumerable<string> urls, indexURLAssertionResult output = null)
{
    indexURLAssertionResult assertion = (output == null) ? new indexURLAssertionResult() : output;

    foreach (var address in urls)
    {
        assertion.Add(GetPageAssertion(address), address);
    }

    return assertion;
}
/// <summary>
/// Attaches the page to this target and processes its content: extracts and hashes the text, optionally builds
/// the block tree and the term document, determines the evaluation, feeds the token collections of the crawl
/// context and finally raises the <c>OnTargetPageAttached</c> callback.
/// If the given page is already the attached one, nothing is done and <c>false</c> is returned.
/// </summary>
/// <param name="__page">The page to attach.</param>
/// <param name="response">Log builder handed to <c>content.AddTokens</c> when the TF-IDF option is enabled.</param>
/// <param name="targetBlockCount">Block count passed to <c>contentTree.getBlocks</c> when the block-tree option is enabled.</param>
/// <returns>
/// <c>true</c> when <paramref name="__page"/> differed from the currently attached page (even if no HTML document
/// could be obtained and the processing was therefore skipped); <c>false</c> when it was already attached.
/// </returns>
public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
{
    if (page != __page)
    {
        // The page is assigned before the document is requested.
        // NOTE(review): GetHtmlDocument() presumably reads the page field, so the order matters; confirm.
        page = __page;
        HtmlDocument htmlDoc = GetHtmlDocument();
        iterationLoaded = parent.wRecord.iteration;
        if (htmlDoc != null)
        {
            // Text extraction: retrieve the text, HTML-decode it, then hash the decoded text
            XPathNavigator xnav = htmlDoc.DocumentNode.CreateNavigator();
            pageText = xnav.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);
            pageText = WebUtility.HtmlDecode(pageText);
            pageHash = md5.GetMd5Hash(pageText);
            if (parent.wRecord.tRecord.instance.settings.doEnableDLC_BlockTree)
            {
                // Earlier alternative, kept for reference: contentTree = new nodeTree(page.webpage.domain, htmlDoc);
                contentTree = htmlDoc.buildTree(page.webpage.domain);
                contentBlocks = contentTree.getBlocks(targetBlockCount);
                contentBlocks.CalculateScores();
            }
            var ignoreTokens = parent.wRecord.domainInfo.domainWords;
            var preprocessedTokens = parent.wRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);
            if (parent.wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
            {
                // NOTE(review): `as` yields null if AddTable returns something other than a termDocument,
                // and the next line would then throw. `key` is declared outside this method.
                content = parent.dlTargetPageTokens.AddTable(key) as termDocument;
                content.expansion = parent.wRecord.tRecord.instance.settings.TermExpansionForContent;
                content.AddTokens(preprocessedTokens.ToList(), response);
            }
            // NOTE(review): evaluationOk is assigned below but never read in this method.
            bool evaluationOk = false;
            // The initial value is always overwritten by one of the two branches that follow
            indexPageEvaluationEntryState pageState = indexPageEvaluationEntryState.haveNoEvaluationEntry;
            if (imbWEMManager.settings.indexEngine.doIndexFullTrustMode)
            {
                // Full-trust mode: ask the page index what it knows (`url` is declared outside this method)
                pageState = imbWEMManager.index.pageIndexTable.GetPageAssertion(url);
            }
            else
            {
                // Without full trust the index is not consulted, which forces a fresh evaluation below
                pageState = indexPageEvaluationEntryState.notInTheIndex;
            }
            if (pageState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
            {
                // The index already holds an evaluation entry: the evaluator is not run.
                // NOTE(review): result_language receives the value evaluatedLanguage held BEFORE it is
                // overwritten two lines below, and evaluatedLanguage is set to serbian whether or not
                // the index reported isRelevant. Confirm that this is intended.
                evaluation = new multiLanguageEvaluation();
                evaluation.result_language = evaluatedLanguage;
                evaluationOk = pageState.HasFlag(indexPageEvaluationEntryState.isRelevant);
                evaluatedLanguage = basicLanguageEnum.serbian;
            }
            else
            {
                // No stored evaluation entry: evaluate the page text now
                evaluation = parent.wRecord.tRecord.evaluator.evaluate(pageText, ignoreTokens, preprocessedTokens.ToList());
                evaluatedLanguage = evaluation.result_language;
            }
            // Updates of the token collections and relevant-page lists are serialized on RelevantPageLock
            lock (RelevantPageLock)
            {
                // NOTE(review): IsRelevant is declared outside this method; presumably derived from
                // evaluatedLanguage. Verify.
                if (IsRelevant)
                {
                    parent.wRecord.context.targets.termSerbian.AddRange(preprocessedTokens);
                    parent.wRecord.relevantPages.AddUnique(__page.url);
                    parent.wRecord.tRecord.relevantPages.AddUnique(__page.url);
                }
                else
                {
                    parent.wRecord.context.targets.termOther.AddRange(preprocessedTokens);
                }
                // All tokens are collected in termsAll regardless of relevance
                parent.wRecord.context.targets.termsAll.AddRange(preprocessedTokens);
            }
            // Notify the subscriber, if any, that the page was attached (outside the lock).
            // Disabled earlier line, kept for reference: targs.htmlDoc = htmlDoc;
            if (parent.wRecord.context.OnTargetPageAttached != null)
            {
                var targs = new modelSpiderSiteRecordEventArgs(this);
                parent.wRecord.context.OnTargetPageAttached(parent.wRecord, targs);
            }
        }
        // A different page was attached, even when no HTML document was available for processing
        return(true);
    }
    // The same page was already attached
    return(false);
}
/// <summary>
/// Central sample-set operation: makes sure the console state holds a non-empty sample list, filling it from the
/// first source that delivers entries, and optionally exports the list under the given file name.
/// </summary>
/// <remarks>
/// Sources are tried in this order, each one only while the sample list is still empty: page index, domain index,
/// external list file, internal sample file. The steps taken are accumulated as flags and written to the console
/// output, or included in the exception message when the list stays empty.
/// </remarks>
/// <param name="filepath">File name or path: searched for as the external list file, used as the internal sample name and, when not empty, as the export name.</param>
/// <param name="fileHasPriority">When <c>true</c> and an internal sample exists for <paramref name="filepath"/>, that sample is loaded in the internal-sample step.</param>
/// <param name="group_tags">Group tags stored in the console state; when not empty and a project is set, the database-import flag is recorded.</param>
/// <param name="limit">Limit passed on when entries are added from the external list file.</param>
/// <param name="skip">Skip count passed on when entries are added from the external list file.</param>
/// <param name="fromDomainIndex">When not <c>none</c>, domain URLs matching this value are taken from the domain index.</param>
/// <param name="fromPageIndex">When not <c>none</c>, domains of the pages matching this state are taken from the page index.</param>
/// <param name="samplefilename">Name shown in the log message about the external list file; not otherwise used.</param>
/// <exception cref="aceGeneralException">Thrown when the sample list is still empty after all sources were tried.</exception>
public void sampleAcceptAndPrepare(string filepath = "", bool fileHasPriority = false, string group_tags = "", int limit = 0, int skip = 0, indexDomainContentEnum fromDomainIndex = indexDomainContentEnum.none, indexPageEvaluationEntryState fromPageIndex = indexPageEvaluationEntryState.none, string samplefilename = "")
{
    sampleAcceptAndPrepareStates processState = sampleAcceptAndPrepareStates.started;
    analyticConsoleState state = console.state as analyticConsoleState;

    // Locate the external list file, when a file name was supplied
    string sourcePath = filepath;
    if (!filepath.isNullOrEmpty())
    {
        processState |= sampleAcceptAndPrepareStates.filepathArgumentSupplied;
        sourcePath = folder.findFile(filepath, SearchOption.AllDirectories, false);

        // An empty search result means the file was NOT found (the two flags were previously swapped)
        if (sourcePath.isNullOrEmpty())
        {
            processState |= sampleAcceptAndPrepareStates.sourcePathNOTFOUND;
        }
        else
        {
            processState |= sampleAcceptAndPrepareStates.sourcePathDiscovered;
        }
    }

    // `as` yields null when the console state is of another type; without a usable state there is no job either
    if (state == null || state.aRecord == null)
    {
        console.log("You should define job before calling this command!", true);
        return;
    }

    int startSampleCount = 0;
    if (state.sampleList != null)
    {
        startSampleCount = state.sampleList.Count();
        processState |= sampleAcceptAndPrepareStates.existingSampleDetected;
    }
    else
    {
        state.sampleList = new webSiteSimpleSample();
    }

    state.sampleTags = group_tags;
    state.sampleFile = filepath;

    console.response.log("Sample with group_tags=" + group_tags + ", samplename=" + filepath + ", fileHasPriority=" + fileHasPriority + ".");

    // -------------------------- LOADING FROM THE PAGE INDEX
    if (!state.sampleList.Any() && fromPageIndex != indexPageEvaluationEntryState.none)
    {
        processState |= sampleAcceptAndPrepareStates.fromPageIndexImportCalled;

        // Only the domains are needed here; the returned page list is not used
        List<indexDomain> dSample;
        imbWEMManager.index.pageIndexTable.GetPagesAndDomains(fromPageIndex, out dSample);

        if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
        {
            dSample.Randomize();
        }

        state.sampleList.Add(dSample);
    }

    // -------------------------- LOADING FROM THE DOMAIN INDEX
    if (!state.sampleList.Any() && fromDomainIndex != indexDomainContentEnum.none)
    {
        processState |= sampleAcceptAndPrepareStates.fromDomainIndexImportCalled;

        var list = imbWEMManager.index.domainIndexTable.GetDomainUrls(fromDomainIndex);

        if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
        {
            list.Randomize();
        }

        foreach (string str in list)
        {
            state.sampleList.Add(str);
        }
    }

    // -------------------------- LOADING THE EXTERNAL SAMPLE FILE
    if (!state.sampleList.Any())
    {
        if (!sourcePath.isNullOrEmpty())
        {
            processState |= sampleAcceptAndPrepareStates.filesourceFoundForImportFromFile;

            var domainList = sourcePath.openFileToList(true);

            if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
            {
                domainList.Randomize();
            }

            state.sampleList.Add(domainList, skip, limit);

            console.response.log("Sample external file list [" + samplefilename + "] found at [" + sourcePath + "] containing [" + domainList.Count() + "] domains.");
        }
        else
        {
            processState |= sampleAcceptAndPrepareStates.filesourceNOTFOUND_ForImportFromFile;
        }
    }

    // -------------------------- LOADING THE INTERNAL SAMPLE FILE
    if (!state.sampleList.Any() && !filepath.isNullOrEmptyString())
    {
        if (fileHasPriority && sampleExist(filepath))
        {
            processState |= sampleAcceptAndPrepareStates.internalSampleFilesourceLoaded;
            state.sampleList = loadSample(filepath, imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake);
        }
        else if (state.sciProject != null)
        {
            // -------------------------- LOADING FROM THE DATABASE
            if (!group_tags.isNullOrEmpty())
            {
                // Only the flag is recorded: the actual database import is currently disabled.
                // state.sampleList = state.sciProject.getSamples(group_tags.getTokens(), limit, "stamp", 0, imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake);
                processState |= sampleAcceptAndPrepareStates.groupTagsSpecified_databaseImportCalled;
            }
        }
        else
        {
            processState |= sampleAcceptAndPrepareStates.noSciProjectFound;
        }
    }

    // No source delivered anything: report the collected state flags through the exception
    if (!state.sampleList.Any())
    {
        processState |= sampleAcceptAndPrepareStates.sampleListStillEmpty;
        throw new aceGeneralException("Sample creation failed: " + processState.ToString(), null, this, "Sample import failed :: ");
    }

    int AddedSampleCount = state.sampleList.Count - startSampleCount;
    console.log("Added to the sample list [" + AddedSampleCount + "] at current job record [" + state.aRecord.job.name + "]", true);

    // -------------------------- EXPORTING INTO LOCAL XML FILE
    if (!filepath.isNullOrEmpty())
    {
        processState |= sampleAcceptAndPrepareStates.sampleListExported;
        var fi = saveSample(filepath, state.sampleList);
        console.output.log("Sample list exported to: " + fi.Name);
        state.sampleFile = fi.Name;
    }

    console.output.AppendLine("--- loged sample import procedure states: [" + processState.ToString() + "]");
}