///// <summary> ///// Sets the site tf compiled. ///// </summary> ///// <param name="compiledTF">The compiled tf.</param> //public void SetSiteTFCompiled(termDocumentSet compiledTF, String domain) //{ // indexDomain idomain = GetDomain(domain); // //objectSerialization.saveObjectToXML(compiledTF.GetAggregateDataTable(), GetTFFileName(idomain)); // compiledTF.AggregateDocument.name = idomain.HashCode; // /* // IWeightTable domainDocument = imbWEMManager.index.experimentManager.globalTFIDFSet.AddTable(idomain.HashCode); // domainDocument.AddExternalDocument(compiledTF.AggregateDocument, true); // */ // AddOrUpdate(domain); //} ///// <summary> ///// Gets the site tf for build. ///// </summary> ///// <param name="domainName">Name of the domain.</param> ///// <returns></returns> //public webSitePageTFSet GetSiteTFForBuild(String domainName) //{ // indexDomain idomain = GetDomain(domainName); // String path = GetTFFileName(idomain); // webSitePageTFSet output = new webSitePageTFSet(domainName, "TFIDF table set for this domain"); // return output; //} /* * /// <summary> * /// Provides feedback if the precompiled TF-IDF exists * /// </summary> * /// <param name="domainName">Name of the domain.</param> * /// <returns></returns> * public Boolean GetSiteLemmaTFExists(String domainName) * { * indexDomain idomain = GetDomain(domainName); * * * * //String file = tf_folder.findFile("*" + idomain.HashCode + "*.xml"); * idomain.TFIDFcompiled = !file.isNullOrEmpty(); * AddOrUpdate(idomain); * * return idomain.TFIDFcompiled; * } */ ///// <summary> ///// Gets the site lemma tf for use. 
///// </summary>
///// <param name="domainName">Name of the domain.</param>
///// <returns></returns>
//public weightTableCompiled GetSiteLemmaTFForUse(String domainName)
//{
//    indexDomain idomain = GetDomain(domainName);
//    FileInfo file = //tf_folder.findFile("*" + idomain.HashCode + "*.xml");
//    Boolean found = !file.isNullOrEmpty();
//    if (found)
//    {
//        weightTableCompiled tf_lemmaCompiled = new weightTableCompiled(file, true, domainName);
//        return tf_lemmaCompiled;
//    } else
//    {
//        //String path = GetTFFileName(idomain);
//        //bSitePageTFSet output = new webSitePageTFSet(domainName, "");
//        return null;
//    }
//}

/// <summary>
/// Collects the <c>url</c> of every domain entry in this table whose <c>contentType</c>
/// is exactly equal to <paramref name="contentType"/>.
/// </summary>
/// <param name="contentType">Content type an entry must equal (equality test, not a flag test).</param>
/// <returns>A newly created list with the matching URLs; empty when no entry matches.</returns>
public List<string> GetDomainUrls(indexDomainContentEnum contentType)
{
    // NOTE(review): unlike GetDomains, indexDomainContentEnum.any is NOT treated as a
    // wildcard here -- confirm whether that difference is intended.
    List<string> urls = new List<string>();

    foreach (indexDomain iDomain in this)
    {
        if (iDomain.contentType == contentType)
        {
            urls.Add(iDomain.url);
        }
    }

    return urls;
}
/// <summary>
/// Returns the domain entries of this table that match the requested content type.
/// </summary>
/// <param name="contentType">
/// Content type to select. <c>indexDomainContentEnum.any</c> selects every entry; any other
/// value selects only entries whose <c>contentType</c> equals it.
/// </param>
/// <returns>A newly created list with the selected entries, in enumeration order.</returns>
public List<indexDomain> GetDomains(indexDomainContentEnum contentType)
{
    // Evaluated once: with "any" the per-entry content type is never inspected.
    bool takeEverything = contentType == indexDomainContentEnum.any;
    List<indexDomain> selected = new List<indexDomain>();

    foreach (indexDomain entry in this)
    {
        if (takeEverything || entry.contentType == contentType)
        {
            selected.Add(entry);
        }
    }

    return selected;
}
/// <summary>
/// Performs domain index assertion: for each requested domain it builds a set of
/// <c>indexDomainContentEnum</c> flags describing whether the domain is indexed, whether all its
/// indexed pages have an evaluation entry, whether its DLC TF-IDF file exists, and whether
/// TF-IDF data appears to be applied to its pages.
/// </summary>
/// <param name="domainList">
/// Domains to check. When <c>null</c> or empty, every domain returned by <c>GetList()</c> is checked;
/// note that an empty list supplied by the caller is filled in place.
/// </param>
/// <param name="completeRecheck">if set to <c>true</c>, <c>recheck</c> is called on each found domain with its page list.</param>
/// <returns>The assertion result, with one (flags, domain) entry added per processed domain.</returns>
public indexDomainAssertionResult GetDomainIndexAssertion(List<string> domainList = null, bool completeRecheck = false)
{
    indexDomainAssertionResult output = new indexDomainAssertionResult();

    List<indexDomain> iList = GetList();

    if (domainList == null)
    {
        domainList = new List<string>();
    }

    if (!domainList.Any())
    {
        iList.ForEach(x => domainList.Add(x.domain));
    }

    foreach (string domainUrl in domainList)
    {
        indexDomain idomain = GetDomain(domainUrl);

        if (idomain == null)
        {
            // Domain unknown to the index: report it with no flags set.
            output.Add(indexDomainContentEnum.none, domainUrl);
            continue;
        }

        indexDomainContentEnum flags = indexDomainContentEnum.indexed;

        List<indexPage> pageList = imbWEMManager.index.pageIndexTable.GetPagesForDomain(domainUrl);

        if (completeRecheck)
        {
            idomain.recheck(pageList);
        }

        List<string> pageUrls = new List<string>();
        pageList.ForEach(x => pageUrls.Add(x.url));

        indexURLAssertionResult pageListResult = imbWEMManager.index.pageIndexTable.GetUrlAssertion(pageUrls);

        // Evaluation is complete when every page found in the index also has an evaluation entry.
        if (pageListResult[indexPageEvaluationEntryState.inTheIndex].Count() == pageListResult[indexPageEvaluationEntryState.haveEvaluationEntry].Count())
        {
            flags |= indexDomainContentEnum.completeEvaluationPages;
        }
        else
        {
            flags |= indexDomainContentEnum.uncompleteEvaluationPages;
        }

        FileInfo dlc_tf_idf = imbWEMManager.index.experimentManager.CurrentSession.GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

        // An existing DLC TF-IDF file means the domain TF-IDF has been compiled.
        // (The previous code had these two branches swapped.)
        if (dlc_tf_idf.Exists)
        {
            flags |= indexDomainContentEnum.completeDomainTFIDF;
            idomain.TFIDFcompiled = true;
        }
        else
        {
            flags |= indexDomainContentEnum.uncompleteDomainTFIDF;
            idomain.TFIDFcompiled = false;
        }

        // A page with no lemmas, zero InfoPrize and no distinct lemmas marks the
        // TF-IDF application as incomplete; one such page is enough.
        bool appUncomplete = false;

        foreach (indexPage p in pageList)
        {
            if ((p.Lemmas == 0) && (p.InfoPrize == 0) && (p.DistinctLemmas.isNullOrEmpty()))
            {
                appUncomplete = true;
                break;
            }
        }

        if (appUncomplete)
        {
            flags |= indexDomainContentEnum.uncompleteTFDFApplicationToPages;
        }
        else
        {
            flags |= indexDomainContentEnum.completeTFDFApplicationToPages;
        }

        output.Add(flags, domainUrl);

        // The updated TFIDFcompiled value is not written back to the table here:
        // AddOrUpdate(idomain);
    }

    return output;
}
/// <summary>
/// Samples the accept and prepare -- central sample set operation.
/// If the console state has no sample list (or an empty one), the list is populated from the
/// first source that yields entries, tried in this order: page index, domain index, external
/// sample file, internal sample file. The steps taken are accumulated in a
/// <c>sampleAcceptAndPrepareStates</c> value that is logged at the end.
/// </summary>
/// <param name="filepath">Sample file name/path; searched for inside <c>folder</c>, and also used as the export name.</param>
/// <param name="fileHasPriority">if set to <c>true</c>, an existing internal sample file is loaded via <c>loadSample</c>.</param>
/// <param name="group_tags">The group tags; stored in <c>state.sampleTags</c>.</param>
/// <param name="limit">The limit; forwarded only when importing from the external sample file.</param>
/// <param name="skip">The skip; forwarded only when importing from the external sample file.</param>
/// <param name="fromDomainIndex">When not <c>none</c>, domain URLs with this content type are imported from the domain index.</param>
/// <param name="fromPageIndex">When not <c>none</c>, domains are imported from the page index for this evaluation state.</param>
/// <param name="samplefilename">The samplefilename; used only in the log message of the external file import.</param>
/// <exception cref="aceGeneralException">Thrown when the sample list is still empty after all sources were tried.</exception>
public void sampleAcceptAndPrepare(string filepath = "", bool fileHasPriority = false, string group_tags = "", int limit = 0, int skip = 0, indexDomainContentEnum fromDomainIndex = indexDomainContentEnum.none, indexPageEvaluationEntryState fromPageIndex = indexPageEvaluationEntryState.none, string samplefilename = "")
{
    sampleAcceptAndPrepareStates processState = sampleAcceptAndPrepareStates.started;

    // NOTE(review): 'as' yields null if console.state is not an analyticConsoleState,
    // in which case the state.aRecord access below throws -- confirm this cannot happen.
    analyticConsoleState state = console.state as analyticConsoleState;

    string sourcePath = filepath;

    if (!filepath.isNullOrEmpty())
    {
        processState |= sampleAcceptAndPrepareStates.filepathArgumentSupplied;

        sourcePath = folder.findFile(filepath, SearchOption.AllDirectories, false);

        // An empty result means the file was not located.
        // (The previous code had these two flags swapped.)
        if (sourcePath.isNullOrEmpty())
        {
            processState |= sampleAcceptAndPrepareStates.sourcePathNOTFOUND;
        }
        else
        {
            processState |= sampleAcceptAndPrepareStates.sourcePathDiscovered;
        }
    }

    if (state.aRecord == null)
    {
        console.log("You should define job before calling this command!", true);
        return;
    }

    int startSampleCount = 0;
    if (state.sampleList != null)
    {
        startSampleCount = state.sampleList.Count();
        processState |= sampleAcceptAndPrepareStates.existingSampleDetected;
    }
    else
    {
        state.sampleList = new webSiteSimpleSample();
    }

    state.sampleTags = group_tags;
    state.sampleFile = filepath;

    console.response.log("Sample with group_tags=" + group_tags + ", samplename=" + filepath + ", fileHasPriority=" + fileHasPriority + ".");

    // -------------------------- LOADING FROM THE PAGE INDEX
    if (!state.sampleList.Any())
    {
        if (fromPageIndex != indexPageEvaluationEntryState.none)
        {
            processState |= sampleAcceptAndPrepareStates.fromPageIndexImportCalled;

            // Only the domains delivered through the out argument are used; the returned pages are ignored.
            List<indexDomain> dSample;
            imbWEMManager.index.pageIndexTable.GetPagesAndDomains(fromPageIndex, out dSample);

            if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
            {
                dSample.Randomize();
            }

            state.sampleList.Add(dSample);
        }
    }

    // -------------------------- LOADING FROM THE DOMAIN INDEX
    if (!state.sampleList.Any())
    {
        if (fromDomainIndex != indexDomainContentEnum.none)
        {
            processState |= sampleAcceptAndPrepareStates.fromDomainIndexImportCalled;

            var list = imbWEMManager.index.domainIndexTable.GetDomainUrls(fromDomainIndex);

            if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
            {
                list.Randomize();
            }

            foreach (string str in list)
            {
                state.sampleList.Add(str);
            }
        }
    }

    // -------------------------- LOADING THE EXTERNAL SAMPLE FILE
    if (!state.sampleList.Any())
    {
        if (!sourcePath.isNullOrEmpty())
        {
            processState |= sampleAcceptAndPrepareStates.filesourceFoundForImportFromFile;

            var domainList = sourcePath.openFileToList(true);

            if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
            {
                domainList.Randomize();
            }

            state.sampleList.Add(domainList, skip, limit);

            console.response.log("Sample external file list [" + samplefilename + "] found at [" + sourcePath + "] containing [" + domainList.Count() + "] domains.");
        }
        else
        {
            processState |= sampleAcceptAndPrepareStates.filesourceNOTFOUND_ForImportFromFile;
        }
    }

    // -------------------------- LOADING THE INTERNAL SAMPLE FILE
    if (!state.sampleList.Any())
    {
        if (!filepath.isNullOrEmptyString())
        {
            if (fileHasPriority && sampleExist(filepath))
            {
                processState |= sampleAcceptAndPrepareStates.internalSampleFilesourceLoaded;
                state.sampleList = loadSample(filepath, imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake);
            }
            else
            {
                // -------------------------- LOADING FROM THE DATABASE
                if (state.sciProject != null)
                {
                    if (!group_tags.isNullOrEmpty())
                    {
                        // Only the state flag is recorded; the database import itself is disabled:
                        // state.sampleList = state.sciProject.getSamples(group_tags.getTokens(), limit, "stamp", 0, imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake);
                        processState |= sampleAcceptAndPrepareStates.groupTagsSpecified_databaseImportCalled;
                    }
                }
                else
                {
                    processState |= sampleAcceptAndPrepareStates.noSciProjectFound;
                }
            }
        }
    }

    if (!state.sampleList.Any())
    {
        processState |= sampleAcceptAndPrepareStates.sampleListStillEmpty;

        throw new aceGeneralException("Sample creation failed: " + processState.ToString(), null, this, "Sample import failed :: ");
    }

    int AddedSampleCount = state.sampleList.Count - startSampleCount;

    console.log("Added to the sample list [" + AddedSampleCount + "] at current job record [" + state.aRecord.job.name + "]", true);

    // -------------------------- EXPORTING INTO LOCAL XML FILE
    if (!filepath.isNullOrEmpty())
    {
        processState |= sampleAcceptAndPrepareStates.sampleListExported;

        var fi = saveSample(filepath, state.sampleList);
        console.output.log("Sample list exported to: " + fi.Name);
        state.sampleFile = fi.Name;
    }

    console.output.AppendLine("--- loged sample import procedure states: [" + processState.ToString() + "]");
}