示例#1
0
        ///// <summary>
        ///// Sets the site tf compiled.
        ///// </summary>
        ///// <param name="compiledTF">The compiled tf.</param>
        //public void SetSiteTFCompiled(termDocumentSet compiledTF, String domain)
        //{
        //    indexDomain idomain = GetDomain(domain);


        //    //objectSerialization.saveObjectToXML(compiledTF.GetAggregateDataTable(), GetTFFileName(idomain));

        //    compiledTF.AggregateDocument.name = idomain.HashCode;

        //    /*
        //    IWeightTable domainDocument = imbWEMManager.index.experimentManager.globalTFIDFSet.AddTable(idomain.HashCode);

        //    domainDocument.AddExternalDocument(compiledTF.AggregateDocument, true);
        //    */


        //    AddOrUpdate(domain);
        //}

        ///// <summary>
        ///// Gets the site tf for build.
        ///// </summary>
        ///// <param name="domainName">Name of the domain.</param>
        ///// <returns></returns>
        //public webSitePageTFSet GetSiteTFForBuild(String domainName)
        //{
        //    indexDomain idomain = GetDomain(domainName);
        //    String path = GetTFFileName(idomain);
        //    webSitePageTFSet output = new webSitePageTFSet(domainName, "TFIDF table set for this domain");
        //    return output;
        //}

/*
 *      /// <summary>
 *      /// Provides feedback if the precompiled TF-IDF exists
 *      /// </summary>
 *      /// <param name="domainName">Name of the domain.</param>
 *      /// <returns></returns>
 *      public Boolean GetSiteLemmaTFExists(String domainName)
 *      {
 *          indexDomain idomain = GetDomain(domainName);
 *
 *
 *
 *          //String file = tf_folder.findFile("*" + idomain.HashCode + "*.xml");
 *          idomain.TFIDFcompiled = !file.isNullOrEmpty();
 *          AddOrUpdate(idomain);
 *
 *          return idomain.TFIDFcompiled;
 *      }
 */
        ///// <summary>
        ///// Gets the site lemma tf for use.
        ///// </summary>
        ///// <param name="domainName">Name of the domain.</param>
        ///// <returns></returns>
        //public weightTableCompiled GetSiteLemmaTFForUse(String domainName)
        //{
        //    indexDomain idomain = GetDomain(domainName);

        //    FileInfo file =  //tf_folder.findFile("*" + idomain.HashCode + "*.xml");

        //    Boolean found = !file.isNullOrEmpty();
        //    if (found)
        //    {
        //        weightTableCompiled tf_lemmaCompiled = new weightTableCompiled(file, true, domainName);

        //        return tf_lemmaCompiled;
        //    } else
        //    {
        //        //String path = GetTFFileName(idomain);

        //        //bSitePageTFSet output = new webSitePageTFSet(domainName, "");
        //        return null;
        //    }
        //}


        /// <summary>
        /// Collects the <c>url</c> of every domain entry in this table whose <c>contentType</c> equals the requested value.
        /// </summary>
        /// <param name="contentType">Content type an entry must have to be included. The comparison is plain equality; no value is treated as a wildcard here.</param>
        /// <returns>A new list with the urls of the matching entries, in enumeration order; empty when nothing matches.</returns>
        public List <string> GetDomainUrls(indexDomainContentEnum contentType)
        {
            List <string> urls = new List <string>();

            foreach (indexDomain iDomain in this)
            {
                if (iDomain.contentType == contentType)
                {
                    urls.Add(iDomain.url);
                }
            }
            return(urls);
        }
示例#2
0
        /// <summary>
        /// Returns the domain entries of this table that match the given content type.
        /// </summary>
        /// <param name="contentType">Content type to select; <c>indexDomainContentEnum.any</c> selects every entry, any other value requires equality with the entry's <c>contentType</c>.</param>
        /// <returns>A new list with the selected entries, in enumeration order.</returns>
        public List <indexDomain> GetDomains(indexDomainContentEnum contentType)
        {
            // The wildcard test does not depend on the entry, so it is evaluated once up front.
            bool takeAll = contentType == indexDomainContentEnum.any;
            List <indexDomain> matches = new List <indexDomain>();

            foreach (indexDomain entry in this)
            {
                if (takeAll || entry.contentType == contentType)
                {
                    matches.Add(entry);
                }
            }

            return matches;
        }
示例#3
0
        /// <summary>
        /// Performs domain index assertion: for each requested domain it builds a set of <c>indexDomainContentEnum</c> flags
        /// describing the state of its index entry, its pages and its TF-IDF file, and registers the flags in the result.
        /// </summary>
        /// <remarks>
        /// Domains that have no entry in this table are registered with <c>indexDomainContentEnum.none</c>.
        /// <c>TFIDFcompiled</c> is updated on the in-memory domain entry only; this method does not call <c>AddOrUpdate</c>.
        /// </remarks>
        /// <param name="domainList">Domains to assert. When <c>null</c> or empty, all domains returned by <c>GetList()</c> are asserted; a supplied empty list is filled in place.</param>
        /// <param name="completeRecheck">if set to <c>true</c>, <c>recheck</c> is called on each found domain with its page list before the assertion.</param>
        /// <returns>The assertion result holding one (flags, domain) registration per processed domain.</returns>
        public indexDomainAssertionResult GetDomainIndexAssertion(List <string> domainList = null, bool completeRecheck = false)
        {
            indexDomainAssertionResult output = new indexDomainAssertionResult();
            List <indexDomain>         iList  = GetList();

            if (domainList == null)
            {
                domainList = new List <string>();
            }
            if (!domainList.Any())
            {
                // No explicit selection: fall back to every domain currently in the table.
                iList.ForEach(x => domainList.Add(x.domain));
            }

            foreach (string domainUrl in domainList)
            {
                indexDomain idomain = GetDomain(domainUrl);
                if (idomain == null)
                {
                    // Unknown domain: report it without any flag set.
                    output.Add(indexDomainContentEnum.none, domainUrl);
                    continue;
                }

                indexDomainContentEnum flags = indexDomainContentEnum.indexed;

                List <indexPage> pageList = imbWEMManager.index.pageIndexTable.GetPagesForDomain(domainUrl);
                if (completeRecheck)
                {
                    idomain.recheck(pageList);
                }

                List <string> pageUrls = new List <string>();
                pageList.ForEach(x => pageUrls.Add(x.url));
                indexURLAssertionResult pageListResult = imbWEMManager.index.pageIndexTable.GetUrlAssertion(pageUrls);

                // Evaluation is complete when every page found in the index also has an evaluation entry.
                if (pageListResult[indexPageEvaluationEntryState.inTheIndex].Count() == pageListResult[indexPageEvaluationEntryState.haveEvaluationEntry].Count())
                {
                    flags |= indexDomainContentEnum.completeEvaluationPages;
                }
                else
                {
                    flags |= indexDomainContentEnum.uncompleteEvaluationPages;
                }

                FileInfo dlc_tf_idf = imbWEMManager.index.experimentManager.CurrentSession.GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

                // An existing TF-IDF file means the domain TF-IDF has been compiled
                // (the branches were previously swapped, reporting the opposite state).
                if (dlc_tf_idf.Exists)
                {
                    flags |= indexDomainContentEnum.completeDomainTFIDF;

                    idomain.TFIDFcompiled = true;
                }
                else
                {
                    flags |= indexDomainContentEnum.uncompleteDomainTFIDF;

                    idomain.TFIDFcompiled = false;
                }

                // A page with no lemma count, no InfoPrize and no distinct lemmas has not had TF-IDF applied;
                // a single such page marks the whole domain as incomplete, so the scan can stop there.
                bool appUncomplete = false;
                foreach (indexPage p in pageList)
                {
                    if ((p.Lemmas == 0) && (p.InfoPrize == 0) && (p.DistinctLemmas.isNullOrEmpty()))
                    {
                        appUncomplete = true;
                        break;
                    }
                }

                if (appUncomplete)
                {
                    flags |= indexDomainContentEnum.uncompleteTFDFApplicationToPages;
                }
                else
                {
                    flags |= indexDomainContentEnum.completeTFDFApplicationToPages;
                }

                output.Add(flags, domainUrl);
            }

            return(output);
        }
示例#4
0
        /// <summary>
        /// Samples the accept and prepare -- central sample set operation. Fills <c>state.sampleList</c> of the current
        /// console state from the first source that yields entries and, when a file path is given, exports the result.
        /// </summary>
        /// <remarks>
        /// Sources are tried in this order, each one only while the sample list is still empty:
        /// page index, domain index, external file located under <c>folder</c>, internal sample file.
        /// Every step taken is recorded as a flag in a <c>sampleAcceptAndPrepareStates</c> value that is written to the
        /// console output at the end and included in the exception message on failure.
        /// The command logs a message and returns without doing anything else when no job record is defined.
        /// An <c>aceGeneralException</c> is thrown when the sample list is still empty after all sources were tried.
        /// </remarks>
        /// <param name="filepath">Sample file name/path: searched for under <c>folder</c> for import, used as internal sample name, and used as export target.</param>
        /// <param name="fileHasPriority">if set to <c>true</c>, an existing internal sample with this name is loaded; otherwise only the database branch flags are evaluated.</param>
        /// <param name="group_tags">The group tags; stored in the state and used to flag the (currently disabled) database import.</param>
        /// <param name="limit">The limit, applied only when importing from the external file.</param>
        /// <param name="skip">The skip, applied only when importing from the external file.</param>
        /// <param name="fromDomainIndex">Domain content type to import from the domain index; <c>none</c> disables this source.</param>
        /// <param name="fromPageIndex">Page evaluation state to import from the page index; <c>none</c> disables this source.</param>
        /// <param name="samplefilename">The samplefilename; only used in the log message of the external file import.</param>
        public void sampleAcceptAndPrepare(string filepath       = "", bool fileHasPriority                  = false, string group_tags = "", int limit = 0,
                                           int skip              = 0, indexDomainContentEnum fromDomainIndex = indexDomainContentEnum.none, indexPageEvaluationEntryState fromPageIndex = indexPageEvaluationEntryState.none,
                                           string samplefilename = "")
        {
            sampleAcceptAndPrepareStates processState = sampleAcceptAndPrepareStates.started;

            // NOTE(review): the `as` cast yields null when console.state is of another type, and the
            // first member access below would then fail -- confirm the state type is guaranteed by the caller.
            analyticConsoleState state = console.state as analyticConsoleState;

            string sourcePath = filepath;

            if (!filepath.isNullOrEmpty())
            {
                processState |= sampleAcceptAndPrepareStates.filepathArgumentSupplied;
                sourcePath    = folder.findFile(filepath, SearchOption.AllDirectories, false);

                // An empty search result means the file was not located
                // (the two flags were previously swapped).
                if (sourcePath.isNullOrEmpty())
                {
                    processState |= sampleAcceptAndPrepareStates.sourcePathNOTFOUND;
                }
                else
                {
                    processState |= sampleAcceptAndPrepareStates.sourcePathDiscovered;
                }
            }

            if (state.aRecord == null)
            {
                console.log("You should define job before calling this command!", true);
                return;
            }

            // Remember how many entries were already present so only the newly added ones are reported.
            int startSampleCount = 0;

            if (state.sampleList != null)
            {
                startSampleCount = state.sampleList.Count();
                processState    |= sampleAcceptAndPrepareStates.existingSampleDetected;
            }
            else
            {
                state.sampleList = new webSiteSimpleSample();
            }


            state.sampleTags = group_tags;
            state.sampleFile = filepath;
            console.response.log("Sample with group_tags=" + group_tags + ", samplename=" + filepath + ", fileHasPriority=" + fileHasPriority + ".");

            // ==============================================================================================
            if (!state.sampleList.Any())
            {
                if (fromPageIndex != indexPageEvaluationEntryState.none) // -------------------------- LOADING FROM THE PAGE INDEX
                {
                    processState |= sampleAcceptAndPrepareStates.fromPageIndexImportCalled;
                    List <indexDomain> dSample;

                    // Only the domains delivered through the out parameter are needed; the returned pages are not used.
                    imbWEMManager.index.pageIndexTable.GetPagesAndDomains(fromPageIndex, out dSample);
                    if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
                    {
                        dSample.Randomize();
                    }
                    state.sampleList.Add(dSample);
                }
            }

            // ==============================================================================================
            if (!state.sampleList.Any())
            {
                if (fromDomainIndex != indexDomainContentEnum.none) // -------------------------- LOADING FROM THE DOMAIN INDEX
                {
                    processState |= sampleAcceptAndPrepareStates.fromDomainIndexImportCalled;
                    var list = imbWEMManager.index.domainIndexTable.GetDomainUrls(fromDomainIndex);
                    if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
                    {
                        list.Randomize();
                    }
                    foreach (string str in list)
                    {
                        state.sampleList.Add(str);
                    }
                }
            }

            if (!state.sampleList.Any())  // -------------------------- LOADING THE EXTERNAL SAMPLE FILE
            {
                if (!sourcePath.isNullOrEmpty())
                {
                    processState |= sampleAcceptAndPrepareStates.filesourceFoundForImportFromFile;
                    var domainList = sourcePath.openFileToList(true);
                    if (imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake)
                    {
                        domainList.Randomize();
                    }
                    state.sampleList.Add(domainList, skip, limit);
                    console.response.log("Sample external file list [" + samplefilename + "] found at [" + sourcePath + "] containing [" + domainList.Count() + "] domains.");
                }
                else
                {
                    processState |= sampleAcceptAndPrepareStates.filesourceNOTFOUND_ForImportFromFile;
                }
            }

            if (!state.sampleList.Any()) // -------------- LOADING THE INTERNAL SAMPLE FILE
            {
                if (!filepath.isNullOrEmptyString())
                {
                    if (fileHasPriority && sampleExist(filepath))
                    {
                        processState    |= sampleAcceptAndPrepareStates.internalSampleFilesourceLoaded;
                        state.sampleList = loadSample(filepath, imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleTake);
                    }
                    else
                    {
                        if (state.sciProject != null) // -------------------------- LOADING FROM THE DATABASE
                        {
                            if (!group_tags.isNullOrEmpty())
                            {
                                // The database import itself is currently disabled; only the state flag is recorded.
                                processState |= sampleAcceptAndPrepareStates.groupTagsSpecified_databaseImportCalled;
                            }
                        }
                        else
                        {
                            processState |= sampleAcceptAndPrepareStates.noSciProjectFound;
                        }
                    }
                }
            }

            if (!state.sampleList.Any())
            {
                // Every source failed: report the collected state flags through the exception message.
                processState |= sampleAcceptAndPrepareStates.sampleListStillEmpty;
                var ace = new aceGeneralException("Sample creation failed: " + processState.ToString(), null, this, "Sample import failed :: ");
                throw ace;
            }

            int AddedSampleCount = state.sampleList.Count - startSampleCount;

            console.log("Added to the sample list [" + AddedSampleCount + "] at current job record [" + state.aRecord.job.name + "]", true);


            // ==============================================================================================

            if (!filepath.isNullOrEmpty()) // -------------------------- EXPORTING INTO LOCAL XML FILE
            {
                processState |= sampleAcceptAndPrepareStates.sampleListExported;
                var fi = saveSample(filepath, state.sampleList);
                console.output.log("Sample list exported to: " + fi.Name);
                state.sampleFile = fi.Name;
            }

            console.output.AppendLine("--- loged sample import procedure states: [" + processState.ToString() + "]");
        }