public fileunit toNewOutput(List <String> content, String name, String extension = "txt")
        {
            fileunit funit = getNewOutput(name, extension);

            funit.setContentLines(content);
            return(funit);
        }
        /// <summary>
        /// Loads the table from the specified filepath
        /// </summary>
        /// <param name="filepath">The filepath.</param>
        public void Load(string filepath, ILogBuilder loger)
        {
            sourceFile = new fileunit(filepath, true);
            int cl = sourceFile.contentLines.Count;
            loger.log("Lexicon terms coding twins definitions: " + cl);

            int lind = 0;
            int lmax = cl / 20;
            lind = lmax;
            LoadCount = 0;
            foreach (string ln in sourceFile.contentLines)
            {
                lind--;

                SetEntryFromString(ln);
                LoadCount++;
                if (lind <= 0)
                {
                    lind = lmax;
                    loger.log("Coding twins loaded: " + LoadCount);
                }
            }

            loger.log("Coding twins completly loaded: " + LoadCount);
        }
        /// <summary>
        /// Saves the output.
        /// </summary>
        /// <param name="file">The file.</param>
        public void saveOutput(fileunit file)
        {
            String filename = Path.GetFileName(file.path);
            String path     = folder[aceCCFolders.output].pathFor(filename);

            file.getContent().saveStringToFile(path, getWritableFileMode.autoRenameExistingOnOtherDate);
        }
示例#4
0
        private void reportTarget(spiderTarget t, folderNode fn, int c)
        {
            string     pageFolder = "P" + c.ToString("D3") + "_" + t.IsRelevant.ToString();
            folderNode pfn        = fn.Add(pageFolder, "Page " + c.ToString(), "Report on page " + t.url + " crawled by " + name + ". Target.IsRelevant: " + t.IsRelevant + ".".addLine(pageDescription));

            fileunit content = new fileunit(pfn.pathFor("content.txt"), false);
            fileunit links   = new fileunit(pfn.pathFor("links.txt"), false);

            if (t.evaluation != null)
            {
                t.evaluation.saveObjectToXML(pfn.pathFor("relevance.xml"));
            }

            content.setContent(t.pageText);
            //t.page.relationship.outflowLinks
            if (t.page != null)
            {
                foreach (spiderLink ln in t.page.relationship.outflowLinks.items.Values)
                {
                    string rl = ln.url;

                    links.Append(ln.url);
                }

                //t.page.webpage.links.ForEach(x => links.Append(x.nature + " | " + x.name + " | " + x.url));
            }
            content.Save();
            links.Save();
            //  marks.Save();
        }
示例#5
0
        public void saveOutput(List <string> content, string path)
        {
            fileunit tLog = new fileunit(path, false);

            tLog.setContentLines(content);
            tLog.Save();
        }
        public fileunit loadInput(String filename)
        {
            String path = folder[aceCCFolders.input].pathFor(filename);
            var    unit = new fileunit(path);

            loadedInputs.AddUnique(unit.info.FullName);
            return(unit);
        }
示例#7
0
 public void GenerateCumulative(int iterations, folderNode folder, string sufix, string prefix = "IT")
 {
     for (int i = 0; i < iterations; i++)
     {
         fileunit iteration = new fileunit(folder.pathFor(prefix + i.ToString("D3").add(sufix, "_").ensureEndsWith(".txt")), false);
         foreach (var p in items)
         {
             iteration.AppendUnique(p.Value[i]);
         }
         iteration.Save();
     }
 }
示例#8
0
        public morphMachine(folderNode node)
        {
            folder = node;

            string file = folder.findFile("lexiconCache_negatives.txt", SearchOption.AllDirectories, false);

            inputFile = new fileunit(file, true);

            string ignorePath = folder.pathFor("morphMachine_ignore.txt");

            ignoreFile = new fileunit(ignorePath, true);

            SetRuleSets();
        }
#pragma warning disable CS1574 // XML comment has cref attribute 'dataException' that could not be resolved
        /// <summary>
        /// Appends the file with the entries of the table
        /// </summary>
        /// <param name="filepath">The filepath.</param>
        /// <exception cref="aceCommonTypes.core.exceptions.dataException">The source file was never defined - cant use save without filepath - null - Save() failed, no filepath</exception>
        public void Append(string filepath = null)
#pragma warning restore CS1574 // XML comment has cref attribute 'dataException' that could not be resolved
        {
            if (sourceFile == null)
            {
                if (imbSciStringExtensions.isNullOrEmptyString(filepath)) throw new dataException("The source file was never defined - cant use save without filepath", null, this, "Save() failed, no filepath");

                sourceFile = new fileunit(filepath, true);
            }

            List<string> entryList = GetEntriesAsStringPairs();
            sourceFile.AppendUnique(entryList);
            sourceFile.Save();
        }
        /// <summary>
        /// Prepares the specified loger.
        /// </summary>
        /// <param name="loger">The loger.</param>
        public void prepare(ILogBuilder loger, folderNode __folder)
        {
            folder = __folder;

            failList       = new fileunit(folder.pathFor(FILE_FAILLIST), true);
            domainFailList = new fileunit(folder.pathFor(FILE_DOMAINFAILLIST), true);
            duplicateList  = new fileunit(folder.pathFor(FILE_DUPLICATE), true);

            foreach (string ln in duplicateList.contentLines)
            {
                var sp = ln.Split(new string[] { "|||" }, StringSplitOptions.RemoveEmptyEntries);
                duplicates.TryAdd(sp[0], sp[1]);
            }
        }
        protected void LoadCacheFiles(ILogBuilder loger, semanticLexiconContext context)
        {
            failedQueries = new fileunit(folder.pathFor("lexiconCache_negatives.txt"), true);

            loger.log("Negative queries loaded");
            AddTemp(failedQueries.contentLines, loger, true, true);
            loger.log("Loading encoding twins");
            twins.Load(twinsSavePath, loger);
            loger.log("Encoding twins loaded");

            if (twins.Count == 0)
            {
                rebuildEncodedTwins(loger, context);
            }

            failedQueries.Save();
        }
示例#12
0
        public void reportCrawler(modelSpiderTestRecord tRecord)
        {
            folderNode fn = folder[DRFolderEnum.crawler];

            string fileprefix = tRecord.instance.name.getCleanFilePath(); //tRecord.name.getCleanFilepath();

            if (REPORT_TIMELINE)
            {
                DataTable timeline = timeSeries.GetAggregatedTable("frontier_stats", dataPointAggregationAspect.overlapMultiTable); //.GetSumTable("timeline_" + fileprefix.Replace(" ", ""));
                timeline.GetReportAndSave(folder[DRFolderEnum.crawler], notation, "frontier_stats" + fileprefix);
            }


            if (REPORT_ITERATION_URLS)
            {
                tRecord.allUrls         = urlsLoaded.GetAllUnique();
                tRecord.allDetectedUrls = urlsDetected.GetAllUnique();

                saveOutput(tRecord.allDetectedUrls, folder[DRFolderEnum.crawler].pathFor("urls_detected.txt"));
                saveOutput(tRecord.allUrls, folder[DRFolderEnum.crawler].pathFor("urls_loaded.txt"));
                saveOutput(tRecord.relevantPages, folder[DRFolderEnum.crawler].pathFor("urls_relevant_loaded.txt"));
            }
            //    Int32 iterations = tRecord.instance.settings.limitIterations;


            DataTable cpuTable  = tRecord.cpuTaker.GetDataTableBase("cpuMetrics").GetReportAndSave(folder[DRFolderEnum.crawler], notation, "cpu_" + fileprefix);
            DataTable dataTable = tRecord.dataLoadTaker.GetDataTableBase("dataLoadMetrics").GetReportAndSave(folder[DRFolderEnum.crawler], notation, "dataload_" + fileprefix);

            DataTable resourcesTable = tRecord.measureTaker.GetDataTableBase("resourceMetrics").GetReportAndSave(folder[DRFolderEnum.crawler], notation, "resource_" + fileprefix);



            if (imbWEMManager.settings.directReportEngine.doPublishPerformance)
            {
                tRecord.performance.folderName = folder.name;
                tRecord.performance.deploy(tRecord);

                tRecord.performance.saveObjectToXML(folder[DRFolderEnum.crawler].pathFor("performance.xml"));

                DataTable pTable = tRecord.performance.GetDataTable(true).GetReportAndSave(folder, notation, "crawler_performance" + fileprefix);
            }

            tRecord.lastDomainIterationTable.GetDataTable(null, imbWEMManager.index.experimentEntry.CrawlID).GetReportAndSave(folder, notation, "DLCs_performance_" + fileprefix);

            tRecord.reporter = this;


            signature.deployReport(tRecord);
            //signature.notation = notation;
            signature.saveObjectToXML(folder.pathFor("signature.xml"));



            folder.generateReadmeFiles(notation);

            fileunit tLog = new fileunit(folder[DRFolderEnum.logs].pathFor(fileprefix + ".txt"), false);

            tLog.setContent(tRecord.logBuilder.ContentToString(true));
            tLog.Save();

            tRecord.instance.reportCrawlFinished(this, tRecord);


            aceLog.consoleControl.setLogFileWriter();
        }
示例#13
0
        /// <summary>
        /// Runs when a DLC is finished
        /// </summary>
        /// <param name="wRecord">The w record.</param>
        public void reportDomainFinished(modelSpiderSiteRecord wRecord)
        {
            folderNode fn = null;

            string fileprefix = wRecord.domainInfo.domainRootName.getCleanFilepath();



            if (imbWEMManager.settings.directReportEngine.doDomainReport)
            {
                fn = folder[DRFolderEnum.sites].Add(wRecord.domainInfo.domainRootName.getCleanFilepath(), "Report on " + wRecord.domainInfo.domainName, "Records on domain " + wRecord.domainInfo.domainName + " crawled by " + name);

                if (REPORT_DOMAIN_TERMS)
                {
                    if (wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
                    {
                        if (wRecord.context.targets.dlTargetPageTokens != null)
                        {
                            wRecord.context.targets.dlTargetPageTokens.GetDataSet(true).serializeDataSet("token_ptkn", fn, dataTableExportEnum.excel, notation);
                        }
                    }
                    if (wRecord.context.targets.dlTargetLinkTokens != null)
                    {
                        wRecord.context.targets.dlTargetLinkTokens.GetDataSet(true).serializeDataSet("token_ltkn", fn, dataTableExportEnum.excel, notation);
                    }
                }

                if (REPORT_DOMAIN_PAGES)
                {
                    int c = 1;
                    foreach (spiderTarget t in wRecord.context.targets.GetLoadedInOrderOfLoad())
                    {
                        reportTarget(t, fn, c);
                        c++;
                    }
                }



                fileunit wLog = new fileunit(folder[DRFolderEnum.logs].pathFor(fileprefix + ".txt"), false);
                wLog.setContent(wRecord.logBuilder.ContentToString(true));

                wLog.Save();


                if (REPORT_ITERATION_URLS)
                {
                    textByIteration url_loaded   = urlsLoaded[wRecord];   //.GetOrAdd(wRecord, new textByIteration());
                    textByIteration url_detected = urlsDetected[wRecord]; //, new textByIteration());


                    fileunit url_ld_out  = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_ld.txt"), false);
                    fileunit url_dt_out  = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_dt.txt"), false);
                    fileunit url_srb_out = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_srb_ld.txt"), false);

                    url_ld_out.setContentLines(url_loaded.GetAllUnique());
                    url_dt_out.setContentLines(url_detected.GetAllUnique());
                    url_srb_out.setContentLines(wRecord.relevantPages);

                    url_ld_out.Save();
                    url_dt_out.Save();
                    url_srb_out.Save();
                }

                //terms_out.Save();
                //sentence_out.Save();
            }

            if (REPORT_MODULES)
            {
                if (wRecord.tRecord.instance is spiderModularEvaluatorBase)
                {
                    wRecord.frontierDLC.reportDomainOut(wRecord, fn, fileprefix);
                }
            }

            if (REPORT_TIMELINE)
            {
                wRecord.iterationTableRecord.GetDataTable(null, "iteration_performace_" + fileprefix).GetReportAndSave(folder[DRFolderEnum.it], notation, "iteration_performace_" + fileprefix); //, notation);
            }

            //if (REPORT_TIMELINE)
            //{
            //    DataTable dt = wRecord.GetTimeSeriesPerformance();
            //    timeSeries.Add(dt);
            //    dt.GetReportAndSave(folder[DRFolderEnum.it], notation, "iteration_frontier_stats_" + fileprefix);
            //}


            wRecord.tRecord.lastDomainIterationTable.Add(wRecord.iterationTableRecord.GetLastEntryTouched());

            wRecord.tRecord.instance.reportDomainFinished(this, wRecord);

            wRecord.Dispose();
        }
示例#14
0
        public void reportIteration(dataUnitSpiderIteration dataUnit, modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase evaluator)
        {
            iterationPerformanceRecord ip_record = new iterationPerformanceRecord(wRecord);

            wRecord.iterationTableRecord.Add(ip_record);


            folderNode fn; //siteRecords[wRecord].Add(dataUnit.iteration.ToString("D3"), wRecord.domainInfo.domainRootName + dataUnit.iteration.ToString("D3"), "Iteration " + dataUnit.iteration + " on domain: " + wRecord.domainInfo.domainName);


            if (imbWEMManager.settings.directReportEngine.doIterationReport)
            {
                if (imbWEMManager.settings.directReportEngine.doDomainReport)
                {
                    fn = getIterationFolder(dataUnit.iteration, wRecord);
                    if (REPORT_WRECORD_LOG)
                    {
                        wRecord.logBuilder.getLastLine().saveStringToFile(fn.pathFor("wrecord.txt"));
                    }



                    string fileprefix = wRecord.domainInfo.domainRootName.getCleanFilepath();


                    textByIteration url_loaded   = urlsLoaded[wRecord];   //.GetOrAdd(wRecord, new textByIteration());
                    textByIteration url_detected = urlsDetected[wRecord]; //, new textByIteration());
                                                                          //textByIteration terms_ext = termsExtracted[wRecord];
                                                                          //textByIteration sentence_ext = sentencesExtracted[wRecord];



                    if (REPORT_MODULES)
                    {
                        if (imbWEMManager.settings.directReportEngine.DR_ReportModules_XMLIteration)
                        {
                            if (wRecord.tRecord.instance is spiderModularEvaluatorBase)
                            {
                                wRecord.frontierDLC.reportIterationOut(wRecord, fn);
                            }
                        }
                    }

                    string its = dataUnit.iteration.ToString("D3");


                    //DataTable dt = wRecord.context.targets.GetDataTable();
                    //dt.SetTitle(fileprefix + "_targets");
                    //dt.serializeDataTable(aceCommonTypes.enums.dataTableExportEnum.csv, "", fn, notation);

                    //sentence_ext[dataUnit.iteration].AddRangeUnique(wRecord.context.targets.blocks.GetHashList());

                    //if (REPORT_ITERATION_TERMS)
                    //{
                    //    fileunit blocks = new fileunit(fn.pathFor(its + "_blc.txt"), false);


                    //    blocks.setContentLines(sentence_ext[dataUnit.iteration]);

                    //    blocks.Save();
                    //}

                    if (REPORT_TIMELINE)
                    {
                        objectSerialization.saveObjectToXML(ip_record, fn.pathFor("performance.xml"));
                    }



                    if (REPORT_ITERATION_URLS)
                    {
                        if (wRecord.iteration > 0)
                        {
                            builderForMarkdown now_loaded = new builderForMarkdown();

                            //fileunit now_loaded = new fileunit(fn.pathFor(its + "_loadedNow.txt"), false);
                            List <spiderTarget> targets_loaded = wRecord.context.targets.GetLoadedInIteration(wRecord.iteration - 1);

                            int tc = 0;
                            foreach (spiderTarget t in targets_loaded)
                            {
                                reportTarget(t, fn, tc);
                                now_loaded.AppendLine(t.url);
                                now_loaded.AppendHorizontalLine();
                                now_loaded.Append(t.marks.GetActiveResults());
                                now_loaded.AppendHorizontalLine();
                                now_loaded.Append(t.marks.GetPassiveResults());
                                now_loaded.AppendHorizontalLine();

                                var dt = t.marks.getHistory(t.url, wRecord.tRecord.instance.name);
                                dt.Save(fn, imbWEMManager.authorNotation, its + "_loadedNow");

                                now_loaded.AppendTable(dt, false);

                                tc++;
                            }

                            now_loaded.ToString().saveStringToFile(fn.pathFor(its + "_loadedNow.txt"));


                            spiderTaskResult loadResults = wRecord.spiderTaskResults[wRecord.iteration - 1];
                            loadResults.getDataTable().GetReportAndSave(fn, notation, "loadResults", true); // .serializeDataTable(aceCommonTypes.enums.dataTableExportEnum.excel, "loadResults", fn, notation);
                        }



                        fileunit detected = new fileunit(fn.pathFor(its + "_dt.txt"), false);
                        fileunit loaded   = new fileunit(fn.pathFor(its + "_ld.txt"), false);

                        fileunit relp = new fileunit(fn.pathFor(its + "_srb_ld.txt"), false);
                        relp.Append(wRecord.relevantPages, true);

                        foreach (spiderTarget t in wRecord.context.targets)
                        {
                            if (t.page != null)
                            {
                                //t.contentBlocks.ForEach(x => sentence_ext[dataUnit.iteration].AddUnique(x.textHash));

                                loaded.Append(t.url);
                                url_loaded[dataUnit.iteration].Add(t.url);
                            }
                            else
                            {
                                detected.Append(t.url);
                                url_detected[dataUnit.iteration].Add(t.url);
                            }
                        }


                        string lineFormat = "{0,5} {1,30} [s:{1,6}]" + Environment.NewLine;

                        fileunit active = new fileunit(fn.pathFor(its + "_act.txt"), false);
                        int      c      = 1;

                        foreach (var lnk in wRecord.web.webActiveLinks)
                        {
                            active.Append(string.Format(lineFormat, c, lnk.url, lnk.marks.score));
                            active.Append(lnk.marks.GetLayerAssociation());
                            c++;
                        }


                        detected.Save();
                        loaded.Save();
                        active.Save();
                    }
                }
            }



            wRecord.tRecord.instance.reportIteration(this, wRecord);
        }