/// <summary>
/// Renders the specified set of WebSiteDocuments into a List of <see cref="TextDocumentSet"/>s
/// </summary>
/// <param name="input">The input.</param>
/// <param name="logger">The logger.</param>
/// <returns>One TextDocumentSet per web site in the input set</returns>
public List<TextDocumentSet> RenderDocumentSet(WebSiteDocumentsSet input, ILogBuilder logger)
{
    List<TextDocumentSet> textSetForLabel = new List<TextDocumentSet>();
    Int32 target = input.Count;
    Int32 ti = 0;

    foreach (WebSiteDocuments webSite in input)
    {
        TextDocumentSet textSet = RenderSiteDocuments(webSite, logger);
        textSetForLabel.Add(textSet);

        ti++;
        Double done = ti.GetRatio(target);
        logger.Append(" [" + done.ToString("P2") + "] ");
    }

    return textSetForLabel;
}
/// <summary>
/// Constructs feature vectors for all documents in the vector space.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
public void FeatureVectorConstruction(OperationContext context, ILogBuilder log)
{
    // deploying the feature vector space constructor
    featureSpaceConstructor.Deploy(constructorSettings, context.vectorSpace);
    featureSpaceConstructor.Deploy(constructorSettings, context.SelectedFeatures.GetKeys());

    Int32 i = 0;
    Int32 s = 100;

    foreach (IVector vector in context.vectorSpace.documents)
    {
        var lab = designateSpaceLabel(context, vector);

        FeatureVector fv = featureSpaceConstructor.ConstructFeatureVector(vector);

        context.featureVectorByName.Add(vector.name, fv);
        context.featureSpace.documents.Add(fv);
        context.featureSpace.labelToDocumentAssociations.Add(fv, lab, 1);

        if (i % s == 0)
        {
            Double r = i.GetRatio(context.spaceModel.documents.Count);
            log.log("Building feature vectors [" + r.ToString("P2") + "] : [" + i + "/" + context.vectorSpace.documents.Count + "]");
        }
        i++;
    }

    log.log("Feature vector construction [" + context.featureSpace.documents.Count + "] done");
}
/// <summary>
/// Computes the continual overlap ratio: the longest run of consecutive matches, starting where A first aligns with the head of B, divided by the longer list's length.
/// </summary>
/// <param name="A">First token sequence.</param>
/// <param name="B">Second token sequence.</param>
/// <returns>Continual overlap ratio in [0, 1]</returns>
private static Double getContinualOverlapR(List<String> A, List<String> B)
{
    Int32 cc = 0;
    Boolean synced = false;
    Int32 start = 0;

    if (A.Count == 0 || B.Count == 0) return 0;

    // find the first position in A that matches the head of B
    for (int a_i = 0; a_i < A.Count; a_i++)
    {
        if (A[a_i] == B.First())
        {
            start = a_i;
            synced = true;
            break;
        }
    }

    if (synced)
    {
        // count consecutive matches, aligning A[start + k] against B[k]
        for (int k = 0; (start + k) < A.Count && k < B.Count; k++)
        {
            if (A[start + k] == B[k])
            {
                cc++;
            }
            else
            {
                break;
            }
        }
    }

    return cc.GetRatio(Math.Max(A.Count, B.Count));
}
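// A minimal usage sketch for the method above (hypothetical inputs; the call ignores the
// private accessibility of the method, and assumes GetRatio performs safe Int32-to-Double division):
//   A = [x, a, b, c], B = [a, b, z]
//   B's head "a" is found at A[1]; the consecutive matches are "a", "b" (2 of them),
//   so the ratio is 2 / max(4, 3) = 0.5.
List<String> A = new List<String> { "x", "a", "b", "c" };
List<String> B = new List<String> { "a", "b", "z" };
Double overlap = getContinualOverlapR(A, B); // expected: 0.5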
/// <summary>
/// Walks up to <c>steps</c> ancestor div nodes and returns the table-row selection with the best row/column ratio.
/// </summary>
public List<HtmlNode> AdaptiveRowSelection(HtmlNode divNode, Int32 steps = 5)
{
    Dictionary<Double, List<HtmlNode>> selectionByRatio = new Dictionary<double, List<HtmlNode>>();
    HtmlNode head = divNode;
    Double bestRatio = Double.MinValue;
    List<HtmlNode> bestSelection = null;

    for (int i = 0; i < steps; i++)
    {
        if (head == null) break;
        if (!head.Name.Equals("div", StringComparison.InvariantCultureIgnoreCase)) break;

        List<HtmlNode> html_tablerows = head.SelectNodesInDepthRange(
            x => x.Name.Equals(TableSelectionTag, StringComparison.InvariantCultureIgnoreCase),
            TableSelectionDepthLimit, TableSelectionDepthStart, false);

        Double rate = 0;
        Int32 rows = html_tablerows.Count;
        Int32 columns = Int32.MaxValue;

        if (html_tablerows.Count > 0)
        {
            foreach (var r in html_tablerows)
            {
                // leaf cells only: cells that do not contain nested cells
                var html_cells = r.SelectNodesInDepthRange(
                    x => x.Name.Equals(RowSelectionTag, StringComparison.InvariantCultureIgnoreCase)
                         && !x.ChildNodes.Any(y => y.Name.Equals(RowSelectionTag, StringComparison.InvariantCultureIgnoreCase)),
                    RowSelectionDepthLimit, RowSelectionDepthStart, false);

                columns = Math.Min(columns, html_cells.Count);
            }

            rate = (columns == Int32.MaxValue) ? 0 : rows.GetRatio(columns);
        }

        if (!selectionByRatio.ContainsKey(rate))
        {
            selectionByRatio.Add(rate, html_tablerows);
        }

        head = head.ParentNode;

        if (rate > bestRatio)
        {
            bestRatio = rate;
            bestSelection = html_tablerows;
        }
    }

    return bestSelection;
}
/// <summary>
/// Transforms the selection result into a feature vector dictionary, where each dimension is the document's similarity to one category.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The vector similarity function.</param>
/// <returns></returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Category Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    // one aggregate weight dictionary per category
    Dictionary<String, WeightDictionary> categoryDictionarties = new Dictionary<string, WeightDictionary>();

    foreach (SpaceLabel label in context.spaceModel.labels)
    {
        Relationship<SpaceLabel, SpaceCategoryModel> categoryModel = context.spaceModel.LabelToCategoryLinks.GetAllRelationships(label).FirstOrDefault();
        var c = TermWeightModel.GetWeights(selectedTerms, categoryModel.NodeB, context.spaceModel, label);
        categoryDictionarties.Add(label.name, c);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Double total = context.Count;
    Int32 i = 0;
    Int32 p = Math.Max(1, context.Count / 20); // progress step, guarded against zero for small sets

    foreach (var entry in context.items)
    {
        i++;
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);

        FeatureVector fv = new FeatureVector(entry.AssignedID);
        fv.dimensions = new double[context.spaceModel.labels.Count];

        // each label writes to a distinct index, so the parallel loop is safe
        Parallel.ForEach(context.spaceModel.labels, (label) =>
        {
            var docToClassSimilarity = function.ComputeSimilarity(categoryDictionarties[label.name], documentWeights);
            fv.dimensions[context.spaceModel.labels.IndexOf(label)] = docToClassSimilarity;
        });

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        }

        dict.GetOrAdd(entry.DomainID).Add(fv, -1);
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation done ...");
    return dict;
}
public Double GetScore()
{
    Int32 part = 0;
    Int32 whole = validations.Count;

    foreach (var pair in validations)
    {
        switch (pair.Value.Outcome)
        {
            case ValidationOutcome.Modified:
            case ValidationOutcome.Validated:
                part++;
                break;

            case ValidationOutcome.Invalid:
            case ValidationOutcome.undefined:
                break;
        }
    }

    Double score = part.GetRatio(whole, 0, 0);
    return score;
}
public Double GetRatioForScale(Int32 scaleStep, Double floor = 0, Int32 scaleSteps = -1)
{
    if (scaleSteps == -1) scaleSteps = xKeys.Count;

    Double val = scaleStep.GetRatio(scaleSteps);
    if (floor == 0) return val;

    // raise the ratio onto a [floor, 1] band, clamping the top
    val = val / (1 + floor);
    val = val + floor;
    if (val > 1) val = 1;

    return val;
}
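// A brief illustration of the floor remapping (hypothetical values, called on the hosting
// instance; assumes GetRatio returns scaleStep / scaleSteps as a Double):
//   step 0  -> 0.0 / 1.2 + 0.2 = 0.20  (the floor)
//   step 5  -> 0.5 / 1.2 + 0.2 ~ 0.62
//   step 10 -> 1.0 / 1.2 + 0.2 ~ 1.03 -> clamped to 1.0
Double low  = GetRatioForScale(0,  0.2, 10); // 0.20
Double mid  = GetRatioForScale(5,  0.2, 10); // ~0.62
Double high = GetRatioForScale(10, 0.2, 10); // 1.0 (clamped)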
/// <summary>
/// Reduces the HTML size of each document in the set (web site).
/// </summary>
/// <param name="docSet">The document set - web site.</param>
/// <param name="settings">The settings.</param>
/// <param name="logger">The logger.</param>
/// <returns>Ratio of output size to input size, i.e. the fraction remaining after reduction</returns>
public Double ReduceDocumentSet(WebSiteDocuments docSet, HtmlDocumentReductionSettings settings, ILogBuilder logger)
{
    Int32 input = 0;
    Int32 output = 0;

    foreach (WebSiteDocument document in docSet.documents)
    {
        input += document.HTMLSource.Length;
        String newHtml = ReduceDocument(document.HTMLSource, settings, logger);
        output += newHtml.Length;
        document.HTMLSource = newHtml;
    }

    Double reduction = output.GetRatio(input);

    if (settings.logSiteLevel)
    {
        logger.AppendLine("[" + docSet.domain + "] reduced to: " + reduction.ToString("P2"));
    }

    return reduction;
}
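// Illustration of the returned ratio (hypothetical sizes): a site with two pages of
// 60,000 and 40,000 characters, reduced to 25,000 and 15,000 characters, yields
// (25000 + 15000) / (60000 + 40000) = 0.40, logged as "reduced to: 40.00%".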
/// <summary>
/// Gets the Jaccard index: the number of common n-grams divided by the number of distinct n-grams in the union of both sets.
/// </summary>
/// <param name="ngrams_A">The n-grams A.</param>
/// <param name="ngrams_B">The n-grams B.</param>
/// <returns>Jaccard index in [0, 1]</returns>
public Double GetJaccardIndex(List<T> ngrams_A, List<T> ngrams_B)
{
    List<T> allNGrams = GetJoinElements(ngrams_A, ngrams_B);
    Int32 common = CountContains(ngrams_A, ngrams_B);
    return common.GetRatio(allNGrams.Count);
}
public ITextRender Report(ITextRender output = null)
{
    if (output == null) output = new builderForMarkdown();

    var scores = items.Select(x => x.score);

    output.AppendHeading("Granularity");

    var distinct = items.GetDistinctScores();
    Int32 dC = distinct.Count();

    output.AppendPair("Distinct", dC);
    output.AppendPair("Entries", scores.Count());

    Double r = (Double)dC.GetRatio(scores.Count());
    output.AppendPair("Distinct / Entries", r);

    output.AppendHeading("Histogram");

    for (int i = 1; i < 11; i++)
    {
        Double l_min = (i - 1).GetRatio(10);
        Double l_max = i.GetRatio(10);

        // half-open bin (l_min, l_max] so that boundary scores land in exactly one bin
        var bin = scores.Where(x => (x > l_min) && (x <= l_max));
        Double per = bin.Count().GetRatio(scores.Count());
        output.AppendPair("Bin [" + i + "][" + l_max.ToString("F2") + "]", per.ToString("P2"));
    }

    output.AppendHeading("Descriptive statistics");

    DescriptiveStatistics desc = scores.GetStatistics(true);
    desc.Describe(output);

    output.AppendHeading("Document selection result");

    foreach (DocumentSelectResultEntry result in items)
    {
        output.AppendLine(result.score.ToString("F5") + "\t\t" + result.AssignedID);
    }

    output.AppendHorizontalLine();
    query.Describe(output);
    output.AppendHorizontalLine();

    return output;
}
/// <summary>
/// Gets the Jaccard index: the number of common n-grams divided by the number of distinct n-grams in the union of both sets.
/// </summary>
/// <param name="ngrams_A">The n-grams A.</param>
/// <param name="ngrams_b">The n-grams B.</param>
/// <returns>Jaccard index in [0, 1]</returns>
public static Double GetJaccardIndex(List<String> ngrams_A, List<String> ngrams_b)
{
    List<String> allNGrams = new List<string>();
    Int32 common = ngrams_A.Count(x => ngrams_b.Contains(x));

    // build the union: add A, then add only those B entries not already present
    allNGrams.AddRange(ngrams_A);
    allNGrams.AddRange(ngrams_b, true);

    return common.GetRatio(allNGrams.Count);
}
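// A quick worked example for the overload above (hypothetical inputs; assumes each list
// already holds distinct n-grams, as produced by the tokenizer upstream):
//   A = {ab, bc, cd}, B = {bc, cd, de}
//   common = |{bc, cd}| = 2, union = |{ab, bc, cd, de}| = 4
//   Jaccard = 2 / 4 = 0.5
List<String> a = new List<String> { "ab", "bc", "cd" };
List<String> b = new List<String> { "bc", "cd", "de" };
Double jaccard = GetJaccardIndex(a, b); // 0.5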
/// <summary>
/// Builds vectors from the selected features and the feature weighting model.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
{
    List<string> FV = context.SelectedFeatures.GetKeys();

    log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count() + "]");

    // preparing the model
    weightModel.PrepareTheModel(context.spaceModel, log);

    Int32 i = 0;
    Int32 s = Math.Max(1, context.spaceModel.documents.Count / 10); // progress step, guarded for small sets

    // building document VSM
    foreach (SpaceDocumentModel docModel in context.spaceModel.documents)
    {
        var wd = weightModel.GetWeights(FV, docModel, context.spaceModel);

        VectorDocument docVec = new VectorDocument(docModel.name);
        docVec.terms = wd;
        context.vectorSpace.documents.Add(docVec);

        if (i % s == 0)
        {
            Double r = i.GetRatio(context.spaceModel.documents.Count);
            log.log("[" + r.ToString("P2") + "]");
        }
        i++;
    }

    if (constructCategories)
    {
        // building category VSM
        foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
        {
            var wd = weightModel.GetWeights(FV, catModel, context.spaceModel);

            VectorLabel catVec = new VectorLabel(catModel.name);
            catVec.terms = wd;
            context.vectorSpace.labels.Add(catVec);
        }
    }
}
/// <summary>
/// Returns the rate at which the document fits the fingerprint.
/// </summary>
/// <param name="document">The document.</param>
/// <returns>Share of XPath expressions from the fingerprint that match a node in the document</returns>
public Double Evaluate(HtmlNode document)
{
    Int32 m = 0;
    foreach (String xPath in XPathList)
    {
        var node = document.SelectSingleNode(xPath);
        if (node != null)
        {
            m++;
        }
    }
    return m.GetRatio(XPathList.Count);
}
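// A minimal usage sketch (assumes the surrounding type is an XPath fingerprint over
// HtmlAgilityPack; the HTML sample, the fingerprint instance and its XPathList contents
// are hypothetical):
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml("<html><body><div id='menu'></div><div id='content'></div></body></html>");

// Suppose XPathList = { "//div[@id='menu']", "//div[@id='content']", "//div[@id='footer']" }:
// two of the three expressions match, so Evaluate returns 2/3 ~ 0.67.
Double fit = fingerprint.Evaluate(doc.DocumentNode);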
/// <summary>
/// Renders a HeatMapModel of the specified size.
/// </summary>
/// <param name="width">The width.</param>
/// <param name="height">The height.</param>
/// <returns></returns>
public HeatMapModel MakeHeatMap(Int32 width, Int32 height, Int32 xPeriod = 20, Int32 yPeriod = 20)
{
    xAxisFunction.outputRange = new imbNumberScale(numberRangePresetEnum.zeroToOne);
    yAxisFunction.outputRange = new imbNumberScale(numberRangePresetEnum.zeroToOne);

    HeatMapModel map = HeatMapModel.Create(width, height, "D3");
    map.AllocateSize(width, height);

    for (Int32 y = 0; y < height; y++)
    {
        Double yValue = yAxisFunction.GetOutput(y.GetRatio(yPeriod));

        for (Int32 x = 0; x < width; x++)
        {
            map[x, y] = xAxisFunction.GetOutput(x.GetRatio(xPeriod)) + yValue;
        }
    }

    return map;
}
/// <summary>
/// Reduces the dataset; returns the total reduction score (%).
/// </summary>
/// <param name="dataSet">The data set.</param>
/// <param name="settings">The settings.</param>
/// <param name="logger">The logger.</param>
/// <returns>Total reduction score</returns>
public double ReduceDataset(IEnumerable<WebSiteDocumentsSet> dataSet, WebSiteDataSetReductionSettings settings, ILogBuilder logger)
{
    logOutputStart = logger.Length;

    if (!dataSet.Any())
    {
        throw new ArgumentException("The specified dataset is empty!", nameof(dataSet));
    }

    List<Double> reductions = new List<double>();
    List<Double> html_reductions = new List<double>();

    Int32 total_input = 0;
    Int32 total_output = 0;

    foreach (WebSiteDocumentsSet category in dataSet)
    {
        total_input += category.CountDocumentsTotal();

        reductions.Add(ReduceDatasetCategory(category, settings, logger));
        html_reductions.Add(htmlEngine.ReduceDatasetCategory(category, settings.HtmlDocumentReduction, logger));

        total_output += category.CountDocumentsTotal();
    }

    Double average = reductions.Average();
    Double reduction = total_output.GetRatio(total_input);
    Double average_html = html_reductions.Average();

    reductionScore = (average_html * reduction);

    logger.log("Dataset document count reduced to: " + reduction.ToString("P2"));
    logger.log("Dataset document size reduced to (avg): " + average_html.ToString("P2"));
    logger.log("Total reduction score: " + reductionScore.ToString("P2"));

    return reductionScore;
}
/// <summary>
/// Evaluates the diversity of a document-selection ranking.
/// </summary>
/// <param name="ds_loaded">The loaded document-selection result.</param>
/// <param name="logger">The logger.</param>
/// <param name="filepath">The filepath.</param>
/// <param name="minDiversity">The minimum diversity.</param>
/// <returns>True if the ranking is accepted</returns>
public static Boolean EvaluateDSRanking(DocumentSelectResult ds_loaded, ILogBuilder logger, String filepath = "", Double minDiversity = 0.01)
{
    if (filepath == "") filepath = ds_loaded.name;

    var distinct = ds_loaded.items.GetDistinctScores();
    Int32 c = distinct.Count();

    if (distinct.Contains(Double.NaN))
    {
        logger.log("Ranking scores [" + filepath + "] are refused as they contain NaN entries");
        return false;
    }

    if (c < 2)
    {
        logger.log("Ranking scores [" + filepath + "] are refused as they contain [" + c + "] distinct values");
        return false;
    }

    Double rate = c.GetRatio(ds_loaded.items.Count());

    if (rate < minDiversity)
    {
        logger.log("Ranking scores [" + filepath + "] are refused for having [" + rate.ToString("F5") + "] below criterion [" + minDiversity.ToString("F2") + "]");
        return false;
    }

    logger.log("Ranking scores [" + filepath + "] accepted d=[" + rate.ToString("F5") + "] c=[" + distinct.Count + "] |e|=[" + ds_loaded.items.Count + "]");
    return true;
}
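// Illustration of the diversity criterion (hypothetical values): a ranking with 1000
// entries but only 3 distinct scores has diversity 3 / 1000 = 0.003, below the default
// minDiversity of 0.01, so it is refused; 50 distinct scores (0.05) would be accepted.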
/// <summary>
/// Gets the first significant parent, judged by node name frequency.
/// </summary>
/// <param name="node">The node.</param>
/// <param name="significanceLevel">The significance level: rate of occurrence - the lower the value, the more significant/rare the tag is.</param>
/// <returns></returns>
public HtmlNode GetFirstSignificantParentByNodeName(HtmlNode node, Double significanceLevel = 0.2)
{
    HtmlNode head = node;
    Double rate = 1;
    Int32 topFreq = NodeTagCounter.GetTopFrequency();

    while (rate > significanceLevel)
    {
        if (head.ParentNode == null) return head;

        Int32 freq = NodeTagCounter.GetFrequencyForItem(head.Name);
        if (freq == 0) return head;

        rate = freq.GetRatio(topFreq);
        head = head.ParentNode;
    }

    return head;
}
/// <summary>
/// Loads the lexic resource.
/// </summary>
/// <param name="output">The output.</param>
/// <param name="resourceFilePath">The resource file path.</param>
public void LoadLexicResource(ILogBuilder output, String resourceFilePath)
{
    List<String> lines = new List<String>();

    if (isLoaded) return;

    String pt = "";

    if (!localCache.isNullOrEmpty())
    {
        pt = localCache;
        lines.AddRange(File.ReadLines(localCache));
    }

    // fall back to the main resource file if the local cache is missing or too small
    if (lines.Count < 100)
    {
        pt = resourceFilePath;
        lines = new List<string>();
        lines.AddRange(File.ReadAllLines(resourceFilePath));
    }

    Int32 i = 0;
    Int32 iCycle = lines.Count() / 20;
    Int32 l = lines.Count();
    Int32 c = 0;
    Double p = 0;

    output.logStartPhase("Loading", "Loading the lexic resource - with mode: " + mode.ToString());
    output.log("Start of loading lexic resource [" + pt + "]");

    // parallel loop currently pinned to a single thread
    Parallel.ForEach(lines, new ParallelOptions { MaxDegreeOfParallelism = 1 }, (line) =>
    {
        string inflectForm = "";
        string lemma = "";
        string gramTag = "";

        SelectFromLine(line, out inflectForm, out lemma, out gramTag);

        lexicInflection inflect = null;

        if (!inflectForm.isNullOrEmpty())
        {
            if (!ContainsKey(inflectForm))
            {
                inflect = new lexicInflection(line);
                inflect.lemmaForm = lemma;
                inflect.name = inflectForm;
                inflect.inflectedForm = inflectForm;
                inflect.lexicalDefinitionLine = line;

                if (spellAlternator.IsInitiated)
                {
                    String altInflectedForm = spellAlternator.ConvertFromAtoB(inflectForm);
                    spellAlternatives.GetOrAdd(altInflectedForm, inflectForm);
                }

                Add(inflectForm, inflect);
            }
            else
            {
                inflect = base[inflectForm];
            }

            lexicGrammarCase gramCase = null;

            if (mode == textResourceIndexResolveMode.resolveOnLoad)
            {
                var gramTagColl = grammTagConverter.ConvertFromString(gramTag);
                gramCase = inflect.AddGrammarCase(gramTagColl);
                gramCase.lexicalDefinitionLine = gramTag;
            }
            else
            {
                gramCase = new lexicGrammarCase();
                gramCase.lexicalDefinitionLine = gramTag;
                gramCase.name = "gc" + i.ToString();
                inflect.Add(gramCase);
            }

            // construction of the lemma-centered dictionary
            lexicGraphSetWithLemma lxSet = null;

            if (!registratedLemmaIndex.ContainsKey(lemma))
            {
                lock (LemmaIndexLock)
                {
                    if (!registratedLemmaIndex.ContainsKey(lemma))
                    {
                        lxSet = new lexicGraphSetWithLemma();
                        lxSet.lemmaForm = lemma;
                        registratedLemmaIndex.TryAdd(lemma, lxSet);
                    }
                }
            }

            lxSet = registratedLemmaIndex[lemma];

            if (!lxSet.ContainsKey(inflectForm))
            {
                lock (SetLock)
                {
                    if (!lxSet.ContainsKey(inflectForm))
                    {
                        lxSet.TryAdd(inflect.name, inflect);
                    }
                }
            }

            Interlocked.Increment(ref c);
            Interlocked.Increment(ref i);

            if (c > iCycle)
            {
                lock (loadStatusLock)
                {
                    if (c > iCycle)
                    {
                        c = 0;
                        p = i.GetRatio(l);
                        output.AppendLine("Done: _" + p.ToString("P2") + "_");
                    }
                }
            }
        }
    });

    output.logEndPhase();
    output.log("End of loading process");
    isLoaded = true;
}
public List<String> DescribeSelf(List<String> output = null)
{
    if (output == null) output = new List<string>();

    output.Add("Experiment [" + experiment.name + "] done in: " + Duration.ToString("F2") + " minutes");
    output.Add(context.setup.description);

    output.Add("k-Fold cross validation k[" + experiment.validationSetup.k + "] - RND(T/E)SMP[" + experiment.validationSetup.randomize.ToString() + "] - FVE models [" + experiment.models.Count + "] - Classifiers [" + experiment.classifiers.Count + "]");

    Int32 nCats = 0;
    Int32 nCases = 0;
    Double nCasePerCat = 0;

    foreach (var c in context.classes.GetClasses())
    {
        nCats++;
        nCases += c.WebSiteSample.Count();
    }

    nCasePerCat = nCases.GetRatio(nCats);

    output.Add("Categories [" + nCats + "] with [" + nCases + "] -- cases per category [" + nCasePerCat.ToString("F2") + "]");

    var model = context.tools.model as pipelineMCRepo.model.mcRepoProcessModel;
    output.Add("Pages per web site (limit) [" + model.setup.target_languagePagePerSite + "]");

    foreach (var m in context.setup.models)
    {
        String ln = m.name.TrimToMaxLength(15);

        foreach (var fv in m.settings.featureVectors.serialization)
        {
            if (fv.isActive)
            {
                ln = ln.add("[" + fv.name.TrimToMaxLength(10, " ") + "]", " ");
            }
            else
            {
                ln = ln.add("[" + ("-".Repeat(10)) + "]", " ");
            }
        }
    }

    output.Add("----");
    output.Add("The best classifier per FVE model, by cross k-fold mean of F1 (macro-average): ");
    output.Add(String.Format("[{0,-30}] [{1,10}] [{2,10:F5}]", "Feature Vector Model", "Top class.", "Macro F1"));

    foreach (var cl in bestPerformingClassifiers)
    {
        if (cl == theBestPerformer)
        {
            output.Add(String.Format("[{0,-30}] [{1,10}] [{2,10:F5}] <-- the best ", cl.Name, cl.Classifier, cl.F1measure));
        }
        else
        {
            output.Add(String.Format("[{0,-30}] [{1,10}] [{2,10:F5}]", cl.Name, cl.Classifier, cl.F1measure));
        }
    }

    output.Add("----");
    output.Add("The best performer: ");
    output.Add("Name: " + theBestPerformer.Name);
    output.Add("Classifier: " + theBestPerformer.Classifier);
    output.Add("F1 measure: " + theBestPerformer.F1measure.ToString("F5"));

    output.Add("----");
    output.Add("The FVE with highest S1 measure: ");
    output.Add("Name: " + bestModel.modelName);
    output.Add("Range width: " + bestModel.RangeWidthAvg.ToString("F5"));
    output.Add("Range position: " + bestModel.RangePositionAvg.ToString("F5"));
    output.Add("S1 measure: " + bestModel.S1Measure.ToString("F5"));

    output.Add("----");
    output.Add("Mean classifier performances by FVE models: ");

    DocumentSetCaseCollectionReport minMean = new DocumentSetCaseCollectionReport();
    minMean.F1measure = 1;
    DocumentSetCaseCollectionReport maxMean = new DocumentSetCaseCollectionReport();
    maxMean.F1measure = 0;

    foreach (var cl in meanPerformanceForExtractors)
    {
        if (cl.F1measure <= minMean.F1measure) minMean = cl;
        if (cl.F1measure > maxMean.F1measure) maxMean = cl;
    }

    foreach (var cl in meanPerformanceForExtractors)
    {
        String lb = " --- ";
        if (cl == minMean) lb = " min ";
        if (cl == maxMean) lb = " max ";

        output.Add(String.Format("[{0,-30}] P[{1,10:F5}] R[{2,10:F5}] F1[{3,10:F5}] [{4,5}]", cl.Name, cl.Precision, cl.Recall, cl.F1measure, lb));
    }

    output.Add(" --- FVE cross-classifier means are computed as a quality indication for the FVE's configuration");
    output.Add(" --- FVE models and k-fold sample distribution MD5 hash");

    foreach (var c in valColVsModelVsSampleHash)
    {
        output.Add(c);
    }

    return output;
}
/// <summary>
/// Reduces the dataset category.
/// </summary>
/// <param name="dataSet">The data set.</param>
/// <param name="settings">The settings.</param>
/// <param name="logger">The logger.</param>
/// <returns>Ratio of remaining documents to the initial count</returns>
public double ReduceDatasetCategory(WebSiteDocumentsSet dataSet, WebSiteDataSetReductionSettings settings, ILogBuilder logger)
{
    Int32 total_input = dataSet.CountDocumentsTotal();

    List<WebSiteGraphDiagnosticMark> marks = new List<WebSiteGraphDiagnosticMark>();
    if (settings.marksToRemove != WebSiteGraphDiagnosticMark.none)
    {
        marks = settings.marksToRemove.getEnumListFromFlags<WebSiteGraphDiagnosticMark>();
    }

    List<WebSiteDocuments> toRemove = new List<WebSiteDocuments>();

    foreach (WebSiteDocuments site in dataSet)
    {
        if (settings.marksToRemove == WebSiteGraphDiagnosticMark.none) continue;

        if (site.extensions.graph == null)
        {
            if (settings.logSiteLevel)
            {
                logger.log("Site _ [" + site.domain + "] _ flagged for removal because it has no graph declared");
            }
        }
        else
        {
            foreach (WebSiteGraphDiagnosticMark mark in marks)
            {
                if (site.extensions.graph.diagnosticResults.HasFlag(mark))
                {
                    if (settings.logSiteLevel)
                    {
                        logger.log("Site _ [" + site.domain + "] _ flagged for removal because of [" + mark.ToString() + "] web graph diagnostic mark");
                    }
                    toRemove.Add(site);
                }
            }
        }
    }

    foreach (WebSiteDocuments site in toRemove)
    {
        if (dataSet.Contains(site))
        {
            dataSet.Remove(site);
        }
    }

    dataSet.RemoveEmptyDocuments(logger, settings.LimitSettings.minPageLimit, settings.LimitSettings.maxPageLimit);

    Int32 total_output = dataSet.CountDocumentsTotal();
    Double average = total_output.GetRatio(total_input);

    if (settings.logCategoryLevel)
    {
        logger.log("Document count in _ [" + dataSet.name + "] _ reduced to: " + average.ToString("P2"));
    }

    return average;
}
/// <summary>
/// Constructs the webLemmaTable.
/// </summary>
/// <param name="tableName">Name of the table.</param>
/// <param name="parser">The parser.</param>
/// <param name="counter">The counter.</param>
/// <param name="logger">The logger.</param>
/// <returns></returns>
protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
{
    if (table == null) table = new webLemmaTermTable(tableName);

    if (table.Count > 0)
    {
        logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");

        if (DoBeep == 1)
        {
            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
            Interlocked.Increment(ref DoBeep);
        }
        return table;
    }

    List<String> tfdfList = counter.GetIndexForms();

    Int32 i = 0;
    Int32 c = 0;
    Int32 li = 0;
    Int32 limit = tfdfList.Count + 100;

    if (!tableName.isNullOrEmpty()) table.name = tableName;

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();
    Int32 startIndex = tfdfList.Count;
    Int32 cycleLength = startIndex / 5;

    while (tfdfList.Any())
    {
        String term = tfdfList.FirstOrDefault();
        Int32 d = tfdfList.Count;

        if (term != null)
        {
            lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);
            d = d - tfdfList.Count;

            if (d == 0)
            {
                table.unresolved.Add(term);
                tfdfList.Remove(term);
                d = 1;
            }
            else
            {
                Boolean ok = true;

                if (settings.allowedLemmaTypes.Any())
                {
                    var tps = inflectSet.GetTagsFromGramTags<pos_type>(pos_type.none);

                    if (settings.strictPosTypePolicy)
                    {
                        if (!tps.ContainsAny(settings.allowedLemmaTypes))
                        {
                            ok = false;
                        }
                        else if (tps.Contains(pos_type.V))
                        {
                            ok = false;
                        }
                    }
                    else
                    {
                        if (!tps.ContainsAny(settings.allowedLemmaTypes))
                        {
                            ok = false;
                        }
                    }
                }

                if (ok)
                {
                    List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
                    List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = inflectSet.lemmaForm;
                    lemma.name = inflectSet.lemmaForm;

                    Double termFrequency = 0;

                    foreach (lexicInflection inflect in inflectSet.Values)
                    {
                        TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);

                        if (cn != null)
                        {
                            lemma.AFreqPoints += cn.items.Count;

                            foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                            {
                                imbMCDocument document = cntPair.mcElement.GetParentOfType<imbMCDocument>();
                                documents.AddUnique(document);

                                imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
                                if (docSet != null)
                                {
                                    documentSet.AddUnique(docSet);
                                }
                                else
                                {
                                    logger.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
                                }

                                // weight each occurrence by its container type
                                if (cntPair.flagBag.Contains(cnt_containerType.link))
                                {
                                    termFrequency += settings.anchorTextFactor;
                                }
                                else if (cntPair.flagBag.Contains(cnt_containerType.title))
                                {
                                    termFrequency += settings.titleTextFactor;
                                }
                                else
                                {
                                    termFrequency += settings.contentTextFactor;
                                }

                                cntPair.AddGraph(inflect);
                            }

                            lemma.otherForms.AddUnique(cn.indexForm);
                        }
                        else
                        {
                            lemma.otherForms.AddUnique(inflect.inflectedForm);
                        }
                    }

                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.documentFrequency = documents.Count;
                    lemma.termFrequency = termFrequency;
                    lemmas.Add(lemma);
                }
            }
        }

        li++;
        i = i + d;
        c = c + d;
        d = startIndex - tfdfList.Count;

        if (c > cycleLength)
        {
            c = 0;
            logger.AppendLine();
            logger.log("TF-IDF processed: _" + d.GetRatio(startIndex).ToString("P2") + "_");
            logger.AppendLine();
        }

        if (li > limit)
        {
            logger.log("Limit broken at processing WEB Lemma Frequency table at [" + li.ToString() + "]");
            break;
        }
    }

    if (settings.doComputeTFIDF)
    {
        recompute(table, logger, forSingleWebSite, lemmas);
    }
    else
    {
        foreach (var le in lemmas)
        {
            table.Add(le);
        }
    }

    return table;
}
public static DataRow BuildRow(this DocumentSetCaseCollection host, DocumentSetCase setCase, DataTable output, Boolean isTrainingCollection = false, Boolean doFVAnalysis = true)
{
    var setClass = host.setClass;
    var validationCase = host.validationCase;

    DataRow dr = output.NewRow();
    dr["name"] = host.validationCase.name + "_" + setCase.subject.name;

    if (output.Columns.Contains("Origin"))
    {
        dr["Origin"] = host.setClass.name;
    }

    dr["Case"] = setCase.subject.name;

    if (!isTrainingCollection)
    {
        Int32 cor = 0;

        foreach (var cl in validationCase.context.setup.classifiers)
        {
            String cName = "";
            Int32 t = 0;

            if (setCase.data[cl].selected != null)
            {
                cName = setCase.data[cl].selected.name;
                t = (setCase.data[cl].selected.classID == host.rightClassID) ? 1 : 0;
            }
            else
            {
                cName = "- not set -";
            }

            dr["ClassResultName" + cl.name] = cName;
            cor += t;
            dr["EvalTrue" + cl.name] = t;
        }

        // share of classifiers that got this case right
        dr["Correct"] = cor.GetRatio(validationCase.context.setup.classifiers.Count);
    }

    foreach (var cl in setCase.data.setClassCollection.GetClasses())
    {
        foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
        {
            if (fv.isActive)
            {
                dr[fv.name + "_" + cl.treeLetterAcronim] = setCase.data.featureVectors[cl.classID][fv];
            }
        }
    }

    if (doFVAnalysis)
    {
        Dictionary<String, rangeFinderWithData> rangers = new Dictionary<string, rangeFinderWithData>();

        foreach (var cl in setCase.data.setClassCollection.GetClasses())
        {
            foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
            {
                if (fv.isActive)
                {
                    if (!rangers.ContainsKey(fv.name))
                    {
                        rangers.Add(fv.name, new rangeFinderWithData(fv.name));
                    }
                    rangers[fv.name].Learn(setCase.data.featureVectors[cl.classID][fv]);
                }
            }
        }

        foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
        {
            if (fv.isActive)
            {
                dr["FVRange" + fv.name] = rangers[fv.name].doubleEntries.GetStdDeviation(false);
                // position of the correct category's FV value within the observed range
                dr["CFV_Ratio" + fv.name] = rangers[fv.name].GetPositionInRange(setCase.data.featureVectors[setClass.classID][fv]);
            }
        }
    }

    output.Rows.Add(dr);
    return dr;
}
/// <summary>
/// Transforms the selection result into a feature vector dictionary, where each dimension is the document's similarity to another page in the same group.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The vector similarity function.</param>
/// <returns></returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
{
    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();

    foreach (var entry in context.items)
    {
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        documentDictionarties.Add(entry.AssignedID, documentWeights);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Double total = context.Count;
    Int32 i = 0;
    Int32 p = Math.Max(1, context.Count / 10); // progress step, guarded for small sets

    Dictionary<string, List<DocumentSelectResultEntry>> relative_groups = null;

    if (groupmode == ScoreComputationModeEnum.category)
    {
        Dictionary<string, List<string>> assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);
        relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);
        if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
        {
            assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
        }
        log.log("... Page Similarity ... Groups by category");
    }
    else if (groupmode == ScoreComputationModeEnum.site)
    {
        relative_groups = context.GetByDomain(log);
        log.log("... Page Similarity ... Groups by site");
    }
    else if (groupmode == ScoreComputationModeEnum.dataset)
    {
        relative_groups = new Dictionary<string, List<DocumentSelectResultEntry>>();
        relative_groups.Add("dataset", context.items);
        log.log("... Page Similarity ... dataset");
    }

    // cache of pairwise similarities, keyed by the concatenated pair IDs
    ConcurrentDictionary<String, Double> computedPairs = new ConcurrentDictionary<string, double>();

    foreach (var domainPair in relative_groups)
    {
        List<DocumentSelectResultEntry> relatives = domainPair.Value;

        foreach (var entry in relatives)
        {
            i++;

            FeatureVector fv = new FeatureVector(entry.AssignedID);
            fv.dimensions = new double[relatives.Count - 1];

            Int32 hostInd = relatives.IndexOf(entry);

            Parallel.ForEach(relatives, (pair) =>
            {
                Int32 ind = relatives.IndexOf(pair);
                if (ind >= hostInd) ind = ind - 1;

                if (pair.AssignedID != entry.AssignedID)
                {
                    Double docToClassSimilarity = 0;

                    if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                    {
                        docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
                    }
                    else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                    {
                        docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
                    }
                    else
                    {
                        var vecA = documentDictionarties[pair.AssignedID];
                        var vecB = documentDictionarties[entry.AssignedID];
                        docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);

                        if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                        {
                            computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
                        }
                        else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                        {
                            computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity);
                        }
                    }

                    fv.dimensions[ind] = docToClassSimilarity;
                }
            });

            if (i % p == 0)
            {
                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
            }

            dict.GetOrAdd(domainPair.Key).Add(fv, -1);
        }
    }

    log.log("... Preparation finished ...");
    return dict;
}
public void Compute()
{
    sitesPerClass = sites.GetRatio(classes);
    pagesPerSite = pages.GetRatio(sites);
}
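// All of the snippets in this section lean on the GetRatio extension. A minimal sketch of
// what such an extension presumably does (an assumption, not the library's actual source;
// the real version also has an overload taking fallback arguments, as seen in
// part.GetRatio(whole, 0, 0) above):
public static class RatioExtensions
{
    // Safe division: returns numerator / denominator as Double, or 0 when the
    // denominator is zero, avoiding DivideByZeroException and NaN results.
    public static Double GetRatio(this Int32 numerator, Int32 denominator)
    {
        if (denominator == 0) return 0;
        return (Double)numerator / (Double)denominator;
    }
}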
/// <summary>
/// Constructs the webLemmaTable.
/// </summary>
/// <param name="counter">The counter.</param>
/// <param name="logger">The logger.</param>
/// <param name="table">The table.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <returns></returns>
public webLemmaTermTable process(TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table, Boolean forSingleWebSite = false)
{
    List<String> tfdfList = counter.GetIndexForms();
    tfdfList.Sort(String.CompareOrdinal);

    Int32 i = 0;
    Int32 c = 0;
    Int32 li = 0;
    Int32 limit = tfdfList.Count() + 500;

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();
    Int32 startIndex = tfdfList.Count();
    Int32 cycleLength = startIndex / 5;

    foreach (String term in tfdfList)
    {
        if (term != null)
        {
            List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
            List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();

            Double termFrequency = 0;

            TFDFContainer cn = counter.GetContainer(term);
            webLemmaTerm lemma = new webLemmaTerm();

            if (cn != null)
            {
                lemma.nominalForm = cn.indexForm;
                lemma.name = cn.indexForm;

                foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                {
                    imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();
                    if (document != null)
                    {
                        documents.AddUnique(document);
                        imbMCDocumentSet docSet = document?.parent as imbMCDocumentSet;
                        if (docSet != null)
                        {
                            documentSet.AddUnique(docSet);
                        }
                    }

                    // flat counting; container-type weighting (anchor/title/content factors) is not applied in this variant
                    termFrequency += 1;
                }

                lemma.documentSetFrequency = documentSet.Count;
                lemma.AFreqPoints = cn.items.Count();
                lemma.documentFrequency = documents.Count;
                lemma.termFrequency = termFrequency;
                lemmas.Add(lemma);
            }

            li++;
            i = i + 1;
            c = c + 1;

            if (c > cycleLength)
            {
                c = 0;
                logger.AppendLine();
                logger.log("Chunk TF processing: _" + i.GetRatio(startIndex).ToString("P2") + "_ ");
                logger.AppendLine();
            }

            if (li > limit)
            {
                logger.log("Limit broken at processing Chunk Lemma Frequency table at [" + li.ToString() + "]");
                break;
            }
        }
    }

    recompute(table, logger, forSingleWebSite, lemmas);

    return table;
}
/// <summary>
/// Builds vectors from the selected features and the feature weighting model.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
{
    List<string> FV = context.SelectedFeatures.GetKeys();

    log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count() + "] ");

    // preparing the model
    weightModel.PrepareTheModel(context.spaceModel, log);

    // blanking anything existing in the vector space
    context.vectorSpace = new VectorSpace();

    List<SpaceDocumentModel> toBlendIntoVectors = DocumentBlenderFunctionExtension.GetDocumentToBlend(blender.options, context.spaceModel.documents, log);

    Int32 i = 0;
    Int32 s = Math.Max(1, toBlendIntoVectors.Count() / 5); // progress step, guarded for small sets

    Dictionary<String, List<VectorDocument>> labelToDocumentSets = new Dictionary<String, List<VectorDocument>>();

    foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
    {
        labelToDocumentSets.Add(catModel.name, new List<VectorDocument>());
    }

    Int32 unlabeled = 0;

    foreach (SpaceDocumentModel model in toBlendIntoVectors)
    {
        VectorDocument docVec = model.BlendToVector<VectorDocument>(weightModel, context.spaceModel, FV);
        context.vectorSpace.documents.Add(docVec);

        if (constructCategories)
        {
            String l = model.labels.FirstOrDefault();
            if (!l.isNullOrEmpty())
            {
                if (labelToDocumentSets.ContainsKey(l))
                {
                    labelToDocumentSets[l].Add(docVec);
                }
                else
                {
                    unlabeled++;
                }
            }
        }

        if (i % s == 0)
        {
            Double r = i.GetRatio(context.spaceModel.documents.Count);
            log.log("Blending primary vectors [" + r.ToString("P2") + "] : [" + i + "/" + toBlendIntoVectors.Count + "]");
        }
        i++;
    }

    if (constructCategories && (unlabeled > 0))
    {
        log.log("Vectors [" + unlabeled + "] are unlabeled");
    }

    if (constructCategories)
    {
        log.log(":: Creating VectorSpace instances for categories");

        // building category VSM by merging the term weights of member documents
        foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
        {
            VectorLabel catVec = new VectorLabel(catModel.name);

            foreach (var docVec in labelToDocumentSets[catModel.name])
            {
                catVec.terms.Merge(docVec.terms);
            }

            context.vectorSpace.labels.Add(catVec);
        }
    }

    if (weightModel != null)
    {
        weightModel.Dispose();
    }
}
/// <summary>
/// Transforms the selection result into a feature vector dictionary, where each document's single dimension is its similarity to the aggregate model of its own site.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The vector similarity function.</param>
/// <returns></returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Site Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    Dictionary<String, WeightDictionary> categoryDictionarties = new Dictionary<string, WeightDictionary>();
    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();

    var byDomain = context.GetByDomain(log);

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Double total = context.Count;
    Int32 i = 0;
    Int32 p = Math.Max(1, context.Count / 10); // progress step, guarded for small sets

    foreach (var pair in byDomain)
    {
        i++;

        // build an aggregate document model for the whole site
        SpaceDocumentModel siteModel = new SpaceDocumentModel();

        foreach (var ent in pair.Value)
        {
            WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
            documentDictionarties.Add(ent.AssignedID, documentWeights);
            siteModel.Children.Add(ent.spaceDocument);
        }

        siteModel.Flatten(false);

        categoryDictionarties.Add(pair.Key, TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel));

        foreach (var ent in pair.Value)
        {
            FeatureVector fv = new FeatureVector(ent.AssignedID);
            fv.dimensions = new double[context.spaceModel.labels.Count];

            var docToClassSimilarity = function.ComputeSimilarity(categoryDictionarties[pair.Key], documentDictionarties[ent.AssignedID]);
            fv.dimensions[0] = docToClassSimilarity;

            dict.GetOrAdd(pair.Key).Add(fv, -1);
        }

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        }
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation finished ...");
    return dict;
}
/// <summary>
/// Processes the specified source.
/// </summary>
/// <param name="source">The source.</param>
/// <param name="document_level">The document level.</param>
/// <param name="table">The table.</param>
/// <param name="parser">The parser.</param>
/// <param name="logger">The logger.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <param name="counter">The counter.</param>
/// <returns></returns>
public webLemmaTermTable process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
{
    if (counter == null) counter = prepareCounter(source);

    List<String> tfdfList = counter.GetIndexForms();
    tfdfList.Sort(String.CompareOrdinal);

    Int32 i = 0;
    Int32 c = 0;
    Int32 li = 0;
    Int32 limit = tfdfList.Count() + 500;

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();
    Int32 startIndex = tfdfList.Count();
    Int32 cycleLength = startIndex / 5;

    foreach (String term in tfdfList)
    {
        if (term != null)
        {
            List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
            List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();

            Double termFrequency = 0;

            TFDFContainer cn = counter.GetContainer(term);
            webLemmaTerm lemma = new webLemmaTerm();

            if (cn != null)
            {
                lemma.nominalForm = cn.indexForm;
                lemma.name = cn.indexForm;

                foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                {
                    imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();
                    if (document != null)
                    {
                        documents.AddUnique(document);
                        imbMCDocumentSet docSet = document?.parent as imbMCDocumentSet;
                        if (docSet != null)
                        {
                            documentSet.AddUnique(docSet);
                        }
                    }
                    termFrequency += 1;
                }

                lemma.documentSetFrequency = documentSet.Count;
                lemma.AFreqPoints = cn.items.Count();
                lemma.documentFrequency = documents.Count;
                lemma.termFrequency = termFrequency;
                lemmas.Add(lemma);
            }

            li++;
            i = i + 1;
            c = c + 1;

            if (c > cycleLength)
            {
                c = 0;
                logger.AppendLine();
                logger.log("Token Streams TF processing: _" + i.GetRatio(startIndex).ToString("P2") + "_ ");
                logger.AppendLine();
            }

            if (li > limit)
            {
                logger.log("Limit broken at Token Streams TF processing at [" + li.ToString() + "]");
                break;
            }
        }
    }

    recompute(table, logger, forSingleWebSite, lemmas);
    return table;
}
/// <summary>
/// Gets the Dice coefficient: twice the number of common n-grams divided by the total number of n-grams in both sets.
/// </summary>
/// <param name="ngrams_A">The n-grams A.</param>
/// <param name="ngrams_b">The n-grams B.</param>
/// <returns>Dice coefficient in [0, 1]</returns>
public static Double GetDiceCoefficient(List<String> ngrams_A, List<String> ngrams_b)
{
    Int32 common = ngrams_A.Count(x => ngrams_b.Contains(x)) * 2;
    return common.GetRatio(ngrams_A.Count + ngrams_b.Count);
}
/// <summary>
/// Gets the Dice coefficient: twice the number of common n-grams divided by the total number of n-grams in both sets.
/// </summary>
/// <param name="ngrams_A">The n-grams A.</param>
/// <param name="ngrams_B">The n-grams B.</param>
/// <returns>Dice coefficient in [0, 1]</returns>
public Double GetDiceCoefficient(List<T> ngrams_A, List<T> ngrams_B)
{
    Int32 common = CountContains(ngrams_A, ngrams_B) * 2;
    return common.GetRatio(ngrams_A.Count + ngrams_B.Count);
}
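// A worked example using the string-based overload above (hypothetical inputs):
//   A = {ab, bc, cd}, B = {bc, cd, de}
//   common = 2 * |{bc, cd}| = 4, total = 3 + 3 = 6
//   Dice = 4 / 6 ~ 0.667 (always >= Jaccard for the same pair, here 0.5)
List<String> a = new List<String> { "ab", "bc", "cd" };
List<String> b = new List<String> { "bc", "cd", "de" };
Double dice = GetDiceCoefficient(a, b); // ~0.667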