/// <summary>
/// Validates the extraction task using the meta tables produced by its validation-mode runs.
/// </summary>
/// <remarks>
/// Steps performed:
/// <list type="number">
/// <item>Counts every run by execution mode and gathers the valid meta tables of validation runs.</item>
/// <item>Feeds property/entry counts into <c>PropertyCount</c> and <c>EntryCount</c> and checks every
/// unresolved property (name must not start with a number; a linked source cell must not contain
/// more than one non-"#" descendant node).</item>
/// <item>Builds the <c>OutputType</c> flags from the ranges of the entry and property counts.</item>
/// <item>Fills in statistics for the properties that are still unresolved and marks them validated.</item>
/// <item>Hands the validated properties to <c>task.PropertyDictionary</c> (done even when the overall
/// outcome is invalid, as before).</item>
/// </list>
/// Previously the method ended by unconditionally assigning <c>ValidationOutcome.Validated</c>, which
/// discarded the invalid outcomes set for unstable output and for invalid properties. The overall
/// outcome is now only set to validated when none of those checks failed.
/// </remarks>
/// <returns>
/// The value returned by <c>SetOutcome</c> for <c>ValidationOutcome.Invalid</c> when no valid meta table
/// exists, the output is unstable, or invalid properties were detected; otherwise
/// <c>ValidationOutcome.Validated</c>.
/// </returns>
public ValidationOutcome Compute()
{
    List<MetaTable> metaTables = new List<MetaTable>();

    foreach (TableExtractionTaskScoreEntry run in task.score.TaskRuns)
    {
        // Every run is tallied by mode; only validation runs contribute meta tables.
        executionCalls.Count(run.executionMode);

        if (run.executionMode != ExtractionTaskEngineMode.Validation)
        {
            continue;
        }

        ValidationRunCount++;

        if (run.MetaTableCreated)
        {
            metaTables.AddRange(run.metaTable.Where(x => x.IsValid));
        }
    }

    if (metaTables.Count == 0)
    {
        return SetOutcome(ValidationOutcome.Invalid, "Failed to produce any meta table");
    }

    ValidMetaTables = metaTables.Count;

    foreach (MetaTable table in metaTables)
    {
        PropertyCount.Learn(table.properties.Count);
        EntryCount.Learn(table.entries.Count);
        PropertyValidation.CollectProperties(table);

        // Fetched once per table, after the table's properties were collected.
        var unresolved = PropertyValidation.GetUnresolved();

        foreach (MetaTableEntry entry in table.entries.items)
        {
            foreach (MetaTableProperty property in table.properties.items)
            {
                // Counted once per (entry, property) pair, regardless of validation state.
                metaPropertyNameCounter.Count(property.PropertyName);

                if (!unresolved.ContainsKey(property.PropertyName))
                {
                    continue;
                }

                var validation = unresolved[property.PropertyName];

                if (property.PropertyName.isStartWithNumber())
                {
                    validation.SetOutcome(ValidationOutcome.Invalid, "Property name [" + property.PropertyName + "] starts with number");
                    continue;
                }

                validation.ValueCounter.Count(entry.GetStoredValue(property));

                if (!entry.HasLinkedCell(property))
                {
                    continue;
                }

                var sourceCell = entry.GetLinkedCell(property);

                if (sourceCell.SourceNode == null)
                {
                    // NOTE(review): the counter is incremented when the linked cell has NO source node;
                    // confirm this is the intended meaning of LinkedNodes.
                    validation.LinkedNodes++;
                    continue;
                }

                // Only the first XPath seen for the property is kept.
                if (validation.XPath.isNullOrEmpty())
                {
                    validation.XPath = sourceCell.SourceNode.XPath;
                }

                if (!sourceCell.SourceNode.HasChildNodes)
                {
                    continue;
                }

                List<HtmlNode> descendants = sourceCell.SourceNode.DescendantNodes().ToList();

                // Nodes whose name starts with "#" are not counted towards the limit.
                int countedNodes = descendants.Count(x => !x.Name.StartsWith("#", System.StringComparison.Ordinal));

                if (countedNodes > 1)
                {
                    String signature = descendants.Select(x => x.Name).toCsvInLine();

                    validation.reporter.AppendLine(sourceCell.SourceCellXPath);
                    validation.reporter.AppendLine(sourceCell.SourceNode.OuterHtml);

                    // The reported number is the total descendant count, "#" nodes included.
                    validation.SetOutcome(ValidationOutcome.Invalid, "Source cell contains [" + descendants.Count + "] descendant nodes: " + signature);
                }
            }
        }
    }

    // Classify the output by whether entry and property counts varied between meta tables.
    OutputType = TaskOutputType.data;

    if (EntryCount.Range == 0)
    {
        OutputType |= TaskOutputType.fixedEntityCount;

        if (EntryCount.Maximum == 1)
        {
            OutputType |= TaskOutputType.singleEntity;
        }
    }
    else
    {
        OutputType |= TaskOutputType.variableEntityCount;
    }

    if (PropertyCount.Range == 0)
    {
        OutputType |= TaskOutputType.fixedPropertyCount;
    }
    else
    {
        OutputType |= TaskOutputType.variablePropertyCount;
    }

    // Stays Validated unless one of the task-level checks below fails.
    ValidationOutcome result = ValidationOutcome.Validated;

    if (OutputType.HasFlag(TaskOutputType.unstableEntityAndPropertyCounts))
    {
        result = SetOutcome(ValidationOutcome.Invalid, "Extraction result is unstable");
    }

    // Copied into a list before iterating - presumably SetOutcome changes the unresolved set; verify.
    var stillUnresolved = PropertyValidation.GetUnresolved().Values.ToList();

    foreach (var pending in stillUnresolved)
    {
        pending.Frequency = metaPropertyNameCounter.GetFrequencyForItem(pending.item.PropertyName);
        pending.DistinctValues = pending.ValueCounter.DistinctCount();
        pending.SpamPropertyMeasure = 1 - pending.Frequency.GetRatio(metaPropertyNameCounter.GetTopFrequency());

        // A rule that invalidated properties having only one distinct value used to sit here;
        // it was commented out and is not applied.
        pending.SetOutcome(ValidationOutcome.Validated, "");
    }

    var results = PropertyValidation.GetResults();
    var invalidProperties = results[ValidationOutcome.Invalid];

    if (invalidProperties.Any())
    {
        result = SetOutcome(ValidationOutcome.Invalid, "[" + invalidProperties.Count + "] invalid properties detected");
    }

    task.PropertyDictionary.CollectProperties(results[ValidationOutcome.Validated]);

    // Do not overwrite an invalid outcome recorded above.
    if (result == ValidationOutcome.Validated)
    {
        Outcome = ValidationOutcome.Validated;
    }

    return result;
}