Esempio n. 1
0
        /// <summary>
        /// Evaluates the meta tables produced by the validation runs of <c>task</c> and derives
        /// the overall validation outcome together with per-property validation data.
        /// </summary>
        /// <remarks>
        /// Steps performed:
        /// 1. Every run is counted in <c>executionCalls</c>; valid meta tables are gathered from
        ///    runs executed in <c>ExtractionTaskEngineMode.Validation</c> mode.
        /// 2. If no valid meta table was gathered, the method reports <c>Invalid</c> and returns.
        /// 3. Each still-unresolved property is inspected for every entry of every meta table.
        /// 4. <c>OutputType</c> flags are derived from the ranges of the entry and property counts.
        /// 5. Remaining unresolved properties receive their statistics and are marked validated.
        /// 6. Validated properties are passed to <c>task.PropertyDictionary</c>.
        /// An unstable output type or the presence of invalid properties makes the final outcome
        /// <c>Invalid</c>; previously the final assignment always overwrote it with <c>Validated</c>.
        /// </remarks>
        /// <returns>
        /// <c>ValidationOutcome.Invalid</c> when an invalidating condition was reported; the value
        /// returned by <c>SetOutcome</c> when no meta table exists; otherwise
        /// <c>ValidationOutcome.Validated</c>.
        /// </returns>
        public ValidationOutcome Compute()
        {
            List<MetaTable> metaTables = new List<MetaTable>();

            foreach (TableExtractionTaskScoreEntry run in task.score.TaskRuns)
            {
                executionCalls.Count(run.executionMode);

                // Only validation runs contribute meta tables.
                if (run.executionMode != ExtractionTaskEngineMode.Validation)
                {
                    continue;
                }

                ValidationRunCount++;

                if (run.MetaTableCreated)
                {
                    metaTables.AddRange(run.metaTable.Where(x => x.IsValid));
                }
            }

            if (metaTables.Count == 0)
            {
                return SetOutcome(ValidationOutcome.Invalid, "Failed to produce any meta table");
            }

            ValidMetaTables = metaTables.Count;

            foreach (MetaTable table in metaTables)
            {
                PropertyCount.Learn(table.properties.Count);
                EntryCount.Learn(table.entries.Count);
                PropertyValidation.CollectProperties(table);

                // Snapshot taken once per table, after the table's properties were collected.
                var unresolved = PropertyValidation.GetUnresolved();

                foreach (MetaTableEntry entry in table.entries.items)
                {
                    foreach (MetaTableProperty property in table.properties.items)
                    {
                        metaPropertyNameCounter.Count(property.PropertyName);

                        if (!unresolved.ContainsKey(property.PropertyName))
                        {
                            continue;
                        }

                        var prop_validation = unresolved[property.PropertyName];

                        if (property.PropertyName.isStartWithNumber())
                        {
                            prop_validation.SetOutcome(ValidationOutcome.Invalid, "Property name [" + property.PropertyName + "] starts with number");
                            continue;
                        }

                        prop_validation.ValueCounter.Count(entry.GetStoredValue(property));

                        if (!entry.HasLinkedCell(property))
                        {
                            continue;
                        }

                        var sourceCell = entry.GetLinkedCell(property);

                        if (sourceCell.SourceNode == null)
                        {
                            // NOTE(review): the counter is incremented only when the linked cell has
                            // no source node - confirm this is the intended meaning of LinkedNodes.
                            prop_validation.LinkedNodes++;
                            continue;
                        }

                        // The first source node seen for the property supplies its XPath.
                        if (prop_validation.XPath.isNullOrEmpty())
                        {
                            prop_validation.XPath = sourceCell.SourceNode.XPath;
                        }

                        if (!sourceCell.SourceNode.HasChildNodes)
                        {
                            continue;
                        }

                        List<HtmlNode> descendants = sourceCell.SourceNode.DescendantNodes().ToList();

                        // Nodes whose name starts with '#' are excluded from the threshold check;
                        // the reported count below still includes every descendant.
                        if (descendants.Count(x => !x.Name.StartsWith("#")) > 1)
                        {
                            String signature = descendants.Select(x => x.Name).toCsvInLine();

                            prop_validation.reporter.AppendLine(sourceCell.SourceCellXPath);
                            prop_validation.reporter.AppendLine(sourceCell.SourceNode.OuterHtml);

                            prop_validation.SetOutcome(ValidationOutcome.Invalid, "Source cell contains [" + descendants.Count + "] descendant nodes: " + signature);
                        }
                    }
                }
            }

            // Derive the output type flags from how much the counts varied across meta tables.
            OutputType = TaskOutputType.data;

            if (EntryCount.Range == 0)
            {
                OutputType |= TaskOutputType.fixedEntityCount;

                if (EntryCount.Maximum == 1)
                {
                    OutputType |= TaskOutputType.singleEntity;
                }
            }
            else
            {
                OutputType |= TaskOutputType.variableEntityCount;
            }

            if (PropertyCount.Range == 0)
            {
                OutputType |= TaskOutputType.fixedPropertyCount;
            }
            else
            {
                OutputType |= TaskOutputType.variablePropertyCount;
            }

            // Remembers whether any invalidating condition was reported, so the final outcome
            // is not silently reset to Validated.
            bool invalidated = false;

            if (OutputType.HasFlag(TaskOutputType.unstableEntityAndPropertyCounts))
            {
                SetOutcome(ValidationOutcome.Invalid, "Extraction result is unstable");
                invalidated = true;
            }

            var stillUnresolved = PropertyValidation.GetUnresolved().Values.ToList();

            foreach (var prop_validation in stillUnresolved)
            {
                prop_validation.Frequency           = metaPropertyNameCounter.GetFrequencyForItem(prop_validation.item.PropertyName);
                prop_validation.DistinctValues      = prop_validation.ValueCounter.DistinctCount();
                prop_validation.SpamPropertyMeasure = 1 - prop_validation.Frequency.GetRatio(metaPropertyNameCounter.GetTopFrequency());

                // A rule rejecting properties with a single distinct value used to sit here
                // as commented-out code; it is not applied.
                prop_validation.SetOutcome(ValidationOutcome.Validated, "");
            }

            var results = PropertyValidation.GetResults();
            var invalidProperties = results[ValidationOutcome.Invalid];

            if (invalidProperties.Any())
            {
                SetOutcome(ValidationOutcome.Invalid, "[" + invalidProperties.Count + "] invalid properties detected");
                invalidated = true;
            }

            // Validated properties are collected even when the task as a whole is invalid,
            // matching the original order of side effects.
            task.PropertyDictionary.CollectProperties(results[ValidationOutcome.Validated]);

            Outcome = invalidated ? ValidationOutcome.Invalid : ValidationOutcome.Validated;

            return Outcome;
        }