/// <summary>
        /// Creates one chunk candidate per junction block found in the current graph.
        /// </summary>
        /// <param name="results">Detection result; its <c>CurrentGraph</c> is analysed and its <c>Candidates</c> receive the new entries.</param>
        /// <returns>The candidates created by this call (the same items are appended to <c>results.Candidates</c>).</returns>
        public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
        {
            JunctionGraphMetrics <NodeGraph> junctionGraphMetrics = new JunctionGraphMetrics <NodeGraph>();
            junctionGraphMetrics.Process(results.CurrentGraph);

            // Primary key: number of path parts of the XPath root (descending).
            // Secondary key: junction size (descending).
            // This yields the same order as sorting by size and then stable-sorting by depth,
            // without materialising an intermediate list.
            List <JunctionPoint <NodeGraph> > sorted = junctionGraphMetrics.GetJunctionBlocks(1, 4, true, true)
                                                       .OrderByDescending(x => x.XPathRoot.getPathParts().Count)
                                                       .ThenByDescending(x => x.JunctionSize)
                                                       .ToList();

            ChunkContentCandidateCollection output = new ChunkContentCandidateCollection();

            foreach (var junctionPoint in sorted)
            {
                NodeGraph childNode = results.CurrentGraph.GetChildAtPath(junctionPoint.XPathRoot, "/", false);

                if (childNode == null)
                {
                    // The junction root could not be resolved in the graph; a candidate
                    // without a node has nothing to extract from, so it is not emitted.
                    continue;
                }

                ChunkContentCandidate newCandidate = new ChunkContentCandidate(this, childNode)
                {
                    MetaData = junctionPoint
                };

                output.Add(newCandidate);
            }

            results.Candidates.AddRange(output);
            return output;
        }
示例#2
0
        /// <summary>
        /// Asks every detector for candidates, scores and sorts them, and then tries to
        /// turn each candidate into a <see cref="ContentChunk"/>.
        /// </summary>
        /// <param name="InitialGraph">Graph passed to the <see cref="ChunkDetectionResult"/> constructor.</param>
        /// <returns>The detection result holding detected chunks, accepted and declined candidates, and graph snapshots.</returns>
        public ChunkDetectionResult Run(NodeGraph InitialGraph)
        {
            Check();

            ChunkDetectionResult detection = new ChunkDetectionResult(InitialGraph, this);

            // Phase 1: each detector contributes its candidates to the shared result.
            foreach (IContentChunkDetector chunkDetector in Detectors)
            {
                chunkDetector.GetCandidates(detection);
            }

            detection.Candidates.ScoreAndSort();

            NodeDictionaryGraphStyleSettings graphStyle = new NodeDictionaryGraphStyleSettings();

            // Snapshot of the graph before any candidate is processed.
            detection.CurrentGraphStates.Add(detection.CurrentGraph.BuildDirectedGraph(graphStyle));

            // Phase 2: resolve each candidate in the current graph and let its detector decide.
            // Candidates are not filtered by score here.
            foreach (ChunkContentCandidate candidate in detection.Candidates)
            {
                var rootNode = detection.CurrentGraph.GetChildAtPath <NodeGraph>(candidate.Node.path, "/", false);

                if (rootNode == null)
                {
                    // Path no longer resolves in the current graph.
                    detection.DeclinedCandidates.Add(candidate);
                    continue;
                }

                ContentChunk chunk = new ContentChunk
                {
                    ExtractorName = candidate.Detector.GetExtractorName()
                };
                chunk.DeployRootNode(rootNode);
                chunk.SubGraph = rootNode.GetSubgraph(true);

                bool accepted = candidate.Detector.SetContentChunk(chunk, candidate, detection);

                if (accepted)
                {
                    detection.DetectedChunks.Add(chunk);
                    detection.AcceptedCandidates.Add(candidate);
                }
                else
                {
                    detection.DeclinedByDetectorCandidates.Add(candidate);
                }

                // Snapshot after every candidate that was resolved, accepted or not.
                detection.CurrentGraphStates.Add(detection.CurrentGraph.BuildDirectedGraph(graphStyle));
            }

            return detection;
        }
        /// <summary>
        /// Creates one candidate for every node of the current graph selected by
        /// <c>HtmlPathSelectExpression</c>.
        /// </summary>
        /// <param name="results">Detection result; its <c>CurrentGraph</c> is queried and its <c>Candidates</c> receive the new entries.</param>
        /// <returns>The candidates created by this call (the same items are appended to <c>results.Candidates</c>).</returns>
        public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
        {
            var matchedNodes = results.CurrentGraph.getAllChildren(HtmlPathSelectExpression);
            ChunkContentCandidateCollection found = new ChunkContentCandidateCollection();

            foreach (NodeGraph matchedNode in matchedNodes)
            {
                found.Add(new ChunkContentCandidate(this, matchedNode));
            }

            results.Candidates.AddRange(found);

            return found;
        }
示例#4
0
        /// <summary>
        /// Looks up a junction peak for the dynamic nodes of the initial graph and
        /// emits one candidate per distinct peak that passes the level and child-count checks.
        /// </summary>
        /// <param name="results">Detection result; its <c>InitialGraph</c> is scanned and its <c>Candidates</c> receive the new entries.</param>
        /// <returns>The candidates created by this call (the same items are appended to <c>results.Candidates</c>).</returns>
        public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
        {
            ChunkContentCandidateCollection output = new ChunkContentCandidateCollection();

            var sourceGraph = results.InitialGraph;

            List <NodeGraph> knownPeaks = new List <NodeGraph>();
            ListDictionary <String, NodeGraph> visitedByPath = new ListDictionary <string, NodeGraph>();

            var dynamicNodeList = sourceGraph.GetChildrenWithItemSet()
                                             .Where(x => x.item.Category.HasFlag(NodeInTemplateRole.Dynamic))
                                             .ToList();

            // Safety cap on the number of processed nodes.
            Int32 iterationLimit = 5000;
            Int32 processed      = 0;

            // Each step handles the first dynamic node whose path has not been visited yet;
            // the loop stops when none is left or the cap has been exceeded.
            for (NodeGraph current = dynamicNodeList.FirstOrDefault();
                 current != null && processed <= iterationLimit;
                 processed++, current = dynamicNodeList.FirstOrDefault(x => visitedByPath[x.path].Count == 0))
            {
                var       peakSearch = new JunctionPeakSearch(current);
                NodeGraph peakNode   = peakSearch.GetJunctionPeak(Convert.ToDouble(MinJunctionSize));

                // Mark the path as visited so it is not selected again.
                visitedByPath[current.path].Add(current);

                if (peakNode == null)
                {
                    continue;
                }

                if (knownPeaks.Contains(peakNode))
                {
                    // This peak was already reached from another dynamic node.
                    continue;
                }

                knownPeaks.Add(peakNode);

                if (peakNode.level < MinPeakNodeLevel)
                {
                    continue;
                }

                if (peakNode.Count() != 0)
                {
                    output.Add(new ChunkContentCandidate(this, peakNode));
                }
            }

            results.Candidates.AddRange(output);
            return output;
        }
示例#5
0
        /// <summary>
        /// Configures the chunk for dynamic data extraction and attaches the template set
        /// detected on the candidate's node.
        /// </summary>
        /// <param name="chunk">Chunk to configure.</param>
        /// <param name="candidate">Candidate whose <c>Node</c> is passed to template detection.</param>
        /// <param name="result">Detection result of the current run (not used by this implementation).</param>
        /// <returns><c>true</c> when template detection is valid; otherwise <c>false</c>, with the reason stored in <c>chunk.description</c>.</returns>
        public override bool SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
        {
            NodeGraphTemplateDetection detection = new NodeGraphTemplateDetection(candidate.Node);

            chunk.multiNodePolicy = TaskMultiNodePolicy.AsSeparatedTables;
            chunk.type            = ContentChunkType.DynamicDataExtraction;

            if (!detection.IsValid())
            {
                chunk.description = "Refused for failed template detection";
                return false;
            }

            RecordTemplateSet detectedTemplates = detection.GetTemplateSet();
            chunk.ExtractorCustomizationSettings.AddObjectEntry(nameof(RecordTemplateExtractor.TemplateSet), detectedTemplates);

            return true;
        }
        /// <summary>
        /// Attaches the template set detected on the candidate's junction root to the chunk.
        /// </summary>
        /// <param name="chunk">Chunk to configure.</param>
        /// <param name="candidate">Candidate whose <c>MetaData</c> is expected to be a <c>JunctionPoint&lt;NodeGraph&gt;</c>.</param>
        /// <param name="result">Detection result of the current run (not used by this implementation).</param>
        /// <returns>
        /// <c>true</c> when template detection on the junction root is valid; <c>false</c> when the
        /// metadata is missing or of another type, or when template detection fails. On refusal the
        /// reason is stored in <c>chunk.description</c>.
        /// </returns>
        public override bool SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
        {
            JunctionPoint <NodeGraph> junctionPoint = candidate.MetaData as JunctionPoint <NodeGraph>;

            if (junctionPoint == null)
            {
                // The `as` cast yields null for absent or foreign metadata; refuse the
                // candidate instead of failing with a NullReferenceException below.
                chunk.description = "Refused for missing junction point metadata";
                return false;
            }

            NodeGraphTemplateDetection templateDetection = new NodeGraphTemplateDetection(junctionPoint.rootItem);

            if (!templateDetection.IsValid())
            {
                chunk.description = "Refused for failed template detection";
                return false;
            }

            RecordTemplateSet templateSet = templateDetection.GetTemplateSet();
            chunk.ExtractorCustomizationSettings.AddObjectEntry(nameof(RecordTemplateExtractor.TemplateSet), templateSet);

            return true;
        }
示例#7
0
 /// <summary>
 /// Produces the content chunk candidates this detector finds for the given detection result.
 /// </summary>
 /// <param name="results">Detection result the detector works on.</param>
 /// <returns>Collection of candidates found by the detector.</returns>
 public abstract ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results);
示例#8
0
 /// <summary>
 /// Lets the detector configure a content chunk that was created for one of its candidates.
 /// </summary>
 /// <param name="chunk">Chunk to configure.</param>
 /// <param name="candidate">Candidate the chunk was created for.</param>
 /// <param name="result">Detection result of the current run.</param>
 /// <returns>
 /// A Boolean verdict on the candidate. NOTE(review): callers appear to treat <c>true</c> as
 /// "accepted" and <c>false</c> as "declined by the detector" - confirm against the call site.
 /// </returns>
 public abstract Boolean SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result);
示例#9
0
        /// <summary>
        /// Marks the chunk for dynamic data extraction with separated tables and always accepts the candidate.
        /// </summary>
        /// <param name="chunk">Chunk to configure.</param>
        /// <param name="candidate">Candidate the chunk was created for (not used by this implementation).</param>
        /// <param name="result">Detection result of the current run (not used by this implementation).</param>
        /// <returns>Always <c>true</c>.</returns>
        public override Boolean SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
        {
            // No validation step here: every candidate is accepted.
            chunk.multiNodePolicy = TaskMultiNodePolicy.AsSeparatedTables;
            chunk.type = ContentChunkType.DynamicDataExtraction;

            return true;
        }