/// <summary>
        /// Creates one content chunk candidate for every junction block found in the current graph.
        /// </summary>
        /// <param name="results">Detection state: <c>CurrentGraph</c> is analysed and the new candidates are appended to <c>Candidates</c>.</param>
        /// <returns>
        /// The candidates created by this call, ordered by the number of parts in the junction root path (descending),
        /// with ties ordered by junction size (descending).
        /// </returns>
        public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
        {
            var metrics = new JunctionGraphMetrics<NodeGraph>();
            metrics.Process(results.CurrentGraph);

            // One stable multi-key sort: path part count is the primary key, junction size breaks ties.
            List<JunctionPoint<NodeGraph>> ordered = metrics
                .GetJunctionBlocks(1, 4, true, true)
                .OrderByDescending(point => point.XPathRoot.getPathParts().Count)
                .ThenByDescending(point => point.JunctionSize)
                .ToList();

            var output = new ChunkContentCandidateCollection();

            foreach (JunctionPoint<NodeGraph> point in ordered)
            {
                // Resolve the junction root path back to the node inside the current graph.
                NodeGraph target = results.CurrentGraph.GetChildAtPath(point.XPathRoot, "/", false);

                // The junction point travels with the candidate as its metadata.
                var candidate = new ChunkContentCandidate(this, target) { MetaData = point };
                output.Add(candidate);
            }

            results.Candidates.AddRange(output);
            return output;
        }
Example #2
0
        /// <summary>
        /// Marks the chunk as a dynamic data extraction whose multiple nodes are treated as separated tables.
        /// </summary>
        /// <param name="chunk">Chunk whose <c>multiNodePolicy</c> and <c>type</c> are set.</param>
        /// <param name="candidate">Not used by this implementation.</param>
        /// <param name="result">Not used by this implementation.</param>
        /// <returns>Always <c>true</c>.</returns>
        public override Boolean SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
        {
            chunk.multiNodePolicy = TaskMultiNodePolicy.AsSeparatedTables;
            chunk.type = ContentChunkType.DynamicDataExtraction;

            return true;
        }
        /// <summary>
        /// Creates one candidate for every node of the current graph selected by <c>HtmlPathSelectExpression</c>.
        /// </summary>
        /// <param name="results">Detection state: <c>CurrentGraph</c> is queried and the new candidates are appended to <c>Candidates</c>.</param>
        /// <returns>The candidates created by this call.</returns>
        public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
        {
            var candidates = new ChunkContentCandidateCollection();

            foreach (NodeGraph matchedNode in results.CurrentGraph.getAllChildren(HtmlPathSelectExpression))
            {
                candidates.Add(new ChunkContentCandidate(this, matchedNode));
            }

            results.Candidates.AddRange(candidates);
            return candidates;
        }
        /// <summary>
        /// Runs template detection on the junction point attached to the candidate and, when it is valid,
        /// stores the detected template set in the chunk's extractor customization settings.
        /// </summary>
        /// <param name="chunk">Chunk that receives the template set entry.</param>
        /// <param name="candidate">Candidate whose <c>MetaData</c> is expected to be a junction point of <c>NodeGraph</c>.</param>
        /// <param name="result">Not used by this implementation.</param>
        /// <returns>
        /// <c>true</c> when a template set was stored; <c>false</c> when the candidate carries no junction point
        /// metadata or template detection is not valid.
        /// </returns>
        public override bool SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
        {
            JunctionPoint<NodeGraph> junctionPoint = candidate.MetaData as JunctionPoint<NodeGraph>;

            // `as` yields null when MetaData is missing or of another type; refuse the candidate
            // instead of failing with a NullReferenceException on junctionPoint.rootItem.
            if (junctionPoint == null)
            {
                return false;
            }

            NodeGraphTemplateDetection templateDetection = new NodeGraphTemplateDetection(junctionPoint.rootItem);

            if (!templateDetection.IsValid())
            {
                return false;
            }

            RecordTemplateSet templateSet = templateDetection.GetTemplateSet();
            chunk.ExtractorCustomizationSettings.AddObjectEntry(nameof(RecordTemplateExtractor.TemplateSet), templateSet);

            return true;
        }
        /// <summary>
        /// Compares two candidates for ranking purposes.
        /// </summary>
        /// <remarks>
        /// Rules, in order: a candidate without a score model ranks below one that has it (two without are equal);
        /// a candidate whose node path has more "/"-separated parts ranks higher; remaining ties are resolved by
        /// <c>DynamicNodeScore</c> and then by <c>TemplateInstances</c>.
        /// </remarks>
        /// <param name="CandidateA">Left-hand candidate.</param>
        /// <param name="CandidateB">Right-hand candidate.</param>
        /// <returns>A negative value when A ranks below B, zero when they are equal, a positive value when A ranks above B.</returns>
        public static Int32 Compare(ChunkContentCandidate CandidateA, ChunkContentCandidate CandidateB)
        {
            bool missingA = CandidateA.ScoreModel == null;
            bool missingB = CandidateB.ScoreModel == null;

            if (missingA || missingB)
            {
                if (missingA && missingB)
                {
                    return 0;
                }

                return missingB ? 1 : -1;
            }

            var depthA = CandidateA.Node.path.SplitSmart("/").Count;
            var depthB = CandidateB.Node.path.SplitSmart("/").Count;

            if (depthA != depthB)
            {
                return depthA > depthB ? 1 : -1;
            }

            // Same path depth: fall back to the score model values.
            var byDynamicScore = CandidateA.ScoreModel.DynamicNodeScore.CompareTo(CandidateB.ScoreModel.DynamicNodeScore);
            if (byDynamicScore != 0)
            {
                return byDynamicScore;
            }

            return CandidateA.ScoreModel.TemplateInstances.CompareTo(CandidateB.ScoreModel.TemplateInstances);
        }
Example #6
0
        /// <summary>
        /// Configures the chunk for dynamic data extraction and attaches the template set detected on the candidate node.
        /// </summary>
        /// <param name="chunk">
        /// Chunk to configure. Its policy and type are set in every case; on refusal its <c>description</c> is set as well.
        /// </param>
        /// <param name="candidate">Candidate whose <c>Node</c> is passed to template detection.</param>
        /// <param name="result">Not used by this implementation.</param>
        /// <returns><c>true</c> when the template set was stored; <c>false</c> when template detection is not valid.</returns>
        public override bool SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
        {
            var detection = new NodeGraphTemplateDetection(candidate.Node);

            chunk.multiNodePolicy = TaskMultiNodePolicy.AsSeparatedTables;
            chunk.type = ContentChunkType.DynamicDataExtraction;

            if (!detection.IsValid())
            {
                chunk.description = "Refused for failed template detection";
                return false;
            }

            RecordTemplateSet detectedTemplates = detection.GetTemplateSet();
            chunk.ExtractorCustomizationSettings.AddObjectEntry(nameof(RecordTemplateExtractor.TemplateSet), detectedTemplates);

            return true;
        }
Example #7
0
        /// <summary>
        /// Builds candidates from the junction peaks found above the dynamic nodes of the initial graph.
        /// </summary>
        /// <remarks>
        /// A peak becomes a candidate only the first time it is found, and only when its level is not below
        /// <c>MinPeakNodeLevel</c> and its <c>Count()</c> is not zero. The search stops once no dynamic node with an
        /// unvisited path is left, or after the iteration counter exceeds 5000.
        /// </remarks>
        /// <param name="results">Detection state: <c>InitialGraph</c> is searched and the new candidates are appended to <c>Candidates</c>.</param>
        /// <returns>The candidates created by this call.</returns>
        public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
        {
            const Int32 iterationLimit = 5000;

            var output = new ChunkContentCandidateCollection();
            var sourceGraph = results.InitialGraph;

            var knownPeaks = new List<NodeGraph>();

            // Dynamic nodes already handled, grouped by path.
            // NOTE(review): presumably the indexer creates an empty list for an unknown key - confirm.
            var visitedByPath = new ListDictionary<String, NodeGraph>();

            var pending = sourceGraph.GetChildrenWithItemSet()
                                     .Where(x => x.item.Category.HasFlag(NodeInTemplateRole.Dynamic))
                                     .ToList();

            NodeGraph current = pending.FirstOrDefault();
            Int32 iterations = 0;

            while (current != null)
            {
                var peakSearch = new JunctionPeakSearch(current);
                NodeGraph peak = peakSearch.GetJunctionPeak(Convert.ToDouble(MinJunctionSize));

                visitedByPath[current.path].Add(current);

                // Only a peak seen for the first time is registered and considered as a candidate.
                if (peak != null && !knownPeaks.Contains(peak))
                {
                    knownPeaks.Add(peak);

                    if (!(peak.level < MinPeakNodeLevel) && peak.Count() != 0)
                    {
                        output.Add(new ChunkContentCandidate(this, peak));
                    }
                }

                iterations++;

                // Continue with the first dynamic node whose path has no visited entry yet.
                current = pending.FirstOrDefault(x => visitedByPath[x.path].Count == 0);

                if (iterations > iterationLimit)
                {
                    break;
                }
            }

            results.Candidates.AddRange(output);
            return output;
        }
Example #8
0
 /// <summary>
 /// Applies detector-specific configuration to <paramref name="chunk"/> for the given candidate.
 /// </summary>
 /// <param name="chunk">Content chunk to configure.</param>
 /// <param name="candidate">Candidate the chunk is being built from.</param>
 /// <param name="result">Detection result the candidate belongs to.</param>
 /// <returns>
 /// NOTE(review): the overrides visible in this file return <c>true</c> when the chunk was configured and
 /// <c>false</c> when the candidate is refused (e.g. failed template detection) - confirm this is the intended contract.
 /// </returns>
 public abstract Boolean SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result);