/// <summary>
/// Builds chunk candidates from the junction blocks found in the current graph.
/// </summary>
/// <remarks>
/// Junction points are ordered by the number of parts in their XPath root (descending);
/// ties are ordered by junction size (descending). A junction point whose root path
/// cannot be resolved to a node in <c>results.CurrentGraph</c> is skipped, because a
/// candidate without a node cannot be processed later.
/// </remarks>
/// <param name="results">Detection state that supplies the current graph and receives the candidates.</param>
/// <returns>The candidates created by this call; the same items are also appended to <c>results.Candidates</c>.</returns>
public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
{
    JunctionGraphMetrics<NodeGraph> junctionGraphMetrics = new JunctionGraphMetrics<NodeGraph>();
    junctionGraphMetrics.Process(results.CurrentGraph);

    var junctionBlocks = junctionGraphMetrics.GetJunctionBlocks(1, 4, true, true);

    // One stable, two-key sort. It yields the same order as sorting by junction size
    // first and then re-sorting by path depth, without the intermediate list.
    List<JunctionPoint<NodeGraph>> sorted = junctionBlocks
        .OrderByDescending(x => x.XPathRoot.getPathParts().Count)
        .ThenByDescending(x => x.JunctionSize)
        .ToList();

    ChunkContentCandidateCollection output = new ChunkContentCandidateCollection();

    foreach (var junctionPoint in sorted)
    {
        NodeGraph childNode = results.CurrentGraph.GetChildAtPath(junctionPoint.XPathRoot, "/", false);
        if (childNode == null)
        {
            // The junction root is not present in the current graph: there is nothing to extract from.
            continue;
        }

        ChunkContentCandidate newCandidate = new ChunkContentCandidate(this, childNode);
        // The junction point is kept as metadata on the candidate.
        newCandidate.MetaData = junctionPoint;
        output.Add(newCandidate);
    }

    results.Candidates.AddRange(output);
    return output;
}
/// <summary>
/// Runs all detectors over the given graph and converts the resulting candidates into content chunks.
/// </summary>
/// <remarks>
/// Steps: call <c>Check()</c>, let every detector in <c>Detectors</c> contribute candidates,
/// score and sort them, then visit each candidate in order. A candidate whose node path no longer
/// resolves in the current graph goes to <c>DeclinedCandidates</c>. Otherwise a chunk is built and
/// offered to the candidate's detector, which either accepts it (chunk and candidate are recorded)
/// or declines it (<c>DeclinedByDetectorCandidates</c>). A directed-graph snapshot of the current
/// graph is stored before the loop and after every candidate that resolved to a node.
/// </remarks>
/// <param name="InitialGraph">Graph the detection result is created from.</param>
/// <returns>The populated detection result.</returns>
public ChunkDetectionResult Run(NodeGraph InitialGraph)
{
    Check();

    var detection = new ChunkDetectionResult(InitialGraph, this);

    foreach (IContentChunkDetector chunkDetector in Detectors)
    {
        chunkDetector.GetCandidates(detection);
    }

    detection.Candidates.ScoreAndSort();

    var graphStyle = new NodeDictionaryGraphStyleSettings();
    detection.CurrentGraphStates.Add(detection.CurrentGraph.BuildDirectedGraph(graphStyle));

    foreach (ChunkContentCandidate candidate in detection.Candidates)
    {
        // Note: candidates with a zero score are not filtered out here (that filter is disabled).
        var chunkRoot = detection.CurrentGraph.GetChildAtPath<NodeGraph>(candidate.Node.path, "/", false);
        if (chunkRoot == null)
        {
            detection.DeclinedCandidates.Add(candidate);
            continue;
        }

        var chunk = new ContentChunk();
        chunk.ExtractorName = candidate.Detector.GetExtractorName();
        chunk.DeployRootNode(chunkRoot);
        chunk.SubGraph = chunkRoot.GetSubgraph(true);

        var accepted = candidate.Detector.SetContentChunk(chunk, candidate, detection);
        if (accepted)
        {
            detection.DetectedChunks.Add(chunk);
            detection.AcceptedCandidates.Add(candidate);
        }
        else
        {
            detection.DeclinedByDetectorCandidates.Add(candidate);
        }

        // Snapshot the graph after this candidate has been handled by its detector.
        detection.CurrentGraphStates.Add(detection.CurrentGraph.BuildDirectedGraph(graphStyle));
    }

    return detection;
}
/// <summary>
/// Creates one candidate for every node that <c>getAllChildren</c> returns for
/// <c>HtmlPathSelectExpression</c> on the current graph.
/// </summary>
/// <param name="results">Detection state that supplies the current graph and receives the candidates.</param>
/// <returns>The candidates created by this call; the same items are also appended to <c>results.Candidates</c>.</returns>
public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
{
    var found = new ChunkContentCandidateCollection();

    foreach (NodeGraph matchedNode in results.CurrentGraph.getAllChildren(HtmlPathSelectExpression))
    {
        found.Add(new ChunkContentCandidate(this, matchedNode));
    }

    results.Candidates.AddRange(found);
    return found;
}
/// <summary>
/// Creates candidates from the junction peaks of the nodes flagged as dynamic in the initial graph.
/// </summary>
/// <remarks>
/// Only the first dynamic node per distinct path is examined. For each such node a junction peak is
/// searched using <c>MinJunctionSize</c>. A peak becomes a candidate when it has not been seen before,
/// its level is not below <c>MinPeakNodeLevel</c>, and its <c>Count()</c> is non-zero. Peaks that fail
/// the level or count test are still remembered, so they are not evaluated again.
/// The nodes are visited in a single pass; this assumes node paths do not change while the search
/// runs (TODO confirm <c>JunctionPeakSearch</c> does not restructure the graph).
/// </remarks>
/// <param name="results">Detection state that supplies the initial graph and receives the candidates.</param>
/// <returns>The candidates created by this call; the same items are also appended to <c>results.Candidates</c>.</returns>
public override ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results)
{
    // Safety cap: processing stops once more than this many nodes have been examined.
    const Int32 iterationLimit = 5000;

    ChunkContentCandidateCollection output = new ChunkContentCandidateCollection();

    var dynamicNodeList = results.InitialGraph.GetChildrenWithItemSet()
        .Where(x => x.item.Category.HasFlag(NodeInTemplateRole.Dynamic))
        .ToList();

    HashSet<String> visitedPaths = new HashSet<String>();
    List<NodeGraph> peaks = new List<NodeGraph>();
    Int32 processed = 0;

    foreach (NodeGraph dynamicNode in dynamicNodeList)
    {
        // Add returns false when the path was already handled by an earlier node.
        if (!visitedPaths.Add(dynamicNode.path))
        {
            continue;
        }

        var nodePeakSearch = new JunctionPeakSearch(dynamicNode);
        NodeGraph peakNode = nodePeakSearch.GetJunctionPeak(Convert.ToDouble(MinJunctionSize));

        if (peakNode != null && !peaks.Contains(peakNode))
        {
            peaks.Add(peakNode);

            bool levelAccepted = !(peakNode.level < MinPeakNodeLevel);
            if (levelAccepted && peakNode.Count() != 0)
            {
                output.Add(new ChunkContentCandidate(this, peakNode));
            }
        }

        processed++;
        if (processed > iterationLimit)
        {
            break;
        }
    }

    results.Candidates.AddRange(output);
    return output;
}
/// <summary>
/// Configures the chunk for dynamic data extraction and attaches the template set detected
/// on the candidate's node.
/// </summary>
/// <param name="chunk">Chunk to configure; its policy and type are set even when the chunk is refused.</param>
/// <param name="candidate">Candidate whose <c>Node</c> is analysed for templates.</param>
/// <param name="result">Current detection state (not used by this implementation).</param>
/// <returns>
/// <c>true</c> when template detection is valid and the template set was stored in the chunk's
/// extractor settings; <c>false</c> otherwise, in which case the chunk description states the refusal.
/// </returns>
public override bool SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
{
    var detection = new NodeGraphTemplateDetection(candidate.Node);

    chunk.multiNodePolicy = TaskMultiNodePolicy.AsSeparatedTables;
    chunk.type = ContentChunkType.DynamicDataExtraction;

    if (!detection.IsValid())
    {
        chunk.description = "Refused for failed template detection";
        return false;
    }

    RecordTemplateSet detectedTemplates = detection.GetTemplateSet();
    chunk.ExtractorCustomizationSettings.AddObjectEntry(nameof(RecordTemplateExtractor.TemplateSet), detectedTemplates);
    return true;
}
/// <summary>
/// Attaches the template set detected on the candidate's junction root to the chunk.
/// </summary>
/// <param name="chunk">Chunk that receives the template set in its extractor settings.</param>
/// <param name="candidate">
/// Candidate whose <c>MetaData</c> is expected to be a <c>JunctionPoint&lt;NodeGraph&gt;</c>.
/// </param>
/// <param name="result">Current detection state (not used by this implementation).</param>
/// <returns>
/// <c>true</c> when template detection on the junction root is valid and the template set was stored;
/// <c>false</c> when the candidate carries no junction point or template detection is not valid.
/// </returns>
public override bool SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
{
    JunctionPoint<NodeGraph> junctionPoint = candidate.MetaData as JunctionPoint<NodeGraph>;
    if (junctionPoint == null)
    {
        // Without a junction point there is no root to analyse; decline instead of
        // failing with a NullReferenceException.
        return false;
    }

    NodeGraphTemplateDetection templateDetection = new NodeGraphTemplateDetection(junctionPoint.rootItem);
    if (!templateDetection.IsValid())
    {
        return false;
    }

    RecordTemplateSet templateSet = templateDetection.GetTemplateSet();
    chunk.ExtractorCustomizationSettings.AddObjectEntry(nameof(RecordTemplateExtractor.TemplateSet), templateSet);
    return true;
}
/// <summary>
/// Produces the chunk candidates this detector finds for the given detection state.
/// </summary>
/// <param name="results">Detection state the candidates are derived from.</param>
/// <returns>The collection of candidates found by the detector.</returns>
/// <remarks>
/// NOTE(review): the concrete implementations seen so far also append the returned candidates to
/// <c>results.Candidates</c> - confirm whether that is a required part of the contract.
/// </remarks>
public abstract ChunkContentCandidateCollection GetCandidates(ChunkDetectionResult results);
/// <summary>
/// Lets the detector configure a content chunk that was created for one of its candidates.
/// </summary>
/// <param name="chunk">Chunk to configure.</param>
/// <param name="candidate">Candidate the chunk was created for.</param>
/// <param name="result">Current detection state.</param>
/// <returns>
/// A flag indicating whether the detector accepts the chunk.
/// NOTE(review): presumably <c>true</c> means accepted and <c>false</c> declined - confirm against callers.
/// </returns>
public abstract Boolean SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result);
/// <summary>
/// Marks the chunk for dynamic data extraction with the separated-tables multi-node policy
/// and accepts it unconditionally.
/// </summary>
/// <param name="chunk">Chunk whose policy and type are set.</param>
/// <param name="candidate">Candidate the chunk was created for (not used by this implementation).</param>
/// <param name="result">Current detection state (not used by this implementation).</param>
/// <returns>Always <c>true</c>.</returns>
public override Boolean SetContentChunk(ContentChunk chunk, ChunkContentCandidate candidate, ChunkDetectionResult result)
{
    chunk.multiNodePolicy = TaskMultiNodePolicy.AsSeparatedTables;
    chunk.type = ContentChunkType.DynamicDataExtraction;

    return true;
}