コード例 #1
0
        /// <summary>
        /// Classifies <paramref name="textBlock"/> by setting its annotation type and
        /// attaches the "bpr*" features read from the first element of <paramref name="result"/>.
        /// </summary>
        /// <remarks>
        /// The annotation type is chosen as follows:
        /// "TextBlock/Boilerplate" when <c>BpHeuristics(...).First</c> is true;
        /// otherwise "TextBlock/Content/Unseen" when the first node's
        /// <c>TextBlockCounts[i]</c> equals zero; otherwise "TextBlock/Content".
        /// </remarks>
        /// <param name="doc">Not referenced by this method.</param>
        /// <param name="result">Node infos; element 0 supplies the recorded features and is also passed to <c>BpHeuristics</c>.</param>
        /// <param name="hType">Heuristics type forwarded to <c>BpHeuristics</c>; the votes feature is written only when it is not <c>Simple</c>.</param>
        /// <param name="i">Index into <c>TextBlockCounts</c> of the first node; also forwarded to <c>BpHeuristics</c>.</param>
        /// <param name="pathInfo">Value stored verbatim under the "bprPathInfo" feature.</param>
        /// <param name="textBlock">Block whose annotation type and features are updated.</param>
        private static void SetBlockAnnotation(Document doc, UrlTree.NodeInfo[] result, HeuristicsType hType, int i, string pathInfo, TextBlock textBlock)
        {
            UrlTree.NodeInfo headNode = result[0];
            Pair<bool, string> verdict = BpHeuristics(result, i, hType);

            // The block count is only consulted when the heuristics did not flag boilerplate.
            string blockType = verdict.First
                ? "TextBlock/Boilerplate"
                : (headNode.TextBlockCounts[i] == 0 ? "TextBlock/Content/Unseen" : "TextBlock/Content");
            textBlock.Annotation.Type = blockType;

            // Features are written in a fixed order regardless of the chosen type.
            string blockCountText = headNode.TextBlockCounts[i].ToString();
            textBlock.Annotation.Features.SetFeatureValue("bprNodeBlockCount", blockCountText);
            string locationText = headNode.NodeLocation.ToString();
            textBlock.Annotation.Features.SetFeatureValue("bprNodeLocation", locationText);
            string documentCountText = headNode.NodeDocumentCount.ToString();
            textBlock.Annotation.Features.SetFeatureValue("bprNodeDocumentCount", documentCountText);
            textBlock.Annotation.Features.SetFeatureValue("bprUrlPart", headNode.UrlPart);
            textBlock.Annotation.Features.SetFeatureValue("bprPathInfo", pathInfo);

            if (hType == HeuristicsType.Simple)
            {
                return;
            }

            // NOTE(review): the key spelling ("Boileplate") is kept exactly as-is; readers of this feature may depend on it.
            textBlock.Annotation.Features.SetFeatureValue("bprContentVsBoileplateVotes", verdict.Second);
        }
コード例 #2
0
        /// <summary>
        /// Classifies <paramref name="textBlock"/> by setting its annotation type and
        /// attaches the "bpr*" features read from the first element of <paramref name="result"/>.
        /// </summary>
        /// <remarks>
        /// The block is marked "TextBlock/Boilerplate" when any of these holds, checked in order:
        /// <c>BpHeuristics(...).First</c> is true; <c>IsLink</c> returns true for the block's
        /// "linkToTextRatio" feature; or the '/'-separated segments of the block's "domPath"
        /// feature have a non-empty intersection with <c>mSkipTags</c>.
        /// Otherwise the type is "TextBlock/Content/Unseen" when the first node's
        /// <c>TextBlockCounts[i]</c> equals zero, and "TextBlock/Content" in all remaining cases.
        /// </remarks>
        /// <param name="doc">Not referenced by this method.</param>
        /// <param name="result">Node infos; element 0 supplies the recorded features and is also passed to <c>BpHeuristics</c>.</param>
        /// <param name="hType">Heuristics type forwarded to <c>BpHeuristics</c>; the votes feature is written only when it is not <c>Simple</c>.</param>
        /// <param name="i">Index into <c>TextBlockCounts</c> of the first node; also forwarded to <c>BpHeuristics</c>.</param>
        /// <param name="pathInfo">Value stored verbatim under the "bprPathInfo" feature.</param>
        /// <param name="textBlock">Block whose annotation type and features are updated; its "domPath" and "linkToTextRatio" features are read.</param>
        private static void SetBlockAnnotation(Document doc, UrlTree.NodeInfo[] result, HeuristicsType hType, int i, string pathInfo, TextBlock textBlock)
        {
            UrlTree.NodeInfo headNode = result[0];
            Pair<bool, string> verdict = BpHeuristics(result, i, hType);

            // The DOM path set is built up front, before any of the boilerplate checks run.
            string[] pathSegments = textBlock.Annotation.Features.GetFeatureValue("domPath").Split('/');
            Set<string> pathTags = new Set<string>(pathSegments);

            // Each later check runs only if the earlier ones did not already flag boilerplate.
            bool isBoilerplate = verdict.First;
            if (!isBoilerplate)
            {
                isBoilerplate = IsLink(textBlock.Annotation.Features.GetFeatureValue("linkToTextRatio"));
            }
            if (!isBoilerplate)
            {
                isBoilerplate = Set<string>.Intersection(pathTags, mSkipTags).Count > 0;
            }

            string blockType = isBoilerplate
                ? "TextBlock/Boilerplate"
                : (headNode.TextBlockCounts[i] == 0 ? "TextBlock/Content/Unseen" : "TextBlock/Content");
            textBlock.Annotation.Type = blockType;

            // Features are written in a fixed order regardless of the chosen type.
            string blockCountText = headNode.TextBlockCounts[i].ToString();
            textBlock.Annotation.Features.SetFeatureValue("bprNodeBlockCount", blockCountText);
            string locationText = headNode.NodeLocation.ToString();
            textBlock.Annotation.Features.SetFeatureValue("bprNodeLocation", locationText);
            string documentCountText = headNode.NodeDocumentCount.ToString();
            textBlock.Annotation.Features.SetFeatureValue("bprNodeDocumentCount", documentCountText);
            textBlock.Annotation.Features.SetFeatureValue("bprUrlPart", headNode.UrlPart);
            textBlock.Annotation.Features.SetFeatureValue("bprPathInfo", pathInfo);

            if (hType == HeuristicsType.Simple)
            {
                return;
            }

            // NOTE(review): the key spelling ("Boileplate") is kept exactly as-is; readers of this feature may depend on it.
            textBlock.Annotation.Features.SetFeatureValue("bprContentVsBoileplateVotes", verdict.Second);
        }