/// <summary>
/// Builds a one-line summary of the nodes in <paramref name="result"/>. Each node
/// contributes an entry of the form "UrlPart: blockCount/documentCount", where
/// blockCount is <c>TextBlockCounts[i]</c> and documentCount is <c>NodeDocumentCount</c>;
/// entries are separated by ", ".
/// </summary>
/// <param name="result">Nodes to describe, in the order they should appear in the summary.</param>
/// <param name="i">Index into each node's <c>TextBlockCounts</c> selecting the count to report.</param>
/// <returns>The concatenated entries with trailing spaces and commas removed; an empty string when <paramref name="result"/> has no elements.</returns>
private static string GetPathInfo(UrlTree.NodeInfo[] result, int i)
{
    // Accumulate in a StringBuilder so the growing text is not re-copied on every iteration.
    // The type is fully qualified so this method does not depend on a "using System.Text;" directive.
    System.Text.StringBuilder pathInfo = new System.Text.StringBuilder();
    foreach (UrlTree.NodeInfo nodeInfo in result)
    {
        pathInfo.Append(nodeInfo.UrlPart)
            .Append(": ")
            .Append(nodeInfo.TextBlockCounts[i])
            .Append("/")
            .Append(nodeInfo.NodeDocumentCount)
            .Append(", ");
    }

    // Every entry ends with ", "; strip the separator left after the last one.
    return pathInfo.ToString().TrimEnd(' ', ',');
}
/// <summary>
/// Classifies <paramref name="textBlock"/> and records the supporting statistics on its annotation.
/// The annotation type becomes "TextBlock/Boilerplate" when <c>BpHeuristics</c> reports boilerplate,
/// "TextBlock/Content/Unseen" when the first node's count for this block is zero, and
/// "TextBlock/Content" otherwise.
/// </summary>
/// <param name="doc">Not used by this method.</param>
/// <param name="result">Node information; the first element supplies the recorded feature values.</param>
/// <param name="hType">Heuristic passed on to <c>BpHeuristics</c>; also decides whether the vote string is recorded.</param>
/// <param name="i">Index into each node's <c>TextBlockCounts</c> identifying the block being annotated.</param>
/// <param name="pathInfo">Text stored verbatim as the "bprPathInfo" feature.</param>
/// <param name="textBlock">Block whose annotation type and features are set.</param>
private static void SetBlockAnnotation(Document doc, UrlTree.NodeInfo[] result, HeuristicsType hType, int i, string pathInfo, TextBlock textBlock)
{
    UrlTree.NodeInfo head = result[0];
    Pair<bool, string> verdict = BpHeuristics(result, i, hType);

    // Decide the label first, then assign it in a single place.
    string annotationType;
    if (verdict.First)
    {
        annotationType = "TextBlock/Boilerplate";
    }
    else
    {
        annotationType = head.TextBlockCounts[i] == 0
            ? "TextBlock/Content/Unseen"
            : "TextBlock/Content";
    }
    textBlock.Annotation.Type = annotationType;

    // Statistics of the first node, stored as string-valued features.
    textBlock.Annotation.Features.SetFeatureValue(
        "bprNodeBlockCount", head.TextBlockCounts[i].ToString());
    textBlock.Annotation.Features.SetFeatureValue(
        "bprNodeLocation", head.NodeLocation.ToString());
    textBlock.Annotation.Features.SetFeatureValue(
        "bprNodeDocumentCount", head.NodeDocumentCount.ToString());
    textBlock.Annotation.Features.SetFeatureValue(
        "bprUrlPart", head.UrlPart);
    textBlock.Annotation.Features.SetFeatureValue(
        "bprPathInfo", pathInfo);

    // The simple heuristic produces no vote string, so there is nothing to record for it.
    if (hType != HeuristicsType.Simple)
    {
        textBlock.Annotation.Features.SetFeatureValue(
            "bprContentVsBoileplateVotes", verdict.Second);
    }
}
/// <summary>
/// Decides whether the text block at index <paramref name="i"/> is boilerplate.
/// </summary>
/// <remarks>
/// <para>
/// <c>HeuristicsType.Simple</c>: the block is boilerplate when the first node's count for it
/// is greater than 1; no vote string is produced.
/// </para>
/// <para>
/// Otherwise the leading nodes of <paramref name="result"/>, up to but excluding the first node
/// whose location has the <c>WithinTld</c> or <c>Root</c> flag, each cast one vote (at least one
/// node always votes). With <c>Slow</c> a node votes boilerplate when its block count exceeds
/// <c>NodeDocumentCount / 100 + 1</c>; with <c>Fast</c> the divisor is 50. Any other type casts
/// no votes. On a tie <c>Slow</c> yields boilerplate and every other type yields content;
/// otherwise the majority wins.
/// </para>
/// </remarks>
/// <param name="result">Node information; must contain at least one element.</param>
/// <param name="i">Index into each node's <c>TextBlockCounts</c>.</param>
/// <param name="type">Heuristic to apply.</param>
/// <returns>
/// A pair whose <c>First</c> is true for boilerplate and whose <c>Second</c> is the
/// "content : boilerplate" vote tally, or null for the simple heuristic.
/// </returns>
private static Pair<bool, string> BpHeuristics(UrlTree.NodeInfo[] result, int i, HeuristicsType type)
{
    if (type == HeuristicsType.Simple)
    {
        return new Pair<bool, string>(result[0].TextBlockCounts[i] > 1, null);
    }

    // Count the leading nodes that carry neither the WithinTld nor the Root flag.
    int voterCount = 0;
    for (int k = 0; k < result.Length; k++)
    {
        bool outsideTld = (result[k].NodeLocation & UrlTree.NodeLocation.WithinTld) == 0;
        if (!(outsideTld && (result[k].NodeLocation & UrlTree.NodeLocation.Root) == 0))
        {
            break;
        }
        voterCount++;
    }

    // The first node always gets a vote, even if it is itself flagged.
    if (voterCount == 0)
    {
        voterCount = 1;
    }

    int boilerplateVotes = 0;
    int contentVotes = 0;
    for (int j = 0; j < voterCount; j++)
    {
        bool votesBoilerplate;
        if (type == HeuristicsType.Slow)
        {
            votesBoilerplate = result[j].TextBlockCounts[i] > ((result[j].NodeDocumentCount / 100) + 1);
        }
        else if (type == HeuristicsType.Fast)
        {
            votesBoilerplate = result[j].TextBlockCounts[i] > ((result[j].NodeDocumentCount / 50) + 1);
        }
        else
        {
            // Other heuristic types cast no vote at all.
            continue;
        }

        if (votesBoilerplate)
        {
            boilerplateVotes++;
        }
        else
        {
            contentVotes++;
        }
    }

    string tally = string.Format(@"{0} : {1}", contentVotes, boilerplateVotes);

    // A tie is resolved by the heuristic type; otherwise the majority decides.
    bool isBoilerplate = boilerplateVotes == contentVotes
        ? type == HeuristicsType.Slow
        : boilerplateVotes > contentVotes;

    return new Pair<bool, string>(isBoilerplate, tally);
}