Beispiel #1
0
 /// <summary>
 /// Creates a graph node from the given values; the node starts with an empty neighbor list.
 /// </summary>
 /// <param name="index">Value stored in <c>Index</c>.</param>
 /// <param name="originalGraphType">Value stored in <c>OriginalGraphType</c>.</param>
 /// <param name="label">Value stored in <c>Label</c>.</param>
 /// <param name="htmlName">Value stored in <c>HtmlName</c>.</param>
 /// <param name="linkToPage">Value stored in <c>LinkToPage</c>.</param>
 public KnowledgeGraphNode(int index, OriginalGraphType originalGraphType, string label, string htmlName, Uri linkToPage)
 {
     // every node owns its own, initially empty, adjacency list
     Neighbors = new List<KnowledgeGraphNode>();

     Index = index;
     Label = label;
     HtmlName = htmlName;
     LinkToPage = linkToPage;
     OriginalGraphType = originalGraphType;
 }
        /// <summary>
        /// Adds every direct "li" child of <paramref name="nodeToParse"/> to the graph and links it to the
        /// node at <paramref name="parentIndex"/>. When an li contains a "ul" child, that ul is processed
        /// recursively with the li as the new parent.
        /// </summary>
        /// <param name="graph">Graph whose <c>KnGraph</c> list receives the new nodes.</param>
        /// <param name="parentIndex">Position in <c>graph.KnGraph</c> of the node that gets the li items as neighbors.</param>
        /// <param name="nodeToParse">The node whose direct "li" children are read (a "ul" in the visible callers).</param>
        /// <param name="nodeToParseIndex">Index assigned to the next created node; incremented once per li item.</param>
        /// <param name="originalGraphType">Value passed to every node created here.</param>
        private static void ExtractAndAddUlSubgraphRecursive(KnowledgeGraph graph, int parentIndex, HtmlNode nodeToParse, ref int nodeToParseIndex, OriginalGraphType originalGraphType)
        {
            foreach (HtmlNode child in nodeToParse.ChildNodes.Where(n => n.Name == "li"))
            {
                Uri    nodeLinkToPage = GetUriFromNode(child);
                string nodeLabel      = GetLiNodeLabel(child);
                int    liIndex        = nodeToParseIndex;

                // register the li as a graph entry of its own so it can serve as the parent of a nested ul
                graph.KnGraph.Add(new KnowledgeGraphNode(index: liIndex, originalGraphType: originalGraphType, label: nodeLabel, htmlName: "li", linkToPage: nodeLinkToPage));

                // link the li to its parent; the neighbor entry is a separate node instance carrying the same data
                graph.KnGraph[parentIndex].Neighbors.Add(new KnowledgeGraphNode(index: liIndex, originalGraphType: originalGraphType, label: nodeLabel, htmlName: "li", linkToPage: nodeLinkToPage));

                // the index is consumed by this li whether or not it has children
                nodeToParseIndex++;

                // 1 li can only have 1 ul in it, so only the first ul child is followed
                HtmlNode ulNode = child.ChildNodes.FirstOrDefault(node => node.Name == "ul");

                // if the li has an ul in it, descend with the li as the parent of the ul's items
                if (ulNode != null)
                {
                    ExtractAndAddUlSubgraphRecursive(
                        graph: graph,
                        parentIndex: liIndex,
                        nodeToParse: ulNode,
                        nodeToParseIndex: ref nodeToParseIndex,
                        originalGraphType: originalGraphType);
                }
            }
        }
        /// <summary>
        /// Walks a flat list of HTML nodes and appends graph nodes for h2-h4 headlines and for the li items
        /// of every ul. A headline below h2 is linked to the most recent headline one level above it; the
        /// items of a ul are linked to the most recent headline of any level. Processing stops at the first
        /// headline whose text is "see also" (case-insensitive).
        /// </summary>
        /// <param name="graph">Graph to append to; new node indexes continue from <c>graph.KnGraph.Count</c>.</param>
        /// <param name="htmlNodes">Nodes to process in order. Only "ul" and "h2".."h4" produce graph nodes; other tags are skipped.</param>
        /// <param name="originalGraphType">Value passed to every node created here.</param>
        private static void ParseHtmlNodesIntoKnGraph(KnowledgeGraph graph, List <HtmlNode> htmlNodes, OriginalGraphType originalGraphType = OriginalGraphType.Unknown)
        {
            if (graph == null)
            {
                // NOTE(review): graph is passed by value, so a graph created here is never visible to the caller.
                // Kept as-is for compatibility; callers that want the result must pass a non-null graph.
                graph = new KnowledgeGraph();
            }

            // resume node index from the given graph. if empty, it starts at 0
            int nodeIndex = graph.KnGraph.Count;

            // you can rely that the nodes are in order h2 > h3 > h4 however ul can come at any point
            // and there are no duplicate pieces of knowledge (I mean you'll see more h2, h3, h4 and ul but they each correspond to a different piece of knowledge)
            // list with the last index of h2, h3, and h4 - compute index of this list by taking the h's number - 2
            List <int> mostRecentHIndexes = new List <int>()
            {
                nodeIndex, nodeIndex, nodeIndex
            };
            // I need this variable to store the last known h for the ul elements since I can't know which h was last just with the list
            int mostRecentHIndex = nodeIndex;

            foreach (var node in htmlNodes)
            {
                if (node.Name == "ul")
                {
                    // ul has more nodes in it which need to be indexed and the subtree added here
                    // the ul itself is not a node but a list of nodes, ul is just a placeholder
                    ExtractAndAddUlSubgraphRecursive(graph: graph, parentIndex: mostRecentHIndex, nodeToParse: node, nodeToParseIndex: ref nodeIndex, originalGraphType: originalGraphType);
                }
                else if (node.Name.StartsWith("h", StringComparison.Ordinal))
                {
                    string nodeLabel      = GetNodeHeadlineText(node);
                    Uri    nodeLinkToPage = GetUriFromNode(node);

                    // stop processing nodes once you hit the see also node because we don't want any of the info after see also in the graph
                    // which is "see also"'s child nodes and the references section
                    if (string.Equals(nodeLabel, "see also", StringComparison.InvariantCultureIgnoreCase))
                    {
                        break;
                    }

                    // turn the h number into the slot of mostRecentHIndexes (h2 -> 0, h3 -> 1, h4 -> 2).
                    // Tags that merely start with 'h' (hr, header, h1, h5, ...) have no slot; skip them
                    // BEFORE touching the graph so an unsupported tag can neither throw nor leave a half-linked node behind.
                    int hIndex = node.Name.Length == 2 ? node.Name[1] - '2' : -1;
                    if (hIndex < 0 || hIndex >= mostRecentHIndexes.Count)
                    {
                        continue;
                    }

                    // only add h's and li's to the graph, not ul's
                    graph.KnGraph.Add(new KnowledgeGraphNode(nodeIndex, originalGraphType, nodeLabel, node.Name, nodeLinkToPage));

                    mostRecentHIndex           = nodeIndex;
                    mostRecentHIndexes[hIndex] = nodeIndex;

                    // if it's h3 then add it to the most recent h2 and if it's h4 add it to the most recent h3 and so on if hN add it to h(N-1)
                    if (hIndex > 0)
                    {
                        graph.KnGraph[mostRecentHIndexes[hIndex - 1]].Neighbors.Add(new KnowledgeGraphNode(nodeIndex, originalGraphType, nodeLabel, node.Name, nodeLinkToPage));
                    }

                    nodeIndex++;
                }
            }
        }