/// <summary>
/// Parses the file at <c>FullPath</c> as SGML/HTML when the resource's content
/// type mentions "html", walks the resulting DOM and writes the words found on
/// element nodes (prefixed with the node's XPath) to the debug output.
/// </summary>
/// <param name="resource">
/// The resource whose <c>ContentType</c> decides whether the file is parsed.
/// A null or non-HTML content type skips parsing entirely.
/// </param>
/// <returns>The same <paramref name="resource"/> instance that was passed in.</returns>
protected Resource Parse(Resource resource)
{
    // Ordinal, case-insensitive match: no culture-dependent lower-casing, no
    // temporary string, and a missing content type is treated as "not HTML"
    // instead of raising a NullReferenceException.
    string contentType = resource.ContentType;
    if (contentType == null || contentType.IndexOf ("html", StringComparison.OrdinalIgnoreCase) < 0) {
        return resource;
    }

    using (Stream stream = System.IO.File.OpenRead (FullPath))
    using (StreamReader streamReader = new StreamReader (stream)) {

        SgmlParser parser = new SgmlParser ();
        XmlDocument xmlDom = parser.ParseSgml (streamReader);

        // Explicit stack instead of recursion: depth-first walk over the DOM.
        Stack<XmlNode> pending = new Stack<XmlNode> ();
        foreach (XmlNode topLevel in xmlDom.ChildNodes) {
            pending.Push (topLevel);
        }

        while (pending.Count > 0) {
            XmlNode current = pending.Pop ();

            switch (current.NodeType) {
            case XmlNodeType.Element:
                // NOTE(review): XmlNode.Value is documented to be null for
                // element nodes, so with standard XmlElement instances this
                // guard always fires and nothing is logged. The text presumably
                // lives in the Text/CDATA children - confirm the intent.
                if (current.Value != null) {
                    // split into words and get the x-path to the node
                    string[] words = current.Value.Split (new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                    string xpath = GetXPathToNode (current);
                    System.Diagnostics.Debug.WriteLine (xpath + string.Join ("-", words));
                }
                break;

            case XmlNodeType.Text:
            case XmlNodeType.CDATA:
                // Nothing to do yet; fall out of the switch so that children
                // (if any) are still queued below.
                break;

            default:
                // Every other node type (declarations, comments, whitespace,
                // attributes, ...) is ignored together with its subtree.
                continue;
            }

            foreach (XmlNode child in current.ChildNodes) {
                pending.Push (child);
            }
        }
    }

    #region Old?!
    //
    //    // Look for URIs which are already crawled
    //    foreach (DataRow row in ResourceCache.Resources.Rows) {
    //
    //        if (!Regex.IsMatch (content, (row ["Uri"] as string) + "\\W"))
    //            continue;
    //
    //        if (!resource.ReferencedResources.Contains (row ["Uri"]as string))
    //            resource.ReferencedResources.Add (row ["Uri"] as string);
    //    }
    //
    //    // Look for already crawled class- and instance-names
    //    foreach (DataRow row in NodesCache.NodesTable.Rows) {
    //
    //        if (!Regex.IsMatch (content, (row ["Label"] as string) + "\\W"))
    //            continue;
    //
    //        if (row ["Type"] as string == "Class")
    //            resource.ReferencedClasses.Add (row ["Id"] as string);
    //        else if (row ["Type"] as string == "Instance")
    //            resource.ReferencedObjects.Add (row ["Id"] as string);
    //    }
    #endregion

    return resource;
}
/// <summary>
/// Parses the file at <c>FullPath</c> as SGML/HTML when the resource's content
/// type mentions "html", walks the resulting DOM and writes the words found on
/// element nodes (prefixed with the node's XPath) to the debug output.
/// </summary>
/// <param name="resource">
/// The resource whose <c>ContentType</c> decides whether the file is parsed.
/// A null or non-HTML content type skips parsing entirely.
/// </param>
/// <returns>The same <paramref name="resource"/> instance that was passed in.</returns>
protected Resource Parse(Resource resource)
{
    // Ordinal, case-insensitive match: no culture-dependent lower-casing, no
    // temporary string, and a missing content type is treated as "not HTML"
    // instead of raising a NullReferenceException.
    string contentType = resource.ContentType;
    if (contentType == null || contentType.IndexOf("html", StringComparison.OrdinalIgnoreCase) < 0)
    {
        return(resource);
    }

    using (Stream stream = System.IO.File.OpenRead(FullPath))
    using (StreamReader streamReader = new StreamReader(stream))
    {
        SgmlParser  parser = new SgmlParser();
        XmlDocument xmlDom = parser.ParseSgml(streamReader);

        // Explicit stack instead of recursion: depth-first walk over the DOM.
        Stack <XmlNode> pending = new Stack <XmlNode> ();
        foreach (XmlNode topLevel in xmlDom.ChildNodes)
        {
            pending.Push(topLevel);
        }

        while (pending.Count > 0)
        {
            XmlNode current = pending.Pop();

            switch (current.NodeType)
            {
            case XmlNodeType.Element:
                // NOTE(review): XmlNode.Value is documented to be null for
                // element nodes, so with standard XmlElement instances this
                // guard always fires and nothing is logged. The text presumably
                // lives in the Text/CDATA children - confirm the intent.
                if (current.Value != null)
                {
                    // split into words and get the x-path to the node
                    string[] words = current.Value.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                    string   xpath = GetXPathToNode(current);
                    System.Diagnostics.Debug.WriteLine(xpath + string.Join("-", words));
                }
                break;

            case XmlNodeType.Text:
            case XmlNodeType.CDATA:
                // Nothing to do yet; fall out of the switch so that children
                // (if any) are still queued below.
                break;

            default:
                // Every other node type (declarations, comments, whitespace,
                // attributes, ...) is ignored together with its subtree.
                continue;
            }

            foreach (XmlNode child in current.ChildNodes)
            {
                pending.Push(child);
            }
        }
    }

    #region Old?!
    //
    //    // Look for URIs which are already crawled
    //    foreach (DataRow row in ResourceCache.Resources.Rows) {
    //
    //        if (!Regex.IsMatch (content, (row ["Uri"] as string) + "\\W"))
    //            continue;
    //
    //        if (!resource.ReferencedResources.Contains (row ["Uri"]as string))
    //            resource.ReferencedResources.Add (row ["Uri"] as string);
    //    }
    //
    //    // Look for already crawled class- and instance-names
    //    foreach (DataRow row in NodesCache.NodesTable.Rows) {
    //
    //        if (!Regex.IsMatch (content, (row ["Label"] as string) + "\\W"))
    //            continue;
    //
    //        if (row ["Type"] as string == "Class")
    //            resource.ReferencedClasses.Add (row ["Id"] as string);
    //        else if (row ["Type"] as string == "Instance")
    //            resource.ReferencedObjects.Add (row ["Id"] as string);
    //    }
    #endregion

    return(resource);
}